按组多次随机重排(reshuffle)某一列并将结果放入新列中

问题描述 投票:0回答:3

我有以下假设数据:

# data.table() comes from the data.table package, so it must be loaded first.
library(data.table)

# Toy data: one row per village, with its district id and a 0/1 status flag.
district <- c(1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3)
village <- c(1, 2, 3, 4, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 6, 7)
status <- c(1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0)
datei <- data.table(district, village, status)

我想按地区(district)对 status 进行随机重排(reshuffle),并将结果放入新的列中。我知道如何使用以下代码完成一次:

# Shuffle status once within each district and store it as a new column.
datei[, randomstat := sample(status), by = district]

现在,我想将 status 随机重排 1000 次,并将每次的结果分别放入新列中。我尝试了以下代码:

# Number of reshuffled columns to create.
n <- 1000
# NOTE: this is the asker's failing attempt, kept unchanged on purpose.
# `district` is passed inside list() instead of as the grouping (`by`)
# argument of `[`.
datei[, paste0("randomstat", 1:n) := replicate(n, list(sample(status), district))]

但是失败了。有人可以帮我弄这个吗?谢谢你。

r random data.table shuffle group
3个回答
2
投票

您的代码中有一个拼写错误:

district
放在
list
里面,即
list(sample(status), district))
,这是不正确的。


您有以下选择:

  • 使用
    replicate
    时设置
    simplify = FALSE
    ,例如,
# simplify = FALSE makes replicate() return a list, one element per new column.
datei[, paste0("randomstat", 1:n) := replicate(n, sample(status), simplify = FALSE), by = district]
  • 或者,用
    list()
     包裹 
    sample(status)
# Each replicate yields a one-element list wrapping the shuffled status.
datei[, paste0("randomstat", 1:n) := replicate(n, list(sample(status))), by = district]

2
投票

或者您可以创建自己的重排(shuffle)函数,以下是 10 列的示例:

n <- 10

#' Shuffle `status` within each `district` group.
#'
#' @param status Vector of values to permute.
#' @param district Grouping vector, the same length as `status`.
#' @return A vector the same length as `status`. Every output position holds a
#'   value taken (without replacement) from the same district as that position,
#'   so the result lines up row-by-row with the input.
reshuffle <- function(status, district) {
  stopifnot(length(status) == length(district))
  # ave() writes each group's result back to that group's original positions,
  # so the output stays aligned even when `district` is not sorted
  # (unlist(split(...)) would return the values reordered by group).
  # Indexing with sample.int() avoids sample()'s special case in which a single
  # number x >= 1 is treated as 1:x and would change the group's length.
  ave(status, district, FUN = function(s) s[sample.int(length(s))])
}

# Create n shuffled columns; each call permutes status within district.
# The helper defined in this answer is named reshuffle(), not shuffle().
datei[, (paste0("randomstat", seq_len(n))) := lapply(seq_len(n), \(i) reshuffle(status, district))]

datei
   district village status randomstat1 randomstat2 randomstat3 randomstat4 randomstat5 randomstat6 randomstat7 randomstat8 randomstat9 randomstat10
 1:        1       1      1           1           1           1           1           1           0           1           1           1            1
 2:        1       2      0           0           0           0           0           0           0           0           0           0            0
 3:        1       3      1           1           0           0           1           1           1           0           0           0            1
 4:        1       4      0           0           1           1           0           0           1           1           1           1            0
 5:        2       1      1           1           1           1           0           0           1           1           1           1            1
 6:        2       2      1           0           0           1           0           1           0           0           1           0            0
 7:        2       3      1           1           1           0           1           1           0           1           1           1            1
 8:        2       4      0           0           1           0           1           1           1           1           0           1            0
 9:        2       5      0           1           0           1           1           0           1           0           0           0            1
10:        3       1      1           0           0           0           1           1           0           1           1           1            1
11:        3       2      1           1           0           0           1           1           1           1           0           1            1
12:        3       3      1           0           0           0           0           1           1           1           1           1            0
13:        3       4      1           1           1           1           0           0           1           1           0           0            0
14:        3       5      0           0           1           1           1           0           0           0           1           0            1
15:        3       6      0           1           1           1           0           1           1           0           1           1            1
16:        3       7      0           1           1           1           1           0           0           0           0           0            0

0
投票

如果性能是一个问题,使用

Rfast
包的一些矢量化选项:

# colShuffle() and colRanks() are not defined in this snippet; presumably they
# come from Rfast, which is loaded here.
library(Rfast)

# Number of shuffled columns each benchmarked expression creates.
n <- 1e3

# Compare five ways of filling randomstat1..randomstatn on a copy of datei.
microbenchmark::microbenchmark(
  # Per district: n sample() calls returned as a list (simplify = FALSE).
  replicate = dt[,paste0("randomstat", 1:n) := replicate(n, sample(status), FALSE), district],
  # No `by`: reshuffle() receives the full status and district columns.
  reshuffle = dt[,paste0("randomstat", 1:n) := lapply(1:n, \(i) reshuffle(status, district))],
  # Per district: an .N x n matrix with status repeated in every column is
  # passed to colShuffle() (presumably shuffles each column independently --
  # confirm against the Rfast docs).
  colShuffle = dt[,paste0("randomstat", 1:n) := as.data.frame(colShuffle(matrix(rep(status, n), .N, n))), district],
  # Per district: column-wise ranks of an .N x n matrix of uniform draws are
  # used as indices into status (the ranks presumably act as one random
  # permutation per column).
  colRanks = dt[,paste0("randomstat", 1:n) := as.data.frame(matrix(status[colRanks(matrix(runif(.N*n), .N, n))], .N, n)), district],
  # No `by`: each row's district index, match(district, unique(district)), is
  # added to the uniform draws before ranking.
  # NOTE(review): this looks like it relies on rows being contiguous by
  # district so that ranks index rows of the same district -- confirm.
  colRanksAll = dt[,paste0("randomstat", 1:n) := as.data.frame(matrix(status[colRanks(matrix(runif(.N*n), .N, n) + match(district, unique(district)))], .N, n))],
  # The expressions modify dt by reference, so dt is rebuilt from datei here.
  setup = {dt <- copy(datei)}
)
#> Unit: milliseconds
#>         expr      min        lq       mean    median        uq      max neval
#>    replicate  15.1496  17.55985  21.687518  20.56755  24.95855  37.4373   100
#>    reshuffle 120.1781 135.82605 153.436604 146.41200 160.21505 242.8556   100
#>   colShuffle   4.5537   4.94215   6.118132   5.11265   5.82665  19.3036   100
#>     colRanks   5.7100   6.08095   7.730242   6.40530   8.20650  19.7137   100
#>  colRanksAll   5.0109   5.35780   7.171746   5.59560   8.67885  17.7488   100

对更大的数据集进行计时:

# Fix the RNG seed so the generated data set is reproducible.
set.seed(2037213561)

# Larger data set: district ids 1..1000, each repeated a random number of times
# between 1 and 10 (the third positional argument of sample() is `replace`).
# village is the row index within a district; runif(.N) %/% 0.5 gives 0 or 1.
datei <- data.table(district = rep(1:1e3, sample(10, 1e3, 1)))[
  ,`:=`(village = rowid(district), status = runif(.N)%/%0.5)
]

# Same five expressions as the benchmark above (n is reused from there),
# limited to 10 evaluations each via `times`.
microbenchmark::microbenchmark(
  replicate = dt[,paste0("randomstat", 1:n) := replicate(n, sample(status), FALSE), district],
  reshuffle = dt[,paste0("randomstat", 1:n) := lapply(1:n, \(i) reshuffle(status, district))],
  colShuffle = dt[,paste0("randomstat", 1:n) := as.data.frame(colShuffle(matrix(rep(status, n), .N, n))), district],
  colRanks = dt[,paste0("randomstat", 1:n) := as.data.frame(matrix(status[colRanks(matrix(runif(.N*n), .N, n))], .N, n)), district],
  colRanksAll = dt[,paste0("randomstat", 1:n) := as.data.frame(matrix(status[colRanks(matrix(runif(.N*n), .N, n) + match(district, unique(district)))], .N, n))],
  # The expressions modify dt by reference, so dt is rebuilt from datei here.
  setup = {dt <- copy(datei)},
  times = 10
)
#> Unit: milliseconds
#>         expr       min        lq      mean     median         uq        max neval
#>    replicate 5602.0342 6649.0791 7237.3350  7318.1749  7641.8868  8434.8158    10
#>    reshuffle 7348.9950 8237.6315 9946.4001 10728.3559 11320.5539 11680.5648    10
#>   colShuffle 1521.4851 1578.3194 2024.9663  2181.7678  2312.6861  2368.4342    10
#>     colRanks 2063.0544 2633.7768 2784.5394  2868.1073  3097.8065  3232.0499    10
#>  colRanksAll  374.6916  403.4064  467.1745   478.0322   521.1787   580.5176    10
© www.soinside.com 2019 - 2024. All rights reserved.