我有一个非常大的矩阵,我想创建一个矩阵列表,将大矩阵的每个列向量转换为具有(相同)指定维度(当然与向量长度兼容)的矩阵。
目前我没有发现比使用 apply() 更快的方法。 因为我必须这样做数百次,所以我需要加快这段代码的速度。
library(Rfast)
# Build the large input matrix with Rfast's matrnorm(): 25200 rows, 9000 columns.
bigmat <- matrnorm(25200, 9000)
# Refold every column of bigmat into a 9-column matrix; simplify = FALSE makes
# apply() return the matrices as a list instead of gluing them back together.
iwant <- apply(
  bigmat,
  MARGIN = 2,
  FUN = function(column) matrix(column, ncol = 9),
  simplify = FALSE
)
# Inspect the result: number of matrices, then the dimensions of each one.
length(iwant)
lapply(iwant, dim)
你有什么建议吗?
也许使用数组比使用矩阵列表更方便。这样做应该能快 5 倍以上:
# Reinterpret bigmat as a 3-d array: slice k along the third dimension is the
# 9-column matrix built from column k of bigmat. The dimensions are derived
# from bigmat instead of being hard-coded, so a matrix of another size is not
# silently recycled or truncated by array().
stopifnot(nrow(bigmat) %% 9 == 0)
bigarray <- array(bigmat, dim = c(nrow(bigmat) %/% 9, 9, ncol(bigmat)))
您可以通过索引第三维来遍历所需的矩阵:
# Slice 14 along the third dimension: the matrix built from column 14.
bigarray[, , 14]
如果确实需要,您可以将其转换为列表:
# Split the array along its third dimension into a list of matrices.
biglist <- asplit(bigarray, MARGIN = 3)
我们可以像下面那样使用
lapply
# Reshape once so that each run of 9 consecutive columns of `m` holds the data
# of one column of bigmat, then cut `m` into those 9-column blocks.
m <- matrix(bigmat, nrow = nrow(bigmat) / 9)
lapply(
  # seq_len() gives an empty sequence when there are no blocks, whereas
  # seq.int(0) counts down 1, 0 and would index columns that do not exist.
  seq_len(ncol(m) %/% 9) - 1,
  function(k) m[, 9 * k + (1:9)]
)
基准测试结果显示(借用@GKi的基准代码)
# Benchmark the candidate ways of turning each column of bigmat into a
# 9-column matrix. check = TRUE makes bench::mark() verify that all
# expressions return equal results before timing them.
bench::mark(
  check = TRUE,
  # FALSE is spelled out: F is an ordinary variable that can be reassigned.
  "apply" = apply(bigmat, 2, function(x) matrix(x, ncol = 9), simplify = FALSE),
  "asplit" = lapply(asplit(bigmat, 2), matrix, ncol = 9),
  "asDF" = lapply(unname(as.data.frame(bigmat)), matrix, ncol = 9),
  "Rcpp" = m2l(bigmat),
  "lapply" = lapply(
    seq.int(ncol(m <- matrix(bigmat, nrow = nrow(bigmat) / 9)) / 9) - 1,
    function(k) m[, 9 * k + (1:9)]
  )
)
# A tibble: 5 × 13
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time
<bch:expr> <bch:t> <bch:t> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm>
1 apply 310.2µs 446.8µs 2107. 639KB 36.8 744 13 353ms
2 asplit 350.1µs 445.5µs 2040. 817KB 42.7 717 15 352ms
3 asDF 279.7µs 391.8µs 2378. 459KB 29.8 879 11 370ms
4 Rcpp 38.8µs 59.3µs 15432. 185KB 72.3 4482 21 290ms
5 lapply 139µs 164.9µs 5505. 374KB 58.3 1700 18 309ms
# ℹ 4 more variables: result <list>, memory <list>, time <list>, gc <list>
不幸的是
asplit
并不快。
# asplit() breaks bigmat into its columns; each column is then refolded into
# a matrix with 9 columns.
L <- lapply(asplit(bigmat, MARGIN = 2), function(v) matrix(v, ncol = 9))
另一种选择是使用
as.data.frame
.
# A data frame is a list of columns, so lapply() visits bigmat column by
# column; unname() removes the column names so the result is an unnamed list.
lapply(unname(as.data.frame(bigmat)), function(v) matrix(v, ncol = 9))
由于这两种方法都没有带来速度上的改进,下面给出一个 Rcpp 版本。
# Compile a C++ helper m2l(A) with Rcpp::cppFunction().
# m2l takes a numeric matrix A and returns a list with one element per column
# of A. Each element is a numeric matrix with A.nrow()/9 rows and 9 columns,
# constructed from an iterator pointing at the start of that column.
# NOTE(review): A.nrow()/9 looks like C++ integer division, so when nrow(A) is
# not a multiple of 9 the trailing rows would be left out -- confirm.
Rcpp::cppFunction('Rcpp::List m2l(Rcpp::NumericMatrix A) {
Rcpp::List output(A.ncol());
for(int i = 0; i < A.ncol(); ++i) {
output[i] = Rcpp::NumericMatrix(A.nrow()/9, 9, A.column(i).begin());
}
return output;
}')
# Run the compiled helper on bigmat.
m2l(bigmat)
# Compile a C++ helper m2lB(A) with Rcpp::cppFunction().
# m2lB takes a numeric matrix A and returns a list with one element per column
# of A; each element is an (nrow(A) / 9) x 9 matrix filled, in column-major
# order, with the values of that column. The output matrices are allocated
# without initialisation and filled by an explicit copy loop.
# nrow(A) must be a multiple of 9, otherwise an error is raised.
Rcpp::cppFunction('Rcpp::List m2lB(Rcpp::NumericMatrix A) {
  const int n_row = A.nrow();
  const int n_col = A.ncol();
  // B holds (n_row / 9) * 9 values. If n_row were not a multiple of 9 the
  // copy loop below would write past the end of B, so reject such input.
  if (n_row % 9 != 0) {
    Rcpp::stop("nrow(A) must be a multiple of 9");
  }
  Rcpp::List output(n_col);
  for(int i = 0; i < n_col; ++i) {
    Rcpp::NumericMatrix B = Rcpp::no_init(n_row/9, 9);
    for(int j = 0; j < n_row; ++j) {
      B[j] = A(j, i);
    }
    output[i] = B;
  }
  return output;
}')
# Run the loop-based compiled helper on bigmat.
m2lB(bigmat)
# Reproducible small test matrix: 252 rows by 90 columns of normal draws.
set.seed(0)
bigmat <- matrix(rnorm(252 * 90), 252)
# Benchmark every variant. check = TRUE makes bench::mark() verify that all
# expressions return equal results before timing them.
bench::mark(
  check = TRUE,
  # FALSE is spelled out: F is an ordinary variable that can be reassigned.
  "apply" = apply(bigmat, 2, function(x) matrix(x, ncol = 9), simplify = FALSE),
  "asplit" = lapply(asplit(bigmat, 2), matrix, ncol = 9),
  "asDF" = lapply(unname(as.data.frame(bigmat)), matrix, ncol = 9),
  "Rcpp" = m2l(bigmat),
  "RcppB" = m2lB(bigmat),
  # Give a copy of bigmat 3-d dimensions, split it along the third dimension,
  # then strip all attributes from the resulting list.
  "array" = `attributes<-`(
    asplit(
      `dim<-`(bigmat, c(dim(bigmat)[[1]] / 9L, 9L, dim(bigmat)[[2]])),
      3
    ),
    NULL
  ),
  "lapply" = lapply(
    seq.int(ncol(m <- matrix(bigmat, nrow = nrow(bigmat) / 9)) / 9) - 1,
    function(k) m[, 9 * k + (1:9)]
  )
)
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time
# <bch:expr> <bch:t> <bch:t> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm>
#1 apply 268.4µs 287.5µs 3416. 638KB 45.8 1269 17 371ms
#2 asplit 320.8µs 364.8µs 2529. 817KB 43.5 1046 18 414ms
#3 asDF 260.8µs 278.7µs 3476. 459KB 32.8 1484 14 427ms
#4 Rcpp 31.2µs 43.1µs 22611. 185KB 81.4 9165 33 405ms
#5 RcppB 29.4µs 36.9µs 26443. 185KB 98.2 9963 37 377ms
#6 array 261.9µs 283.8µs 3446. 812KB 57.2 1266 21 367ms
#7 lapply 138µs 156µs 6342. 402KB 51.4 2714 22 428ms
Rcpp 版本比原来的
apply
变体快大约 6 倍。