我想了解是否有一种快速方法可以使用 ARM SIMD 指令进行矩阵转置(64x64 位)。
我尝试探索ARM SIMD的VTRN指令,但不确定它在这个场景中的有效应用。
输入矩阵表示为 uint64 mat[64],输出应该是按位转置。
例如,如果输入是:
0000....
1111....
0000....
1111....
预期产出:
0101....
0101....
0101....
0101....
矩阵转置的基本递归方案是将矩阵表示为块矩阵
AB
CD
首先转置 A、B、C 和 D,然后交换 B 和 C。实际上,这意味着应用一系列越来越粗略的 swizzle 步骤,首先使用按位运算,然后使用排列运算。
这可以例如实现如下:
# transpose a 64x64 bit matrix held in x0
GLOBL(xpose_asm)
FUNC(xpose_asm)
# plan of attack: use registers v16--v32 to hold
# half the array, v0--v7 for scratch. First transpose
# the two array halves individually, then swap the
# second and third quarters.
mov x4, lr
mov x2, x0
bl NAME(xpose_half)
mov x3, x0
bl NAME(xpose_half)
# final step: transpose 64x64 bit matrices
# we have to do this one in two parts as to not run
# out of registers
mov x5, x2
mov x6, x3
bl NAME(xpose_final)
bl NAME(xpose_final)
ret x4
ENDFUNC(xpose_asm)
# Transpose half a 32x64 bit matrix held in x0.
# On return, advance x0 by 32*8 = 256 byte.
FUNC(xpose_half)
# v16 holds rows 0 and 4, v17 holds 1 and 5, and so on
mov x1, x0
ld4 {v16.2d, v17.2d, v18.2d, v19.2d}, [x0], #64
ld4 {v20.2d, v21.2d, v22.2d, v23.2d}, [x0], #64
ld4 {v24.2d, v25.2d, v26.2d, v27.2d}, [x0], #64
ld4 {v28.2d, v29.2d, v30.2d, v31.2d}, [x0], #64
# macro for a transposition step. Trashes v6 and v7
.macro xpstep lo, hi, mask, shift
ushr v6.2d, \lo\().2d, #\shift
shl v7.2d, \hi\().2d, #\shift
bif \lo\().16b, v7.16b, \mask\().16b
bit \hi\().16b, v6.16b, \mask\().16b
.endm
# 1st step: transpose 2x2 bit matrices
movi v0.16b, #0x55
xpstep v16, v17, v0, 1
xpstep v18, v19, v0, 1
xpstep v20, v21, v0, 1
xpstep v22, v23, v0, 1
xpstep v24, v25, v0, 1
xpstep v26, v27, v0, 1
xpstep v28, v29, v0, 1
xpstep v30, v31, v0, 1
# 2nd step: transpose 4x4 bit matrices
movi v0.16b, #0x33
xpstep v16, v18, v0, 2
xpstep v17, v19, v0, 2
xpstep v20, v22, v0, 2
xpstep v21, v23, v0, 2
xpstep v24, v26, v0, 2
xpstep v25, v27, v0, 2
xpstep v28, v30, v0, 2
xpstep v29, v31, v0, 2
# immediate step: zip vectors to change
# colocation. As a side effect, every other
# vector is temporarily relocated to the v0..v7
# register range
zip1 v0.2d, v16.2d, v17.2d
zip2 v17.2d, v16.2d, v17.2d
zip1 v1.2d, v18.2d, v19.2d
zip2 v19.2d, v18.2d, v19.2d
zip1 v2.2d, v20.2d, v21.2d
zip2 v21.2d, v20.2d, v21.2d
zip1 v3.2d, v22.2d, v23.2d
zip2 v23.2d, v22.2d, v23.2d
zip1 v4.2d, v24.2d, v25.2d
zip2 v25.2d, v24.2d, v25.2d
zip1 v5.2d, v26.2d, v27.2d
zip2 v27.2d, v26.2d, v27.2d
zip1 v6.2d, v28.2d, v29.2d
zip2 v29.2d, v28.2d, v29.2d
zip1 v7.2d, v30.2d, v31.2d
zip2 v31.2d, v30.2d, v31.2d
# macro for the 3rd transposition step
# swap low 4 bit of each hi member with
# high 4 bit of each orig member. The orig
# members are copied to lo in the process.
.macro xpstep3 lo, hi, orig
mov \lo\().16b, \orig\().16b
sli \lo\().16b, \hi\().16b, #4
sri \hi\().16b, \orig\().16b, #4
.endm
# 3rd step: transpose 8x8 bit matrices
# special code is needed here since we need to
# swap row n row line n+4, but these rows are
# always colocated in the same register
xpstep3 v16, v17, v0
xpstep3 v18, v19, v1
xpstep3 v20, v21, v2
xpstep3 v22, v23, v3
xpstep3 v24, v25, v4
xpstep3 v26, v27, v5
xpstep3 v28, v29, v6
xpstep3 v30, v31, v7
# registers now hold
# v16: { 0, 1} v17: { 4, 5} v18: { 2, 3} v19: { 6, 7}
# v20: { 8, 9} v21: {12, 13} v22: {10, 11} v23: {14, 15}
# v24: {16, 17} v25: {20, 21} v26: {18, 19} v27: {22, 23}
# v28: {24, 25} v29: {28, 29} v30: {26, 27} v31: {30, 31}
# 4th step: transpose 16x16 bit matrices
# this step again moves half the registers to v0--v7
trn1 v0.16b, v16.16b, v20.16b
trn2 v20.16b, v16.16b, v20.16b
trn1 v1.16b, v17.16b, v21.16b
trn2 v21.16b, v17.16b, v21.16b
trn1 v2.16b, v18.16b, v22.16b
trn2 v22.16b, v18.16b, v22.16b
trn1 v3.16b, v19.16b, v23.16b
trn2 v23.16b, v19.16b, v23.16b
trn1 v4.16b, v24.16b, v28.16b
trn2 v28.16b, v24.16b, v28.16b
trn1 v5.16b, v25.16b, v29.16b
trn2 v29.16b, v25.16b, v29.16b
trn1 v6.16b, v26.16b, v30.16b
trn2 v30.16b, v26.16b, v30.16b
trn1 v7.16b, v27.16b, v31.16b
trn2 v31.16b, v27.16b, v31.16b
# 5th step: transpose 32x32 bit matrices
# while we are at it, shuffle the order of
# entries such that they are in order
trn1 v16.8h, v0.8h, v4.8h
trn2 v24.8h, v0.8h, v4.8h
trn1 v18.8h, v1.8h, v5.8h
trn2 v26.8h, v1.8h, v5.8h
trn1 v17.8h, v2.8h, v6.8h
trn2 v25.8h, v2.8h, v6.8h
trn1 v19.8h, v3.8h, v7.8h
trn2 v27.8h, v3.8h, v7.8h
trn1 v0.8h, v20.8h, v28.8h
trn2 v4.8h, v20.8h, v28.8h
trn1 v2.8h, v21.8h, v29.8h
trn2 v6.8h, v21.8h, v29.8h
trn1 v1.8h, v22.8h, v30.8h
trn2 v5.8h, v22.8h, v30.8h
trn1 v3.8h, v23.8h, v31.8h
trn2 v7.8h, v23.8h, v31.8h
# now deposit the partially transposed matrix
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x1], #64
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [x1], #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [x1], #64
st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [x1], #64
ret
ENDFUNC(xpose_half)
FUNC(xpose_final)
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x2], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [x3], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [x2], #64
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [x3], #64
trn1 v0.4s, v16.4s, v24.4s
trn2 v4.4s, v16.4s, v24.4s
trn1 v1.4s, v17.4s, v25.4s
trn2 v5.4s, v17.4s, v25.4s
trn1 v2.4s, v18.4s, v26.4s
trn2 v6.4s, v18.4s, v26.4s
trn1 v3.4s, v19.4s, v27.4s
trn2 v7.4s, v19.4s, v27.4s
trn1 v16.4s, v20.4s, v28.4s
trn2 v24.4s, v20.4s, v28.4s
trn1 v17.4s, v21.4s, v29.4s
trn2 v25.4s, v21.4s, v29.4s
trn1 v18.4s, v22.4s, v30.4s
trn2 v26.4s, v22.4s, v30.4s
trn1 v19.4s, v23.4s, v31.4s
trn2 v27.4s, v23.4s, v31.4s
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [x5], #64
st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [x6], #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x5], #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [x6], #64
ret
ENDFUNC(xpose_final)
我们可以看到,性能与 Lee 的方法 相比效果很好,大约快了三倍。
# Apple M1
name time/op
Ref 764ns ± 0%
Lee 102ns ± 0%
Fuz 34.7ns ± 0%
name speed
Ref 670MB/s ± 0%
Lee 5.01GB/s ± 0%
Fuz 14.7GB/s ± 0%
# Kunpeng 920
name time/op
Ref 3.73µs ± 0%
Lee 391ns ± 1%
Fuz 96.0ns ± 0%
name speed
Ref 137MB/s ± 0%
Lee 1.31GB/s ± 1%
Fuz 5.33GB/s ± 0%
# ARM Cortex A72
name time/op
Ref 8.13µs ± 0%
Lee 892ns ± 0%
Fuz 296ns ± 0%
name speed
Ref 63.0MB/s ± 0%
Lee 574MB/s ± 0%
Fuz 1.73GB/s ± 0%
# Cavium ThunderX
name time/op
Ref 19.7µs ± 0%
Lee 1.15µs ± 0%
Fuz 690ns ± 0%
name speed
Ref 25.9MB/s ± 0%
Lee 444MB/s ± 0%
Fuz 742MB/s ± 0%
可能还会有进一步的改进。例如,合适的排列掩码可以与
tbl
指令集一起使用,以一次执行多个转置步骤(尤其是步骤 3 到 5)。
请注意,该算法只需加载和写出数组两次。一次转置每个 32x32 子数组(两次调用
xpose_half
),再一次将右上角与左下角的 32x32 子数组交换。在这两种情况下,都使用最大宽度 64 字节加载和存储,将内存操作量减少到最低限度。
.arch armv8-a
.global transposeBitwise64x64_noob
.text
.balign 64
.func
// void transposeBitwise64x64_noob(uint64_t *pDst, uint64_t *pSrc);
pDst .req x0
pSrc0 .req x1
pSrc1 .req x2
pDst1 .req x3
stride .req x4
count .req w5
pDst0 .req pDst
// no "battle plan" here. not needed for such a self-explanatory cakewalk
transposeBitwise64x64_noob:
add pSrc1, pSrc0, #64
movi v6.16b, #0xcc
mov stride, #128
movi v7.16b, #0xaa
sub pDst, pDst, #32
mov count, #2
.balign 16
1:
ld4 {v16.16b, v17.16b, v18.16b, v19.16b}, [pSrc0], stride
ld4 {v20.16b, v21.16b, v22.16b, v23.16b}, [pSrc1], stride
ld4 {v24.16b, v25.16b, v26.16b, v27.16b}, [pSrc0], stride
ld4 {v28.16b, v29.16b, v30.16b, v31.16b}, [pSrc1], stride
stp q16, q20, [pDst, #32]!
subs count, count, #1
stp q17, q21, [pDst, #1*64]
stp q18, q22, [pDst, #2*64]
stp q19, q23, [pDst, #3*64]
stp q24, q28, [pDst, #4*64]
stp q25, q29, [pDst, #5*64]
stp q26, q30, [pDst, #6*64]
stp q27, q31, [pDst, #7*64]
b.ne 1b
// 8x64 matrix transpose virtually finished. What a moron needs zip1/zip2/trn for that?
nop
sub pSrc0, pDst, #32
add pSrc1, pDst, #256-32
mov count, #4
sub pDst0, pDst, #32
add pDst1, pSrc0, #256
1:
// 8x64 matrix transpose finished on-the-fly while reloading. Again, who the hell needs permutation instructions when we have ld2/ld3/ld4?
ld2 {v24.16b, v25.16b}, [pSrc0], #32
ld2 {v26.16b, v27.16b}, [pSrc1], #32
ld2 {v28.16b, v29.16b}, [pSrc0], #32
ld2 {v30.16b, v31.16b}, [pSrc1], #32
subs count, count, #1
// nosy noob shut up remark: the trns below aren't part of the matrix transpose
trn1 v16.2d, v24.2d, v25.2d // row0
trn2 v17.2d, v24.2d, v25.2d // row1
trn1 v18.2d, v26.2d, v27.2d // row2
trn2 v19.2d, v26.2d, v28.2d // row3
trn1 v20.2d, v28.2d, v29.2d // row4
trn2 v21.2d, v28.2d, v29.2d // row5
trn1 v22.2d, v30.2d, v31.2d // row6
trn2 v23.2d, v30.2d, v31.2d // row7
mov v24.16b, v16.16b
mov v25.16b, v17.16b
mov v26.16b, v18.16b
mov v27.16b, v19.16b
sli v16.16b, v20.16b, #4
sli v17.16b, v21.16b, #4
sli v18.16b, v22.16b, #4
sli v19.16b, v23.16b, #4
sri v20.16b, v24.16b, #4
sri v21.16b, v25.16b, #4
sri v22.16b, v26.16b, #4
sri v23.16b, v27.16b, #4
shl v24.16b, v18.16b, #2
shl v25.16b, v19.16b, #2
ushr v26.16b, v16.16b, #2
ushr v27.16b, v17.16b, #2
shl v28.16b, v22.16b, #2
shl v29.16b, v23.16b, #2
ushr v30.16b, v20.16b, #2
ushr v31.16b, v21.16b, #2
bit v16.16b, v24.16b, v6.16b
bit v17.16b, v25.16b, v6.16b
bif v18.16b, v26.16b, v6.16b
bif v19.16b, v27.16b, v6.16b
bit v20.16b, v28.16b, v6.16b
bit v21.16b, v29.16b, v6.16b
bif v22.16b, v30.16b, v6.16b
bif v23.16b, v31.16b, v6.16b
shl v24.16b, v17.16b, #1
ushr v25.16b, v16.16b, #1
shl v26.16b, v19.16b, #1
ushr v27.16b, v18.16b, #1
shl v28.16b, v21.16b, #1
ushr v29.16b, v20.16b, #1
shl v30.16b, v23.16b, #1
ushr v31.16b, v22.16b, #1
bit v16.16b, v24.16b, v7.16b
bif v17.16b, v25.16b, v7.16b
bit v18.16b, v26.16b, v7.16b
bif v19.16b, v27.16b, v7.16b
bit v20.16b, v28.16b, v7.16b
bif v21.16b, v29.16b, v7.16b
bit v22.16b, v30.16b, v7.16b
bif v23.16b, v31.16b, v7.16b
st4 {v16.d, v17.d, v18.d, v19.d}[0], [pDst0], #32
st4 {v16.d, v17.d, v18.d, v19.d}[1], [pDst1], #32
st4 {v20.d, v21.d, v22.d, v23.d}[0], [pDst0], #32
st4 {v20.d, v21.d, v22.d, v23.d}[1], [pDst1], #32
b.ne 1b
// Everyone has a plan until they get punched in the mouth - Mike Tyson
.balign 16
ret
.endfunc
.end
这可能是完美的菜鸟代码:以零延迟完美最小化。
我本来期待 fuz 能提供类似的东西,但是......
尽管如此,这毕竟是一个菜鸟版本,我会给它一个 C 级(befriedigend)。
为什么它不值得获得 A 级,将在下次更新时明确:带宽限制和功耗,这是现代多核处理器的真正问题。
Fuz 的代码(F 级,mangelhaft):
# transpose a 64x64 bit matrix held in x0
GLOBL(xpose_asm)
FUNC(xpose_asm)
# plan of attack: use registers v16--v32 to hold
# half the array, v0--v7 for scratch. First transpose
# the two array halves individually, then swap the
# second and third quarters.
mov x4, lr
mov x2, x0
bl NAME(xpose_half)
mov x3, x0
bl NAME(xpose_half)
# final step: transpose 64x64 bit matrices
# we have to do this one in two parts as to not run
# out of registers
mov x5, x2
mov x6, x3
bl NAME(xpose_final)
bl NAME(xpose_final)
ret x4
ENDFUNC(xpose_asm)
# Transpose half a 32x64 bit matrix held in x0.
# On return, advance x0 by 32*8 = 256 byte.
FUNC(xpose_half)
# v16 holds rows 0 and 4, v17 holds 1 and 5, and so on
mov x1, x0
ld4 {v16.2d, v17.2d, v18.2d, v19.2d}, [x0], #64
ld4 {v20.2d, v21.2d, v22.2d, v23.2d}, [x0], #64
ld4 {v24.2d, v25.2d, v26.2d, v27.2d}, [x0], #64
ld4 {v28.2d, v29.2d, v30.2d, v31.2d}, [x0], #64
# macro for a transposition step. Trashes v6 and v7
.macro xpstep lo, hi, mask, shift
ushr v6.2d, \lo\().2d, #\shift
shl v7.2d, \hi\().2d, #\shift
bif \lo\().16b, v7.16b, \mask\().16b
bit \hi\().16b, v6.16b, \mask\().16b
.endm
# 1st step: transpose 2x2 bit matrices
movi v0.16b, #0x55
xpstep v16, v17, v0, 1
xpstep v18, v19, v0, 1
xpstep v20, v21, v0, 1
xpstep v22, v23, v0, 1
xpstep v24, v25, v0, 1
xpstep v26, v27, v0, 1
xpstep v28, v29, v0, 1
xpstep v30, v31, v0, 1
# 2nd step: transpose 4x4 bit matrices
movi v0.16b, #0x33
xpstep v16, v18, v0, 2
xpstep v17, v19, v0, 2
xpstep v20, v22, v0, 2
xpstep v21, v23, v0, 2
xpstep v24, v26, v0, 2
xpstep v25, v27, v0, 2
xpstep v28, v30, v0, 2
xpstep v29, v31, v0, 2
# immediate step: zip vectors to change
# colocation. As a side effect, every other
# vector is temporarily relocated to the v0..v7
# register range
zip1 v0.2d, v16.2d, v17.2d
zip2 v17.2d, v16.2d, v17.2d
zip1 v1.2d, v18.2d, v19.2d
zip2 v19.2d, v18.2d, v19.2d
zip1 v2.2d, v20.2d, v21.2d
zip2 v21.2d, v20.2d, v21.2d
zip1 v3.2d, v22.2d, v23.2d
zip2 v23.2d, v22.2d, v23.2d
zip1 v4.2d, v24.2d, v25.2d
zip2 v25.2d, v24.2d, v25.2d
zip1 v5.2d, v26.2d, v27.2d
zip2 v27.2d, v26.2d, v27.2d
zip1 v6.2d, v28.2d, v29.2d
zip2 v29.2d, v28.2d, v29.2d
zip1 v7.2d, v30.2d, v31.2d
zip2 v31.2d, v30.2d, v31.2d
# macro for the 3rd transposition step
# swap low 4 bit of each hi member with
# high 4 bit of each orig member. The orig
# members are copied to lo in the process.
.macro xpstep3 lo, hi, orig
mov \lo\().16b, \orig\().16b
sli \lo\().16b, \hi\().16b, #4
sri \hi\().16b, \orig\().16b, #4
.endm
# 3rd step: transpose 8x8 bit matrices
# special code is needed here since we need to
# swap row n row line n+4, but these rows are
# always colocated in the same register
xpstep3 v16, v17, v0
xpstep3 v18, v19, v1
xpstep3 v20, v21, v2
xpstep3 v22, v23, v3
xpstep3 v24, v25, v4
xpstep3 v26, v27, v5
xpstep3 v28, v29, v6
xpstep3 v30, v31, v7
# registers now hold
# v16: { 0, 1} v17: { 4, 5} v18: { 2, 3} v19: { 6, 7}
# v20: { 8, 9} v21: {12, 13} v22: {10, 11} v23: {14, 15}
# v24: {16, 17} v25: {20, 21} v26: {18, 19} v27: {22, 23}
# v28: {24, 25} v29: {28, 29} v30: {26, 27} v31: {30, 31}
# 4th step: transpose 16x16 bit matrices
# this step again moves half the registers to v0--v7
trn1 v0.16b, v16.16b, v20.16b
trn2 v20.16b, v16.16b, v20.16b
trn1 v1.16b, v17.16b, v21.16b
trn2 v21.16b, v17.16b, v21.16b
trn1 v2.16b, v18.16b, v22.16b
trn2 v22.16b, v18.16b, v22.16b
trn1 v3.16b, v19.16b, v23.16b
trn2 v23.16b, v19.16b, v23.16b
trn1 v4.16b, v24.16b, v28.16b
trn2 v28.16b, v24.16b, v28.16b
trn1 v5.16b, v25.16b, v29.16b
trn2 v29.16b, v25.16b, v29.16b
trn1 v6.16b, v26.16b, v30.16b
trn2 v30.16b, v26.16b, v30.16b
trn1 v7.16b, v27.16b, v31.16b
trn2 v31.16b, v27.16b, v31.16b
# 5th step: transpose 32x32 bit matrices
# while we are at it, shuffle the order of
# entries such that they are in order
trn1 v16.8h, v0.8h, v4.8h
trn2 v24.8h, v0.8h, v4.8h
trn1 v18.8h, v1.8h, v5.8h
trn2 v26.8h, v1.8h, v5.8h
trn1 v17.8h, v2.8h, v6.8h
trn2 v25.8h, v2.8h, v6.8h
trn1 v19.8h, v3.8h, v7.8h
trn2 v27.8h, v3.8h, v7.8h
trn1 v0.8h, v20.8h, v28.8h
trn2 v4.8h, v20.8h, v28.8h
trn1 v2.8h, v21.8h, v29.8h
trn2 v6.8h, v21.8h, v29.8h
trn1 v1.8h, v22.8h, v30.8h
trn2 v5.8h, v22.8h, v30.8h
trn1 v3.8h, v23.8h, v31.8h
trn2 v7.8h, v23.8h, v31.8h
# now deposit the partially transposed matrix
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x1], #64
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [x1], #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [x1], #64
st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [x1], #64
ret
ENDFUNC(xpose_half)
FUNC(xpose_final)
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x2], #64
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [x3], #64
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [x2], #64
ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [x3], #64
trn1 v0.4s, v16.4s, v24.4s
trn2 v4.4s, v16.4s, v24.4s
trn1 v1.4s, v17.4s, v25.4s
trn2 v5.4s, v17.4s, v25.4s
trn1 v2.4s, v18.4s, v26.4s
trn2 v6.4s, v18.4s, v26.4s
trn1 v3.4s, v19.4s, v27.4s
trn2 v7.4s, v19.4s, v27.4s
trn1 v16.4s, v20.4s, v28.4s
trn2 v24.4s, v20.4s, v28.4s
trn1 v17.4s, v21.4s, v29.4s
trn2 v25.4s, v21.4s, v29.4s
trn1 v18.4s, v22.4s, v30.4s
trn2 v26.4s, v22.4s, v30.4s
trn1 v19.4s, v23.4s, v31.4s
trn2 v27.4s, v23.4s, v31.4s
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [x5], #64
st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [x6], #64
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x5], #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [x6], #64
ret
ENDFUNC(xpose_final)
我对其他答案进行了一些调查和阅读,并实现了一些不同的方法(在 Rust 和汇编中),并在我的 Apple M2 Max 上对它们进行了基准测试。
tl;dr 使用 NEON 指令比非 NEON 版本有 2 倍的优势。
结果:
recursive: 163 ns/iter (+/- 10)
Hacker's Delight loop: 157 ns/iter (+/- 21)
Hacker's Delight unrolled: 60 ns/iter (+/- 4)
Lees's answer: 37 ns/iter (+/- 3)
fuz's answer: 31 ns/iter (+/- 1)
“递归”算法是一种基本的递归算法,使用 NEON
ld4
指令等稍微优化,但涉及更多的内存移动。
“Hacker's Delight 循环”版本是 Hacker's Delight 的一个非常简单的实现,图 7-3(至少在第一版中),但经过修改以对 64 位单词进行位反转以匹配其他答案。它看起来像:
let mut j = 32;
let mut m = 0xffffffffu64;
while j != 0 {
let mut k = 0;
while k < 64 {
unsafe {
let a = *x.get_unchecked(k + j);
let b = *x.get_unchecked(k);
let t = (a ^ (b >> j)) & m;
*x.get_unchecked_mut(k + j) = a ^ t;
*x.get_unchecked_mut(k) = b ^ (t << j);
}
k = (k + j + 1) & !j;
}
j >>= 1;
m ^= m << j;
}
Rust/LLVM 会自动向量化一点,但不会太多(从 Rust 1.76.0 开始)。
“黑客之乐展开”版本是相同的,但手动展开循环。我没有观察到任何自动矢量化。
“李的答案”版本改编自他的第二个答案,https://stackoverflow.com/a/71653601/10524——但是,我无法让代码输出正确的答案,这可能是我的错在转录它或在原始程序集中,但我现在不会花更多时间修复它。
“fuz 的答案”版本改编自https://stackoverflow.com/a/71555778/10524——这段代码确实有效。
通过切换到汇编/内在函数以实现更有效的内存管理,以及可能使用旋转指令进行更有效的交换(正如本书甚至讨论的那样),Hacker's Delight 展开版本可能还有更多的改进空间。
Lee 的代码将转置位输出到内存的单独区域,这是其速度稍慢的原因之一。如果我们不进行复制,那么它就在福兹答案的误差范围内。如果我们就地进行转换,可能还有一些改进的空间。
fuz 的答案目前看来是赢家。
我已经在 Rust 中实现了所有这些(在适用的情况下使用内联汇编):https://github.com/swenson/binary_matrix/tree/simd-transpose/src
数据大小远远超过寄存器组的大小。您可以选择:
连续商店总是更可取。
#include <arm_neon.h>
void transposeBitwise64x64(uint64_t *pDst, uint64_t *pSrc)
{
uint8x8_t drow0, drow1, drow2, drow3, drow4, drow5, drow6, drow7;
uint8x8_t dtmp0, dtmp1, dtmp2, dtmp3, dtmp4, dtmp5, dtmp6, dtmp7;
uint8x16_t qrow0, qrow1, qrow2, qrow3, qrow4, qrow5, qrow6, qrow7;
uint8x16_t qtmp0, qtmp1, qtmp2, qtmp3, qtmp4, qtmp5, qtmp6, qtmp7;
const intptr_t sstride = 16;
uint8_t *pSrc1, *pSrc2, *pSrcBase;
uint32_t count = 8;
drow0 = vmov_n_u8(0);
drow1 = vmov_n_u8(0);
drow2 = vmov_n_u8(0);
drow3 = vmov_n_u8(0);
drow4 = vmov_n_u8(0);
drow5 = vmov_n_u8(0);
drow6 = vmov_n_u8(0);
drow7 = vmov_n_u8(0);
pSrcBase = (uint8_t *) pSrc;
do {
pSrc1 = pSrcBase;
pSrc2 = pSrcBase + 8;
pSrcBase += 1;
drow0 = vld1_lane_u8(pSrc1, drow0, 0); pSrc1 += sstride;
drow1 = vld1_lane_u8(pSrc2, drow1, 0); pSrc2 += sstride;
drow2 = vld1_lane_u8(pSrc1, drow2, 0); pSrc1 += sstride;
drow3 = vld1_lane_u8(pSrc2, drow3, 0); pSrc2 += sstride;
drow4 = vld1_lane_u8(pSrc1, drow4, 0); pSrc1 += sstride;
drow5 = vld1_lane_u8(pSrc2, drow5, 0); pSrc2 += sstride;
drow6 = vld1_lane_u8(pSrc1, drow6, 0); pSrc1 += sstride;
drow7 = vld1_lane_u8(pSrc2, drow7, 0); pSrc2 += sstride;
drow0 = vld1_lane_u8(pSrc1, drow0, 1); pSrc1 += sstride;
drow1 = vld1_lane_u8(pSrc2, drow1, 1); pSrc2 += sstride;
drow2 = vld1_lane_u8(pSrc1, drow2, 1); pSrc1 += sstride;
drow3 = vld1_lane_u8(pSrc2, drow3, 1); pSrc2 += sstride;
drow4 = vld1_lane_u8(pSrc1, drow4, 1); pSrc1 += sstride;
drow5 = vld1_lane_u8(pSrc2, drow5, 1); pSrc2 += sstride;
drow6 = vld1_lane_u8(pSrc1, drow6, 1); pSrc1 += sstride;
drow7 = vld1_lane_u8(pSrc2, drow7, 1); pSrc2 += sstride;
drow0 = vld1_lane_u8(pSrc1, drow0, 2); pSrc1 += sstride;
drow1 = vld1_lane_u8(pSrc2, drow1, 2); pSrc2 += sstride;
drow2 = vld1_lane_u8(pSrc1, drow2, 2); pSrc1 += sstride;
drow3 = vld1_lane_u8(pSrc2, drow3, 2); pSrc2 += sstride;
drow4 = vld1_lane_u8(pSrc1, drow4, 2); pSrc1 += sstride;
drow5 = vld1_lane_u8(pSrc2, drow5, 2); pSrc2 += sstride;
drow6 = vld1_lane_u8(pSrc1, drow6, 2); pSrc1 += sstride;
drow7 = vld1_lane_u8(pSrc2, drow7, 2); pSrc2 += sstride;
drow0 = vld1_lane_u8(pSrc1, drow0, 3); pSrc1 += sstride;
drow1 = vld1_lane_u8(pSrc2, drow1, 3); pSrc2 += sstride;
drow2 = vld1_lane_u8(pSrc1, drow2, 3); pSrc1 += sstride;
drow3 = vld1_lane_u8(pSrc2, drow3, 3); pSrc2 += sstride;
drow4 = vld1_lane_u8(pSrc1, drow4, 3); pSrc1 += sstride;
drow5 = vld1_lane_u8(pSrc2, drow5, 3); pSrc2 += sstride;
drow6 = vld1_lane_u8(pSrc1, drow6, 3); pSrc1 += sstride;
drow7 = vld1_lane_u8(pSrc2, drow7, 3); pSrc2 += sstride;
drow0 = vld1_lane_u8(pSrc1, drow0, 4); pSrc1 += sstride;
drow1 = vld1_lane_u8(pSrc2, drow1, 4); pSrc2 += sstride;
drow2 = vld1_lane_u8(pSrc1, drow2, 4); pSrc1 += sstride;
drow3 = vld1_lane_u8(pSrc2, drow3, 4); pSrc2 += sstride;
drow4 = vld1_lane_u8(pSrc1, drow4, 4); pSrc1 += sstride;
drow5 = vld1_lane_u8(pSrc2, drow5, 4); pSrc2 += sstride;
drow6 = vld1_lane_u8(pSrc1, drow6, 4); pSrc1 += sstride;
drow7 = vld1_lane_u8(pSrc2, drow7, 4); pSrc2 += sstride;
drow0 = vld1_lane_u8(pSrc1, drow0, 5); pSrc1 += sstride;
drow1 = vld1_lane_u8(pSrc2, drow1, 5); pSrc2 += sstride;
drow2 = vld1_lane_u8(pSrc1, drow2, 5); pSrc1 += sstride;
drow3 = vld1_lane_u8(pSrc2, drow3, 5); pSrc2 += sstride;
drow4 = vld1_lane_u8(pSrc1, drow4, 5); pSrc1 += sstride;
drow5 = vld1_lane_u8(pSrc2, drow5, 5); pSrc2 += sstride;
drow6 = vld1_lane_u8(pSrc1, drow6, 5); pSrc1 += sstride;
drow7 = vld1_lane_u8(pSrc2, drow7, 5); pSrc2 += sstride;
drow0 = vld1_lane_u8(pSrc1, drow0, 6); pSrc1 += sstride;
drow1 = vld1_lane_u8(pSrc2, drow1, 6); pSrc2 += sstride;
drow2 = vld1_lane_u8(pSrc1, drow2, 6); pSrc1 += sstride;
drow3 = vld1_lane_u8(pSrc2, drow3, 6); pSrc2 += sstride;
drow4 = vld1_lane_u8(pSrc1, drow4, 6); pSrc1 += sstride;
drow5 = vld1_lane_u8(pSrc2, drow5, 6); pSrc2 += sstride;
drow6 = vld1_lane_u8(pSrc1, drow6, 6); pSrc1 += sstride;
drow7 = vld1_lane_u8(pSrc2, drow7, 6); pSrc2 += sstride;
drow0 = vld1_lane_u8(pSrc1, drow0, 7); pSrc1 += sstride;
drow1 = vld1_lane_u8(pSrc2, drow1, 7); pSrc2 += sstride;
drow2 = vld1_lane_u8(pSrc1, drow2, 7); pSrc1 += sstride;
drow3 = vld1_lane_u8(pSrc2, drow3, 7); pSrc2 += sstride;
drow4 = vld1_lane_u8(pSrc1, drow4, 7); pSrc1 += sstride;
drow5 = vld1_lane_u8(pSrc2, drow5, 7); pSrc2 += sstride;
drow6 = vld1_lane_u8(pSrc1, drow6, 7);
drow7 = vld1_lane_u8(pSrc2, drow7, 7);
dtmp0 = vshr_n_u8(drow0, 1);
dtmp1 = vshr_n_u8(drow1, 1);
dtmp2 = vshr_n_u8(drow2, 1);
dtmp3 = vshr_n_u8(drow3, 1);
dtmp4 = vshr_n_u8(drow4, 1);
dtmp5 = vshr_n_u8(drow5, 1);
dtmp6 = vshr_n_u8(drow6, 1);
dtmp7 = vshr_n_u8(drow7, 1);
qrow0 = vcombine_u8(drow0, dtmp0);
qrow1 = vcombine_u8(drow1, dtmp1);
qrow2 = vcombine_u8(drow2, dtmp2);
qrow3 = vcombine_u8(drow3, dtmp3);
qrow4 = vcombine_u8(drow4, dtmp4);
qrow5 = vcombine_u8(drow5, dtmp5);
qrow6 = vcombine_u8(drow6, dtmp6);
qrow7 = vcombine_u8(drow7, dtmp7);
//////////////////////////////////////
qtmp0 = qrow0;
qtmp1 = qrow1;
qtmp2 = qrow2;
qtmp3 = qrow3;
qtmp4 = qrow4;
qtmp5 = qrow5;
qtmp6 = qrow6;
qtmp7 = qrow7;
qtmp0 = vsliq_n_u8(qtmp0, qtmp1, 1);
qtmp2 = vsliq_n_u8(qtmp2, qtmp3, 1);
qtmp4 = vsliq_n_u8(qtmp4, qtmp5, 1);
qtmp6 = vsliq_n_u8(qtmp6, qtmp7, 1);
qtmp0 = vsliq_n_u8(qtmp0, qtmp2, 2);
qtmp4 = vsliq_n_u8(qtmp4, qtmp6, 2);
qtmp0 = vsliq_n_u8(qtmp0, qtmp4, 4);
vst1q_u8((uint8_t *)pDst, qtmp0); pDst += 2;
//////////////////////////////////////
qtmp0 = vshrq_n_u8(qrow0, 2);
qtmp1 = vshrq_n_u8(qrow1, 2);
qtmp2 = vshrq_n_u8(qrow2, 2);
qtmp3 = vshrq_n_u8(qrow3, 2);
qtmp4 = vshrq_n_u8(qrow4, 2);
qtmp5 = vshrq_n_u8(qrow5, 2);
qtmp6 = vshrq_n_u8(qrow6, 2);
qtmp7 = vshrq_n_u8(qrow7, 2);
qtmp0 = vsliq_n_u8(qtmp0, qtmp1, 1);
qtmp2 = vsliq_n_u8(qtmp2, qtmp3, 1);
qtmp4 = vsliq_n_u8(qtmp4, qtmp5, 1);
qtmp6 = vsliq_n_u8(qtmp6, qtmp7, 1);
qtmp0 = vsliq_n_u8(qtmp0, qtmp2, 2);
qtmp4 = vsliq_n_u8(qtmp4, qtmp6, 2);
qtmp0 = vsliq_n_u8(qtmp0, qtmp4, 4);
vst1q_u8((uint8_t *)pDst, qtmp0); pDst += 2;
//////////////////////////////////////
qtmp0 = vshrq_n_u8(qrow0, 4);
qtmp1 = vshrq_n_u8(qrow1, 4);
qtmp2 = vshrq_n_u8(qrow2, 4);
qtmp3 = vshrq_n_u8(qrow3, 4);
qtmp4 = vshrq_n_u8(qrow4, 4);
qtmp5 = vshrq_n_u8(qrow5, 4);
qtmp6 = vshrq_n_u8(qrow6, 4);
qtmp7 = vshrq_n_u8(qrow7, 4);
qtmp0 = vsliq_n_u8(qtmp0, qtmp1, 1);
qtmp2 = vsliq_n_u8(qtmp2, qtmp3, 1);
qtmp4 = vsliq_n_u8(qtmp4, qtmp5, 1);
qtmp6 = vsliq_n_u8(qtmp6, qtmp7, 1);
qtmp0 = vsliq_n_u8(qtmp0, qtmp2, 2);
qtmp4 = vsliq_n_u8(qtmp4, qtmp6, 2);
qtmp0 = vsliq_n_u8(qtmp0, qtmp4, 4);
vst1q_u8((uint8_t *)pDst, qtmp0); pDst += 2;
//////////////////////////////////////
qtmp0 = vshrq_n_u8(qrow0, 6);
qtmp1 = vshrq_n_u8(qrow1, 6);
qtmp2 = vshrq_n_u8(qrow2, 6);
qtmp3 = vshrq_n_u8(qrow3, 6);
qtmp4 = vshrq_n_u8(qrow4, 6);
qtmp5 = vshrq_n_u8(qrow5, 6);
qtmp6 = vshrq_n_u8(qrow6, 6);
qtmp7 = vshrq_n_u8(qrow7, 6);
qtmp0 = vsliq_n_u8(qtmp0, qtmp1, 1);
qtmp2 = vsliq_n_u8(qtmp2, qtmp3, 1);
qtmp4 = vsliq_n_u8(qtmp4, qtmp5, 1);
qtmp6 = vsliq_n_u8(qtmp6, qtmp7, 1);
qtmp0 = vsliq_n_u8(qtmp0, qtmp2, 2);
qtmp4 = vsliq_n_u8(qtmp4, qtmp6, 2);
qtmp0 = vsliq_n_u8(qtmp0, qtmp4, 4);
vst1q_u8((uint8_t *)pDst, qtmp0); pDst += 2;
} while (--count);
}
我尽力让编译器生成优化的机器代码,但他们就是不听:godbolt
尤其是 GCC 很糟糕(一如既往)。
我将在明天之前添加汇编版本。