源代码位于 Julia 中,但我的问题更多是关于 LLVM 编译器的行为。我需要一个实用函数来对称化邻接矩阵(基本上将有向图变成无向图),基线实现是
# symmetrize!(A)
#
# In-place symmetrization of a square matrix of 0s and 1s (an adjacency
# matrix): every off-diagonal pair A[i, j] and A[j, i] is replaced by
# A[i, j] | A[j, i], turning a directed graph's adjacency matrix into an
# undirected one. Mutates A; returns nothing useful.
#
# `@assert m == n` rejects non-square input. `@inbounds` disables bounds
# checking for the loop nest, which (as discussed below) lets LLVM
# auto-vectorize the inner loop.
function symmetrize!(A)
m, n = size(A) # matrix dimensions
@assert m == n # in-place symmetrization only makes sense for square A
# Walk the strict upper triangle; the mirrored lower-triangle entry is
# written in the same step, so each pair is visited exactly once.
@inbounds for i in 1:(n - 1)
for j in (i + 1):n
x = A[i, j] | A[j, i] # symmetrize
A[i, j] = x
A[j, i] = x
end
end
end
参数 A 是一个由 0 和 1 组成的矩阵,该函数将其就地对称化。宏 @inbounds 用于关闭边界检查(就我的情况而言,这似乎对编译器的优化产生了影响)。在 Julia 中,矩阵的元素按“列主序”(column-major)存储。出于好奇,我检查了编译器生成的 LLVM IR,令我惊讶的是,其中有一些使用向量指令的基本块:
vector.ph: ; preds = %L43.preheader
%n.vec = and i64 %25, -8
%ind.end = add i64 %21, %n.vec
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%26 = add i64 %value_phi4, %index
%27 = add i64 %23, %26
%28 = getelementptr inbounds i64, i64* %20, i64 %27
%29 = bitcast i64* %28 to <4 x i64>*
%wide.load = load <4 x i64>, <4 x i64>* %29, align 8
%30 = getelementptr inbounds i64, i64* %28, i64 4
%31 = bitcast i64* %30 to <4 x i64>*
%wide.load29 = load <4 x i64>, <4 x i64>* %31, align 8
%32 = add i64 %26, %24
%33 = getelementptr inbounds i64, i64* %20, i64 %32
%34 = bitcast i64* %33 to <4 x i64>*
%wide.load30 = load <4 x i64>, <4 x i64>* %34, align 8
%35 = getelementptr inbounds i64, i64* %33, i64 4
%36 = bitcast i64* %35 to <4 x i64>*
%wide.load31 = load <4 x i64>, <4 x i64>* %36, align 8
%37 = or <4 x i64> %wide.load30, %wide.load
%38 = or <4 x i64> %wide.load31, %wide.load29
store <4 x i64> %37, <4 x i64>* %29, align 8
store <4 x i64> %38, <4 x i64>* %31, align 8
store <4 x i64> %37, <4 x i64>* %34, align 8
store <4 x i64> %38, <4 x i64>* %36, align 8
%index.next = add nuw i64 %index, 8
%39 = icmp eq i64 %index.next, %n.vec
br i1 %39, label %middle.block, label %vector.body
然后我检查了 x64 程序集,发现编译器确实发出了 SIMD 指令。上面基本块对应的部分是
.LBB0_8: # %vector.body
# Parent Loop BB0_4 Depth=1
# => This Inner Loop Header: Depth=2
vmovups -32(%rdx,%r9,8), %ymm0
vmovups (%rdx,%r9,8), %ymm1
vorps -32(%rdi,%r9,8), %ymm0, %ymm0
vorps (%rdi,%r9,8), %ymm1, %ymm1
vmovups %ymm0, -32(%rdi,%r9,8)
vmovups %ymm1, (%rdi,%r9,8)
vmovups %ymm0, -32(%rdx,%r9,8)
vmovups %ymm1, (%rdx,%r9,8)
addq $8, %r9
cmpq %r9, %r10
jne .LBB0_8
最后列出了完整的 LLVM IR 和 x64 程序集。

问题:
我无法理解编译器如何能够自动向量化内部循环。看起来向量化代码在一次迭代中覆盖了 16 个条目,但为什么可以这样做呢?假设我们有一个 N × N 的矩阵,其中 N 非常大。当归纳变量 i 等于 1 时,内部循环应该依次对 (A[1, 2], A[2, 1])、(A[1, 3], A[3, 1])、(A[1, 4], A[4, 1])、(A[1, 5], A[5, 1]) 等数对进行对称化。由于矩阵的条目按列主序存储,元素 A[2, 1]、A[3, 1]、A[4, 1] 和 A[5, 1] 在内存中是连续的,可以用单条指令加载;但元素 A[1, 2]、A[1, 3]、A[1, 4]、A[1, 5] 在内存中相距较远(步长为 N),那么我们如何用向量指令加载它们呢?也许我完全误解了这段 LLVM IR/汇编,但我不明白矩阵对称化函数的内部循环是如何被向量化的。任何帮助将不胜感激!
完整的 LLVM IR:
define void @"julia_symmetrize!_73"({}* noundef nonnull align 16 dereferenceable(40) %0) #0 {
top:
%gcframe33 = alloca [3 x {}*], align 16
%gcframe33.sub = getelementptr inbounds [3 x {}*], [3 x {}*]* %gcframe33, i64 0, i64 0
%1 = bitcast [3 x {}*]* %gcframe33 to i8*
call void @llvm.memset.p0i8.i32(i8* noundef nonnull align 16 dereferenceable(24) %1, i8 0, i32 24, i1 false)
%2 = call {}*** inttoptr (i64 140712069332416 to {}*** ()*)() #7
%3 = bitcast [3 x {}*]* %gcframe33 to i64*
store i64 4, i64* %3, align 16
%4 = getelementptr inbounds [3 x {}*], [3 x {}*]* %gcframe33, i64 0, i64 1
%5 = bitcast {}** %4 to {}***
%6 = load {}**, {}*** %2, align 8
store {}** %6, {}*** %5, align 8
%7 = bitcast {}*** %2 to {}***
store {}** %gcframe33.sub, {}*** %7, align 8
%8 = bitcast {}* %0 to {}**
%9 = getelementptr inbounds {}*, {}** %8, i64 3
%10 = bitcast {}** %9 to i64*
%11 = load i64, i64* %10, align 8
%12 = getelementptr inbounds {}*, {}** %8, i64 4
%13 = bitcast {}** %12 to i64*
%14 = load i64, i64* %13, align 8
%.not = icmp eq i64 %11, %14
br i1 %.not, label %L5, label %L73
L5: ; preds = %top
%15 = add nsw i64 %11, -1
%16 = icmp ult i64 %11, 2
%17 = icmp eq i64 %15, 0
%18 = select i1 %16, i1 true, i1 %17
br i1 %18, label %L72, label %L23.preheader
L23.preheader: ; preds = %L5
%19 = bitcast {}* %0 to i64**
%20 = load i64*, i64** %19, align 8
br label %L23
L23: ; preds = %L61, %L23.preheader
%indvar = phi i64 [ 0, %L23.preheader ], [ %indvar.next, %L61 ]
%value_phi4 = phi i64 [ 1, %L23.preheader ], [ %21, %L61 ]
%21 = add i64 %value_phi4, 1
%.not24 = icmp sgt i64 %21, %11
%value_phi6 = select i1 %.not24, i64 %value_phi4, i64 %11
%.not25 = icmp slt i64 %value_phi6, %21
br i1 %.not25, label %L61, label %L43.preheader
L43.preheader: ; preds = %L23
%22 = xor i64 %indvar, -1
%23 = add i64 %value_phi4, -1
%24 = mul i64 %11, %23
%25 = add i64 %value_phi6, %22
%min.iters.check = icmp ugt i64 %25, 7
%ident.check.not = icmp eq i64 %11, 1
%or.cond = select i1 %min.iters.check, i1 %ident.check.not, i1 false
br i1 %or.cond, label %vector.ph, label %scalar.ph
vector.ph: ; preds = %L43.preheader
%n.vec = and i64 %25, -8
%ind.end = add i64 %21, %n.vec
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%26 = add i64 %value_phi4, %index
%27 = add i64 %23, %26
%28 = getelementptr inbounds i64, i64* %20, i64 %27
%29 = bitcast i64* %28 to <4 x i64>*
%wide.load = load <4 x i64>, <4 x i64>* %29, align 8
%30 = getelementptr inbounds i64, i64* %28, i64 4
%31 = bitcast i64* %30 to <4 x i64>*
%wide.load29 = load <4 x i64>, <4 x i64>* %31, align 8
%32 = add i64 %26, %24
%33 = getelementptr inbounds i64, i64* %20, i64 %32
%34 = bitcast i64* %33 to <4 x i64>*
%wide.load30 = load <4 x i64>, <4 x i64>* %34, align 8
%35 = getelementptr inbounds i64, i64* %33, i64 4
%36 = bitcast i64* %35 to <4 x i64>*
%wide.load31 = load <4 x i64>, <4 x i64>* %36, align 8
%37 = or <4 x i64> %wide.load30, %wide.load
%38 = or <4 x i64> %wide.load31, %wide.load29
store <4 x i64> %37, <4 x i64>* %29, align 8
store <4 x i64> %38, <4 x i64>* %31, align 8
store <4 x i64> %37, <4 x i64>* %34, align 8
store <4 x i64> %38, <4 x i64>* %36, align 8
%index.next = add nuw i64 %index, 8
%39 = icmp eq i64 %index.next, %n.vec
br i1 %39, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
%cmp.n = icmp eq i64 %25, %n.vec
br i1 %cmp.n, label %L61, label %scalar.ph
scalar.ph: ; preds = %middle.block, %L43.preheader
%bc.resume.val = phi i64 [ %ind.end, %middle.block ], [ %21, %L43.preheader ]
br label %L43
L43: ; preds = %L43, %scalar.ph
%value_phi10 = phi i64 [ %49, %L43 ], [ %bc.resume.val, %scalar.ph ]
%40 = add i64 %value_phi10, -1
%41 = mul i64 %40, %11
%42 = add i64 %23, %41
%43 = getelementptr inbounds i64, i64* %20, i64 %42
%44 = load i64, i64* %43, align 8
%45 = add i64 %40, %24
%46 = getelementptr inbounds i64, i64* %20, i64 %45
%47 = load i64, i64* %46, align 8
%48 = or i64 %47, %44
store i64 %48, i64* %43, align 8
store i64 %48, i64* %46, align 8
%.not26.not = icmp eq i64 %value_phi10, %value_phi6
%49 = add i64 %value_phi10, 1
br i1 %.not26.not, label %L61, label %L43
L61: ; preds = %L43, %middle.block, %L23
%.not27 = icmp eq i64 %value_phi4, %15
%indvar.next = add i64 %indvar, 1
br i1 %.not27, label %L72, label %L23
L72: ; preds = %L61, %L5
%50 = load {}*, {}** %4, align 8
%51 = bitcast {}*** %2 to {}**
store {}* %50, {}** %51, align 8
ret void
L73: ; preds = %top
%52 = call [1 x {}*] @j_AssertionError_75({}* inttoptr (i64 2012767056112 to {}*)) #0
%53 = getelementptr inbounds [3 x {}*], [3 x {}*]* %gcframe33, i64 0, i64 2
%54 = extractvalue [1 x {}*] %52, 0
store {}* %54, {}** %53, align 16
%ptls_field34 = getelementptr inbounds {}**, {}*** %2, i64 2
%55 = bitcast {}*** %ptls_field34 to i8**
%ptls_load3536 = load i8*, i8** %55, align 8
%56 = call noalias nonnull {}* @ijl_gc_pool_alloc(i8* %ptls_load3536, i32 1392, i32 16) #1
%57 = bitcast {}* %56 to i64*
%58 = getelementptr inbounds i64, i64* %57, i64 -1
store atomic i64 140709931037936, i64* %58 unordered, align 8
%59 = bitcast {}* %56 to {}**
store {}* %54, {}** %59, align 8
call void @ijl_throw({}* %56)
unreachable
}
.text
.file "symmetrize!"
.globl "julia_symmetrize!_76" # -- Begin function julia_symmetrize!_76
.p2align 4, 0x90
.type "julia_symmetrize!_76",@function
"julia_symmetrize!_76": # @"julia_symmetrize!_76"
.cfi_startproc
# %bb.0: # %top
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset %rbp, -16
movq %rsp, %rbp
.cfi_def_cfa_register %rbp
pushq %r15
pushq %r14
pushq %r13
pushq %r12
pushq %rsi
pushq %rdi
pushq %rbx
subq $120, %rsp
.cfi_offset %rbx, -72
.cfi_offset %rdi, -64
.cfi_offset %rsi, -56
.cfi_offset %r12, -48
.cfi_offset %r13, -40
.cfi_offset %r14, -32
.cfi_offset %r15, -24
movq %rcx, %rdi
movabsq $140709931037936, %rsi # imm = 0x7FF99574B4F0
vxorps %xmm0, %xmm0, %xmm0
vmovaps %xmm0, -96(%rbp)
movq $0, -80(%rbp)
leaq 2138294480(%rsi), %rax
callq *%rax
movq $4, -96(%rbp)
movq (%rax), %rcx
movq %rcx, -88(%rbp)
leaq -96(%rbp), %rcx
movq %rax, -64(%rbp) # 8-byte Spill
movq %rcx, (%rax)
movq 24(%rdi), %r9
cmpq 32(%rdi), %r9
jne .LBB0_14
# %bb.1: # %L5
cmpq $2, %r9
jb .LBB0_13
# %bb.2: # %L5
movq %r9, %r10
decq %r10
je .LBB0_13
# %bb.3: # %L23.preheader
movq (%rdi), %r15
leaq 40(%r15), %rdx
leaq (,%r9,8), %r14
leaq 8(,%r9,8), %rax
movq %rax, -128(%rbp) # 8-byte Spill
leaq -8(%r15), %rbx
movl $1, %r12d
movq $-1, %r13
xorl %r8d, %r8d
movq %rdx, %rdi
movq %r10, -112(%rbp) # 8-byte Spill
movq %r9, -104(%rbp) # 8-byte Spill
jmp .LBB0_4
.p2align 4, 0x90
.LBB0_12: # %L61
# in Loop: Header=BB0_4 Depth=1
incq %r8
addq $16, %rdi
addq -128(%rbp), %rdx # 8-byte Folded Reload
decq %r13
addq %r14, %rbx
addq $8, %r15
cmpq %r10, -136(%rbp) # 8-byte Folded Reload
je .LBB0_13
.LBB0_4: # %L23
# =>This Loop Header: Depth=1
# Child Loop BB0_8 Depth 2
# Child Loop BB0_11 Depth 2
movq %r12, %rax
incq %r12
cmpq %r9, %r12
movq %r9, %rcx
movq %rax, -136(%rbp) # 8-byte Spill
cmovgq %rax, %rcx
cmpq %r12, %rcx
jl .LBB0_12
# %bb.5: # %L43.preheader
# in Loop: Header=BB0_4 Depth=1
movq %r8, %rax
notq %rax
addq %rcx, %rax
movq %r12, %r11
cmpq $8, %rax
jb .LBB0_10
# %bb.6: # %L43.preheader
# in Loop: Header=BB0_4 Depth=1
movq %r12, %r11
cmpq $1, %r9
jne .LBB0_10
# %bb.7: # %vector.ph
# in Loop: Header=BB0_4 Depth=1
movq %rax, %rsi
andq $-8, %rsi
movq %rsi, -120(%rbp) # 8-byte Spill
leaq (%r12,%rsi), %r11
leaq (%rcx,%r13), %r10
andq $-8, %r10
xorl %r9d, %r9d
.p2align 4, 0x90
.LBB0_8: # %vector.body
# Parent Loop BB0_4 Depth=1
# => This Inner Loop Header: Depth=2
vmovups -32(%rdx,%r9,8), %ymm0
vmovups (%rdx,%r9,8), %ymm1
vorps -32(%rdi,%r9,8), %ymm0, %ymm0
vorps (%rdi,%r9,8), %ymm1, %ymm1
vmovups %ymm0, -32(%rdi,%r9,8)
vmovups %ymm1, (%rdi,%r9,8)
vmovups %ymm0, -32(%rdx,%r9,8)
vmovups %ymm1, (%rdx,%r9,8)
addq $8, %r9
cmpq %r9, %r10
jne .LBB0_8
# %bb.9: # %middle.block
# in Loop: Header=BB0_4 Depth=1
cmpq -120(%rbp), %rax # 8-byte Folded Reload
movq -104(%rbp), %r9 # 8-byte Reload
movq -112(%rbp), %r10 # 8-byte Reload
je .LBB0_12
.LBB0_10: # %scalar.ph
# in Loop: Header=BB0_4 Depth=1
leaq -1(%r11), %rax
imulq %r9, %rax
leaq (%r15,%rax,8), %rax
incq %rcx
.p2align 4, 0x90
.LBB0_11: # %L43
# Parent Loop BB0_4 Depth=1
# => This Inner Loop Header: Depth=2
movq (%rbx,%r11,8), %rsi
orq (%rax), %rsi
movq %rsi, (%rax)
movq %rsi, (%rbx,%r11,8)
incq %r11
addq %r14, %rax
cmpq %r11, %rcx
jne .LBB0_11
jmp .LBB0_12
.LBB0_13: # %L72
movq -88(%rbp), %rax
movq -64(%rbp), %rcx # 8-byte Reload
movq %rax, (%rcx)
addq $120, %rsp
popq %rbx
popq %rdi
popq %rsi
popq %r12
popq %r13
popq %r14
popq %r15
popq %rbp
vzeroupper
retq
.LBB0_14: # %L73
movabsq $j_AssertionError_78, %rax
movabsq $2012767056112, %rcx # imm = 0x1D4A243F0F0
callq *%rax
movq %rax, %rdi
movq %rax, -80(%rbp)
movq -64(%rbp), %rax # 8-byte Reload
movq 16(%rax), %rcx
movabsq $ijl_gc_pool_alloc, %rax
movl $1392, %edx # imm = 0x570
movl $16, %r8d
callq *%rax
movq %rsi, -8(%rax)
movq %rdi, (%rax)
movabsq $ijl_throw, %rdx
movq %rax, %rcx
callq *%rdx
.Lfunc_end0:
.size "julia_symmetrize!_76", .Lfunc_end0-"julia_symmetrize!_76"
.cfi_endproc
# -- End function
.section ".note.GNU-stack","",@progbits
这种展开可以在 vector.body 基本块中看到:它最终将循环索引一次增加 8(%index.next = add nuw i64 %index, 8)。
在我的 AVX512 机器上,这最终会进一步展开:
[...]
vector.ph.new: ; preds = %vector.ph
%unroll_iter = and i64 %17, 2305843009213693944
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph.new
[...]
; @ REPL[1]:10 within `symmetrize!`
; ┌ @ array.jl:1022 within `setindex!`
store <8 x i64> %121, <8 x i64>* %118, align 8
store <8 x i64> %122, <8 x i64>* %120, align 8
%index.next.7 = add nuw i64 %index, 128
%niter.next.7 = add i64 %niter, 8
%niter.ncmp.7 = icmp eq i64 %niter.next.7, %unroll_iter
br i1 %niter.ncmp.7, label %middle.block.unr-lcssa, label %vector.body
[...]
我不知道你运行的是哪个 Julia 版本,但在我的版本(仓库最近的一个本地构建)上,标签 %unroll_iter 和 %middle.block.unr-lcssa 仍然存在,这进一步表明这是循环展开(以及随后的重新滚动)造成的。