AVX512 汇编和 C++ 矩阵向量函数有奇怪的速度差异

问题描述 投票:0回答:1

我尝试编写一些函数来使用单个矩阵和源向量数组来执行矩阵向量乘法。我曾经用 C++ 编写过这些函数,并在 x86 AVX512 Assembly 中编写过一次,以将性能与英特尔 VTune Profiler 进行比较。当使用源向量数组作为目标数组时,汇编变体的执行速度比 C++ 对应项快 3.5 倍到 10 倍,但是当使用不同的源和目标数组时,汇编变体的性能几乎不比 C++ 对应项更好,实现几乎相同的性能...有时甚至更糟。

我无法理解的另一件事是,为什么在使用不同的源和目标数组时,C++ 对应版本甚至可以达到与汇编变体接近相同或更好的性能水平,即使汇编代码要短得多并且也根据静态分析工具 uica 和 llvm-mca 速度提高数倍。 uica.uops.info

我不想让这篇文章写得太长,所以我只发布执行 mat4-vec4 乘法的函数的代码。

这是汇编变体的代码,它假设矩阵要转置:

alignas(64) uint32_t mat4_mul_vec4_avx512_vpermps_index[64]{    0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12,
                                                            1, 1, 1, 1, 5, 5, 5, 5, 9, 9, 9, 9, 13, 13, 13, 13,
                                                            2, 2, 2, 2, 6, 6, 6, 6, 10, 10, 10, 10, 14, 14, 14, 14,
                                                            3, 3, 3, 3, 7, 7, 7, 7, 11, 11, 11, 11, 15, 15, 15, 15 };

void __declspec(naked, align(64)) mat4_mul_vec4_avx512(vec4f_t* destination, const mat4f_t& src1, const vec4f_t* src2, uint32_t vector_count) {
__asm {
    vbroadcastf32x4 zmm16, xmmword ptr[rdx]
    vbroadcastf32x4 zmm17, xmmword ptr[rdx + 16]

    vbroadcastf32x4 zmm18, xmmword ptr[rdx + 32]
    vbroadcastf32x4 zmm19, xmmword ptr[rdx + 48]

    vmovups zmm20, zmmword ptr[mat4_mul_vec4_avx512_vpermps_index]
    vmovups zmm21, zmmword ptr[mat4_mul_vec4_avx512_vpermps_index + 64]

    vmovups zmm22, zmmword ptr[mat4_mul_vec4_avx512_vpermps_index + 128]
    vmovups zmm23, zmmword ptr[mat4_mul_vec4_avx512_vpermps_index + 192]

    vmovups zmm24, zmmword ptr[r8]

    vpermps zmm25, zmm20, zmm24
    vpermps zmm26, zmm21, zmm24
    vpermps zmm27, zmm22, zmm24
    vpermps zmm28, zmm23, zmm24

    xor eax, eax

    align 32
    mat4_mul_vec4_avx512_loop:

        vmovups zmm24, zmmword ptr[r8+rax+64]

        vmulps zmm29, zmm16, zmm25
        vpermps zmm25, zmm20, zmm24

        vfmadd231ps zmm29, zmm17, zmm26
        vpermps zmm26, zmm21, zmm24

        vfmadd231ps zmm29, zmm18, zmm27
        vpermps zmm27, zmm22, zmm24

        vfmadd231ps zmm29, zmm19, zmm28
        vpermps zmm28, zmm23, zmm24

        vmovups zmmword ptr[rcx+rax], zmm29

    add rax, 64

    sub r9, 4
    jnz mat4_mul_vec4_avx512_loop

    ret
    }
}

这是 C++ 变体,它假设矩阵不被转置:

void mat4_mul_vec4_cpp(vec4f_t* destination, const mat4f_t& src1, const vec4f_t* src2, uint32_t vector_count) {
for (uint32_t i0{}; i0 < vector_count; ++i0) {
    destination[i0].element_[0] = src1.element_[0][0] * src2[i0].element_[0] + src1.element_[0][1] * src2[i0].element_[1] + src1.element_[0][2] * src2[i0].element_[2] + src1.element_[0][3] * src2[i0].element_[3];
    destination[i0].element_[1] = src1.element_[1][0] * src2[i0].element_[0] + src1.element_[1][1] * src2[i0].element_[1] + src1.element_[1][2] * src2[i0].element_[2] + src1.element_[1][3] * src2[i0].element_[3];
    destination[i0].element_[2] = src1.element_[2][0] * src2[i0].element_[0] + src1.element_[2][1] * src2[i0].element_[1] + src1.element_[2][2] * src2[i0].element_[2] + src1.element_[2][3] * src2[i0].element_[3];
    destination[i0].element_[3] = src1.element_[3][0] * src2[i0].element_[0] + src1.element_[3][1] * src2[i0].element_[1] + src1.element_[3][2] * src2[i0].element_[2] + src1.element_[3][3] * src2[i0].element_[3];
    }
}

英特尔 C++ 编译器生成以下汇编代码:

00007FF69F123D50  sub         rsp,38h  
00007FF69F123D54  vmovaps     xmmword ptr [rsp+20h],xmm8  
00007FF69F123D5A  vmovaps     xmmword ptr [rsp+10h],xmm7  
00007FF69F123D60  vmovaps     xmmword ptr [rsp],xmm6  
        destination[i0].element_[0] = src1.element_[0][0] * src2[i0].element_[0] + src1.element_[0][1] * src2[i0].element_[1] + src1.element_[0][2] * src2[i0].element_[2] + src1.element_[0][3] * src2[i0].element_[3];
00007FF69F123D65  vmovss      xmm0,dword ptr [rdx]  
00007FF69F123D69  vmovss      xmm1,dword ptr [rdx+4]  
00007FF69F123D6E  vmovss      xmm2,dword ptr [rdx+8]  
00007FF69F123D73  vmovss      xmm3,dword ptr [rdx+0Ch]  
        destination[i0].element_[1] = src1.element_[1][0] * src2[i0].element_[0] + src1.element_[1][1] * src2[i0].element_[1] + src1.element_[1][2] * src2[i0].element_[2] + src1.element_[1][3] * src2[i0].element_[3];
00007FF69F123D78  vmovss      xmm4,dword ptr [rdx+10h]  
00007FF69F123D7D  vmovss      xmm5,dword ptr [rdx+14h]  
00007FF69F123D82  vmovss      xmm16,dword ptr [rdx+18h]  
00007FF69F123D89  vmovss      xmm17,dword ptr [rdx+1Ch]  
        destination[i0].element_[2] = src1.element_[2][0] * src2[i0].element_[0] + src1.element_[2][1] * src2[i0].element_[1] + src1.element_[2][2] * src2[i0].element_[2] + src1.element_[2][3] * src2[i0].element_[3];
00007FF69F123D90  vmovss      xmm18,dword ptr [rdx+20h]  
00007FF69F123D97  vmovss      xmm19,dword ptr [rdx+24h]  
00007FF69F123D9E  vmovss      xmm20,dword ptr [rdx+28h]  
00007FF69F123DA5  vmovss      xmm21,dword ptr [rdx+2Ch]  
        destination[i0].element_[3] = src1.element_[3][0] * src2[i0].element_[0] + src1.element_[3][1] * src2[i0].element_[1] + +src1.element_[3][2] * src2[i0].element_[2] + src1.element_[3][3] * src2[i0].element_[3];
00007FF69F123DAC  vmovss      xmm22,dword ptr [rdx+30h]  
00007FF69F123DB3  vmovss      xmm23,dword ptr [rdx+34h]  
00007FF69F123DBA  vmovss      xmm24,dword ptr [rdx+38h]  
00007FF69F123DC1  vmovss      xmm25,dword ptr [rdx+3Ch]  
    for (uint32_t i0{}; i0 < vector_count; ++i0) {
00007FF69F123DC8  lea         rax,[r8+3A9800h]  
00007FF69F123DCF  cmp         rax,rcx  
00007FF69F123DD2  jbe         mat4_mul_vec4_cpp+150h (07FF69F123EA0h)  
00007FF69F123DD8  lea         rax,[rcx+3A9800h]  
00007FF69F123DDF  cmp         rax,r8  
00007FF69F123DE2  jbe         mat4_mul_vec4_cpp+150h (07FF69F123EA0h)  
00007FF69F123DE8  mov         eax,0Ch  
00007FF69F123DED  nop         dword ptr [rax]  
        destination[i0].element_[0] = src1.element_[0][0] * src2[i0].element_[0] + src1.element_[0][1] * src2[i0].element_[1] + src1.element_[0][2] * src2[i0].element_[2] + src1.element_[0][3] * src2[i0].element_[3];
00007FF69F123DF0  vmulss      xmm26,xmm0,dword ptr [r8+rax-0Ch]  
00007FF69F123DF8  vfmadd231ss xmm26,xmm1,dword ptr [r8+rax-8]  
00007FF69F123E00  vfmadd231ss xmm26,xmm2,dword ptr [r8+rax-4]  
00007FF69F123E08  vfmadd231ss xmm26,xmm3,dword ptr [r8+rax]  
00007FF69F123E0F  vmovss      dword ptr [rcx+rax-0Ch],xmm26  
        destination[i0].element_[1] = src1.element_[1][0] * src2[i0].element_[0] + src1.element_[1][1] * src2[i0].element_[1] + src1.element_[1][2] * src2[i0].element_[2] + src1.element_[1][3] * src2[i0].element_[3];
00007FF69F123E17  vmulss      xmm26,xmm4,dword ptr [r8+rax-0Ch]  
00007FF69F123E1F  vfmadd231ss xmm26,xmm5,dword ptr [r8+rax-8]  
00007FF69F123E27  vfmadd231ss xmm26,xmm16,dword ptr [r8+rax-4]  
00007FF69F123E2F  vfmadd231ss xmm26,xmm17,dword ptr [r8+rax]  
00007FF69F123E36  vmovss      dword ptr [rcx+rax-8],xmm26  
        destination[i0].element_[2] = src1.element_[2][0] * src2[i0].element_[0] + src1.element_[2][1] * src2[i0].element_[1] + src1.element_[2][2] * src2[i0].element_[2] + src1.element_[2][3] * src2[i0].element_[3];
00007FF69F123E3E  vmulss      xmm26,xmm18,dword ptr [r8+rax-0Ch]  
00007FF69F123E46  vfmadd231ss xmm26,xmm19,dword ptr [r8+rax-8]  
00007FF69F123E4E  vfmadd231ss xmm26,xmm20,dword ptr [r8+rax-4]  
00007FF69F123E56  vfmadd231ss xmm26,xmm21,dword ptr [r8+rax]  
00007FF69F123E5D  vmovss      dword ptr [rcx+rax-4],xmm26  
        destination[i0].element_[3] = src1.element_[3][0] * src2[i0].element_[0] + src1.element_[3][1] * src2[i0].element_[1] + +src1.element_[3][2] * src2[i0].element_[2] + src1.element_[3][3] * src2[i0].element_[3];
00007FF69F123E65  vmulss      xmm26,xmm22,dword ptr [r8+rax-0Ch]  
00007FF69F123E6D  vfmadd231ss xmm26,xmm23,dword ptr [r8+rax-8]  
00007FF69F123E75  vfmadd231ss xmm26,xmm24,dword ptr [r8+rax-4]  
00007FF69F123E7D  vfmadd231ss xmm26,xmm25,dword ptr [r8+rax]  
00007FF69F123E84  vmovss      dword ptr [rcx+rax],xmm26  
    for (uint32_t i0{}; i0 < vector_count; ++i0) {
00007FF69F123E8B  add         rax,10h  
00007FF69F123E8F  cmp         rax,3A980Ch  
00007FF69F123E95  jne         mat4_mul_vec4_cpp+0A0h (07FF69F123DF0h)  
00007FF69F123E9B  jmp         mat4_mul_vec4_cpp+2FEh (07FF69F12404Eh)  
00007FF69F123EA0  vbroadcastss ymm0,xmm0  
00007FF69F123EA5  vbroadcastss ymm1,xmm1  
00007FF69F123EAA  vbroadcastss ymm2,xmm2  
00007FF69F123EAF  vbroadcastss ymm3,xmm3  
00007FF69F123EB4  vbroadcastss ymm4,xmm4  
00007FF69F123EB9  vbroadcastss ymm5,xmm5  
00007FF69F123EBE  vbroadcastss ymm16,xmm16  
00007FF69F123EC4  vbroadcastss ymm17,xmm17  
00007FF69F123ECA  vbroadcastss ymm18,xmm18  
00007FF69F123ED0  vbroadcastss ymm19,xmm19  
00007FF69F123ED6  vbroadcastss ymm20,xmm20  
00007FF69F123EDC  vbroadcastss ymm21,xmm21  
00007FF69F123EE2  vbroadcastss ymm22,xmm22  
00007FF69F123EE8  vbroadcastss ymm23,xmm23  
00007FF69F123EEE  vbroadcastss ymm24,xmm24  
00007FF69F123EF4  vbroadcastss ymm25,xmm25  
00007FF69F123EFA  xor         eax,eax  
        destination[i0].element_[0] = src1.element_[0][0] * src2[i0].element_[0] + src1.element_[0][1] * src2[i0].element_[1] + src1.element_[0][2] * src2[i0].element_[2] + src1.element_[0][3] * src2[i0].element_[3];
00007FF69F123EFC  vmovups     xmm26,xmmword ptr [r8+rax]  
00007FF69F123F03  vmovups     xmm27,xmmword ptr [r8+rax+10h]  
00007FF69F123F0B  vmovups     xmm28,xmmword ptr [r8+rax+20h]  
00007FF69F123F13  vmovups     xmm29,xmmword ptr [r8+rax+30h]  
00007FF69F123F1B  vinsertf32x4 ymm26,ymm26,xmmword ptr [r8+rax+40h],1  
00007FF69F123F24  vinsertf32x4 ymm27,ymm27,xmmword ptr [r8+rax+50h],1  
00007FF69F123F2D  vinsertf32x4 ymm28,ymm28,xmmword ptr [r8+rax+60h],1  
00007FF69F123F36  vinsertf32x4 ymm29,ymm29,xmmword ptr [r8+rax+70h],1  
00007FF69F123F3F  vshufps     ymm30,ymm26,ymm27,14h  
00007FF69F123F46  vshufps     ymm31,ymm29,ymm28,41h  
00007FF69F123F4D  vshufps     ymm6,ymm30,ymm31,6Ch  
00007FF69F123F54  vmulps      ymm7,ymm6,ymm0  
        destination[i0].element_[1] = src1.element_[1][0] * src2[i0].element_[0] + src1.element_[1][1] * src2[i0].element_[1] + src1.element_[1][2] * src2[i0].element_[2] + src1.element_[1][3] * src2[i0].element_[3];
00007FF69F123F58  vmulps      ymm8,ymm6,ymm4  
        destination[i0].element_[0] = src1.element_[0][0] * src2[i0].element_[0] + src1.element_[0][1] * src2[i0].element_[1] + src1.element_[0][2] * src2[i0].element_[2] + src1.element_[0][3] * src2[i0].element_[3];
00007FF69F123F5C  vshufps     ymm30,ymm30,ymm31,39h  
        destination[i0].element_[2] = src1.element_[2][0] * src2[i0].element_[0] + src1.element_[2][1] * src2[i0].element_[1] + src1.element_[2][2] * src2[i0].element_[2] + src1.element_[2][3] * src2[i0].element_[3];
00007FF69F123F63  vmulps      ymm31,ymm6,ymm18  
        destination[i0].element_[3] = src1.element_[3][0] * src2[i0].element_[0] + src1.element_[3][1] * src2[i0].element_[1] + +src1.element_[3][2] * src2[i0].element_[2] + src1.element_[3][3] * src2[i0].element_[3];
00007FF69F123F69  vmulps      ymm6,ymm6,ymm22  
        destination[i0].element_[0] = src1.element_[0][0] * src2[i0].element_[0] + src1.element_[0][1] * src2[i0].element_[1] + src1.element_[0][2] * src2[i0].element_[2] + src1.element_[0][3] * src2[i0].element_[3];
00007FF69F123F6F  vfmadd231ps ymm7,ymm30,ymm1  
        destination[i0].element_[1] = src1.element_[1][0] * src2[i0].element_[0] + src1.element_[1][1] * src2[i0].element_[1] + src1.element_[1][2] * src2[i0].element_[2] + src1.element_[1][3] * src2[i0].element_[3];
00007FF69F123F75  vfmadd231ps ymm8,ymm30,ymm5  
        destination[i0].element_[2] = src1.element_[2][0] * src2[i0].element_[0] + src1.element_[2][1] * src2[i0].element_[1] + src1.element_[2][2] * src2[i0].element_[2] + src1.element_[2][3] * src2[i0].element_[3];
00007FF69F123F7B  vfmadd231ps ymm31,ymm30,ymm19  
        destination[i0].element_[3] = src1.element_[3][0] * src2[i0].element_[0] + src1.element_[3][1] * src2[i0].element_[1] + +src1.element_[3][2] * src2[i0].element_[2] + src1.element_[3][3] * src2[i0].element_[3];
00007FF69F123F81  vfmadd231ps ymm6,ymm23,ymm30  
        destination[i0].element_[0] = src1.element_[0][0] * src2[i0].element_[0] + src1.element_[0][1] * src2[i0].element_[1] + src1.element_[0][2] * src2[i0].element_[2] + src1.element_[0][3] * src2[i0].element_[3];
00007FF69F123F87  vshufps     ymm26,ymm26,ymm27,0BEh  
00007FF69F123F8E  vshufps     ymm27,ymm29,ymm28,0EBh  
00007FF69F123F95  vshufps     ymm28,ymm26,ymm27,6Ch  
00007FF69F123F9C  vfmadd231ps ymm7,ymm28,ymm2  
        destination[i0].element_[1] = src1.element_[1][0] * src2[i0].element_[0] + src1.element_[1][1] * src2[i0].element_[1] + src1.element_[1][2] * src2[i0].element_[2] + src1.element_[1][3] * src2[i0].element_[3];
00007FF69F123FA2  vfmadd231ps ymm8,ymm28,ymm16  
        destination[i0].element_[2] = src1.element_[2][0] * src2[i0].element_[0] + src1.element_[2][1] * src2[i0].element_[1] + src1.element_[2][2] * src2[i0].element_[2] + src1.element_[2][3] * src2[i0].element_[3];
00007FF69F123FA8  vfmadd231ps ymm31,ymm28,ymm20  
        destination[i0].element_[3] = src1.element_[3][0] * src2[i0].element_[0] + src1.element_[3][1] * src2[i0].element_[1] + +src1.element_[3][2] * src2[i0].element_[2] + src1.element_[3][3] * src2[i0].element_[3];
00007FF69F123FAE  vfmadd231ps ymm6,ymm24,ymm28  
        destination[i0].element_[0] = src1.element_[0][0] * src2[i0].element_[0] + src1.element_[0][1] * src2[i0].element_[1] + src1.element_[0][2] * src2[i0].element_[2] + src1.element_[0][3] * src2[i0].element_[3];
00007FF69F123FB4  vshufps     ymm26,ymm26,ymm27,39h  
00007FF69F123FBB  vfmadd231ps ymm7,ymm26,ymm3  
        destination[i0].element_[1] = src1.element_[1][0] * src2[i0].element_[0] + src1.element_[1][1] * src2[i0].element_[1] + src1.element_[1][2] * src2[i0].element_[2] + src1.element_[1][3] * src2[i0].element_[3];
00007FF69F123FC1  vfmadd231ps ymm8,ymm26,ymm17  
        destination[i0].element_[2] = src1.element_[2][0] * src2[i0].element_[0] + src1.element_[2][1] * src2[i0].element_[1] + src1.element_[2][2] * src2[i0].element_[2] + src1.element_[2][3] * src2[i0].element_[3];
00007FF69F123FC7  vfmadd231ps ymm31,ymm26,ymm21  
        destination[i0].element_[3] = src1.element_[3][0] * src2[i0].element_[0] + src1.element_[3][1] * src2[i0].element_[1] + +src1.element_[3][2] * src2[i0].element_[2] + src1.element_[3][3] * src2[i0].element_[3];
00007FF69F123FCD  vfmadd231ps ymm6,ymm25,ymm26  
00007FF69F123FD3  vpunpckldq  ymm26,ymm7,ymm8  
00007FF69F123FD9  vpunpckldq  ymm27,ymm31,ymm6  
00007FF69F123FDF  vpunpckhdq  ymm28,ymm7,ymm8  
00007FF69F123FE5  vpunpckhdq  ymm29,ymm31,ymm6  
00007FF69F123FEB  vpunpcklqdq ymm30,ymm26,ymm27  
00007FF69F123FF1  vpunpckhqdq ymm26,ymm26,ymm27  
00007FF69F123FF7  vpunpcklqdq ymm27,ymm28,ymm29  
00007FF69F123FFD  vpunpckhqdq ymm28,ymm28,ymm29  
00007FF69F124003  vinsertf32x4 ymm29,ymm30,xmm26,1  
00007FF69F12400A  vmovups     ymmword ptr [rcx+rax],ymm29  
00007FF69F124011  vinsertf32x4 ymm29,ymm27,xmm28,1  
00007FF69F124018  vmovups     ymmword ptr [rcx+rax+20h],ymm29  
00007FF69F124020  vshuff64x2  ymm26,ymm30,ymm26,3  
00007FF69F124027  vmovupd     ymmword ptr [rcx+rax+40h],ymm26  
00007FF69F12402F  vshuff64x2  ymm26,ymm27,ymm28,3  
00007FF69F124036  vmovupd     ymmword ptr [rcx+rax+60h],ymm26  
    for (uint32_t i0{}; i0 < vector_count; ++i0) {
00007FF69F12403E  sub         rax,0FFFFFFFFFFFFFF80h  
00007FF69F124042  cmp         rax,3A9800h  
00007FF69F124048  jne         mat4_mul_vec4_cpp+1ACh (07FF69F123EFCh)  
    }
}
00007FF69F12404E  vmovaps     xmm6,xmmword ptr [rsp]  
00007FF69F124053  vmovaps     xmm7,xmmword ptr [rsp+10h]  
00007FF69F124059  vmovaps     xmm8,xmmword ptr [rsp+20h]  
00007FF69F12405F  add         rsp,38h  
00007FF69F124063  vzeroupper  
00007FF69F124066  ret  

这是使用相同的源和目标向量数组时基准测试的调用代码:

int main() {
    SetPriorityClass(GetCurrentProcess(), REALTIME_PRIORITY_CLASS);

    SetThreadPriorityBoost(GetCurrentThread(), false);
    SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_TIME_CRITICAL);

    mat4f_t matrix{ {0, 1, 2, 3,
                    4, 5, 6, 7,
                    8, 9, 10, 11,
                    12, 13, 14, 15} };

    vec4f_t* dst_vector = new vec4f_t[1'000'000]{};
    vec4f_t* src_vector = new vec4f_t[1'000'000]{};

    vec3f_t* dst_vector0 = new vec3f_t[1'000'000]{};
    vec3f_t* src_vector0 = new vec3f_t[1'000'000]{};

    for (uint32_t i0{}; i0 < 1000000; ++i0) {
        src_vector[i0] = vec4f_t{ (float)i0, (float)i0, (float)i0, (float)i0 };
        src_vector0[i0] = vec3f_t{ (float)i0, (float)i0, (float)i0 };
    }

    set_mxcsr0(0b1111'1111'1100'0000);

    for (uint64_t i0{}; i0 < 30000; ++i0) {
        for (uint32_t i1{}; i1 < 16; ++i1) {
            reinterpret_cast<float*>(matrix.element_)[i1] = 1. / i0;
        }
        mat4_mul_vec4_avx512(src_vector, matrix, src_vector, 240000);
        mat4_mul_vec4_cpp(src_vector, matrix, src_vector, 240000);
        mat4_mul_vec3_avx512(src_vector0, matrix, src_vector0, 240000);
        mat4_mul_vec3_cpp(src_vector0, matrix, src_vector0, 240000);
        fps();
    }

    for (uint32_t i0{}; i0 < 1000000; ++i0) {
        std::cout << src_vector0[i0] << std::endl;
        std::cout << src_vector[i0] << std::endl;
    }
}

使用不同的源和目标数组进行测试时,第一个参数 src_vector/src_vector0 将替换为 dst_vector/dst_vector0。

这些是使用相同源和目标数组时的基准测试结果:

Same source/destination arrays, Assembly performs much better

这些是使用不同源和目标阵列时的基准测试结果:

Different source/destination arrays, Assembly performs nearly the same as C++

基准测试是在运行 Windows 11 且配备第 11 代 i7-11850H Tiger Lake CPU 的计算机上使用 Intel C++ Compiler 2024 以及如前所述的 Intel VTune Profiler 创建的。

有几件事我不明白:

  1. 为什么使用不同/相同的源和目标阵列时速度会有所不同? 这种情况和缓存有什么关系吗?

  2. 为什么在使用不同的源数组和目标数组时,C++ 对应版本甚至能在汇编变体附近达到如此好的性能?

感谢您的帮助。我真的很感激。

c++ assembly x86 x86-64 avx512
1个回答
0
投票

您的第一个 VTune 屏幕截图(源 = 目标版本)显示运行 100% 标量操作的 cpp 版本(最右列),与运行 100% 打包 SIMD FP 操作的单独目标情况相比。

自动向量化检查重叠并在输出与任何输入重叠时回退到标量循环是很常见的,而不是制作专门用于 dst=src 的第三个版本(例如,如果仍然符合 C++ 语义,则可能首先加载所有内容)对于这种情况)。除非你的消息来源有特殊情况;也许可以尝试一下。


编译器完全展开了 asm,因此静态代码大小比循环版本大,并使用更复杂的洗牌。我还没有查看你的 asm 或编译器的细节来看看有什么改进的空间,但想必编译器甚至坚持使用默认的 256 位向量也有一些改进的空间。

    

© www.soinside.com 2019 - 2024. All rights reserved.