AVX512-FP16 内在函数在发布模式下失败,在调试模式下工作

问题描述 投票:0回答:1

终于拿到了支持 FP16 的 CPU,想在 VS2022 中学习 AVX512 编程。我写了一个对 fp16 值进行运算的简单 FIR 滤波循环。如果我用 Debug(调试)配置编译,一切正常,得到的结果与 fp32 基本一致,只是多了一些量化误差。如果我用启用了优化的 Release(发布)配置编译,则所有样本的输出都会变成 Inf。更奇怪的是,如果我在 FIR 循环内打印任意一个值(我猜这限制了优化器),FIR 输出就是正确的,尽管从逻辑上讲这对计算不应有任何影响。

这是 FIR 循环:

                __m256h x1_accumh = _mm256_setzero_ph();

                for (int t = 0; t < taps_samp[ii] + 1; t++)
                {
                    if (!usefp16)
                    {

                         \\ommitted fp32 case that works normally
                    }
                    else //fp16 code path
                    {

                        //load into an XMM register and then broadcast 
                        __m128i b1 = _mm_loadu_si16((barrayh + count++));

                        //take lowest 16 bit value and broadcast
                        __m256h bh = _mm256_broadcastw_epi16(b1);

                        //load all 16 channels of int16 RF data
                        __m256i x1 = _mm256_lddqu_si256(((__m256i*)startPtr) + t);

                        //convert all 16 channels to fp16
                        __m256h x1hp = _mm256_cvtepi16_ph(x1);

                        //multiply the fp16 RF data by the fp16 b filter coefficient and store into x1_accumh                   
                        x1_accumh = _mm256_fmadd_ph(x1hp, bh, x1_accumh);

                        if (ii == 0 && i == 0 && t == 0)
                        {
                            print256_f16(x1_accumh);        //uncommenting this fixes decoding???????
                        }

                    }
                }

取消打印语句的注释后,x1_accumh 中会得到预期的滤波结果。将打印语句注释掉后,无论输入如何,所有值始终为 Inf。我检查了反汇编,看起来在启用优化的情况下,_mm256_fmadd_ph 对应指令的操作数是错误的。

包含打印(有效):

                for (int t = 0; t < taps_samp[ii] + 1; t++)
00007FF6AA2E1B1E  xor         edi,edi  
00007FF6AA2E1B20  cdq  
00007FF6AA2E1B21  sub         eax,edx  
00007FF6AA2E1B23  sar         eax,1  
00007FF6AA2E1B25  sub         ecx,eax  
00007FF6AA2E1B27  lea         eax,[r8+1]  
00007FF6AA2E1B2B  shl         ecx,5  
00007FF6AA2E1B2E  movsxd      r12,ecx  
00007FF6AA2E1B31  add         r12,r10  
00007FF6AA2E1B34  vxorps      xmm3,xmm3,xmm3  
00007FF6AA2E1B38  vmovdqu     ymmword ptr [x1_accumh],ymm3  
00007FF6AA2E1B3D  test        eax,eax  
00007FF6AA2E1B3F  jle         main+0BEAh (07FF6AA2E1BEAh)  
00007FF6AA2E1B45  mov         r14,qword ptr [barrayh]  
00007FF6AA2E1B49  nop         dword ptr [rax]  

                {
                    if (!usefp16)
                    {
                    }
                    else //fp16 code path
                    {

                        //load into an XMM register and then broadcast 
                        __m128i b1 = _mm_loadu_si16((barrayh + count++));
00007FF6AA2E1B50  movzx       eax,word ptr [r14+r15*2]  
00007FF6AA2E1B55  inc         r15  
00007FF6AA2E1B58  vmovd       xmm0,eax  

                        //take lowest 16 bit value and broadcast
                        __m256h bh = _mm256_broadcastw_epi16(b1);
00007FF6AA2E1B5C  vpbroadcastw ymm2,xmm0  

                        //load all 16 channels of int16 RF data
                        __m256i x1 = _mm256_lddqu_si256(((__m256i*)startPtr) + t);
00007FF6AA2E1B61  movsxd      rax,edi  
00007FF6AA2E1B64  shl         rax,5  
00007FF6AA2E1B68  vlddqu      ymm0,ymmword ptr [rax+r12]  

                        //convert all 16 channels to fp16
                        __m256h x1hp = _mm256_cvtepi16_ph(x1);
00007FF6AA2E1B6E  vcvtw2ph    ymm1,ymm0  

                        //multiply the fp16 RF data by the fp16 b filter coefficient and store into x1_accumh                   
                        x1_accumh = _mm256_fmadd_ph(x1hp, bh, x1_accumh);
00007FF6AA2E1B74  vfmadd213ph ymm1,ymm2,ymm3    //ymm1 = rf samples, ymm2 = bh coefficient, ymm3 = accumulator, so this does rf samples * bh coefficients + accumulator
00007FF6AA2E1B7A  vmovdqu     ymm3,ymm1  
00007FF6AA2E1B7E  vmovdqu     ymmword ptr [x1_accumh],ymm1  


                        if (ii == 0 && i == 0 && t == 0)
00007FF6AA2E1B83  test        rsi,rsi  
00007FF6AA2E1B86  jne         main+0BD3h (07FF6AA2E1BD3h)  
00007FF6AA2E1B88  test        r9d,r9d  
00007FF6AA2E1B8B  jne         main+0BD3h (07FF6AA2E1BD3h)  
00007FF6AA2E1B8D  test        edi,edi  
00007FF6AA2E1B8F  jne         main+0BD3h (07FF6AA2E1BD3h)  
                        {
                            print256_f16(x1_accumh);        //uncommenting this fixes decoding???????
00007FF6AA2E1B91  vcvtph2ps   zmm0,ymm1  
00007FF6AA2E1B97  vmovups     zmmword ptr [frac_scalar],zmm0  
00007FF6AA2E1B9E  xor         ebx,ebx  
00007FF6AA2E1BA0  vmovss      xmm1,dword ptr [rbp+rbx*4+80h]  
00007FF6AA2E1BA9  vcvtss2sd   xmm1,xmm1,xmm1  
00007FF6AA2E1BAD  vmovq       rdx,xmm1  
00007FF6AA2E1BB2  lea         rcx,[string "%f\n" (07FF6AA2E42C0h)]  
00007FF6AA2E1BB9  vzeroupper  
00007FF6AA2E1BBC  call        printf (07FF6AA2E2C60h)  
00007FF6AA2E1BC1  inc         rbx  
00007FF6AA2E1BC4  cmp         rbx,10h  
00007FF6AA2E1BC8  jl          main+0BA0h (07FF6AA2E1BA0h)  
00007FF6AA2E1BCA  vmovdqu     ymm3,ymmword ptr [x1_accumh]  
00007FF6AA2E1BCF  mov         r9d,dword ptr [rbp]  

                __m256 x1_accum = _mm256_setzero_ps();
                __m256 x2_accum = _mm256_setzero_ps();

                __m256h x1_accumh = _mm256_setzero_ph();

                for (int t = 0; t < taps_samp[ii] + 1; t++)
00007FF6AA2E1BD3  mov         eax,dword ptr taps_samp[rsi*4]  
00007FF6AA2E1BDA  inc         edi  
00007FF6AA2E1BDC  inc         eax  
00007FF6AA2E1BDE  cmp         edi,eax  
00007FF6AA2E1BE0  jl          main+0B50h (07FF6AA2E1B50h)  
00007FF6AA2E1BE6  mov         r14,qword ptr [filtered_data]  
                        }

                    }
                }

这是没有打印的优化版本,不起作用:


            {
                __m128i* startPtr = (__m128i* ) (linedata+ (ii - taps_samp[ii] / 2) * sampleSize);
00007FF64FE11B2B  mov         ecx,r8d  
00007FF64FE11B2E  nop  
                {

                    if (!usefp16)
                    {

                    }
                    else //fp16 code path
                    {

                        //load into an XMM register and then broadcast 
                        __m128i b1 = _mm_loadu_si16((barrayh + count++));
00007FF64FE11B30  movzx       eax,word ptr [r13+rdi*2]  
00007FF64FE11B36  inc         rdi  

                        //take lowest 16 bit value and broadcast
                        __m256h bh = _mm256_broadcastw_epi16(b1);

                        //load all 16 channels of int16 RF data
                        __m256i x1 = _mm256_lddqu_si256(((__m256i*)startPtr) + t);
00007FF64FE11B39  vlddqu      ymm1,ymmword ptr [rdx]  
00007FF64FE11B3D  lea         rdx,[rdx+20h]  
00007FF64FE11B41  vmovd       xmm0,eax  
00007FF64FE11B45  vpbroadcastw ymm2,xmm0  

                        //convert all 16 channels to fp16
                        __m256h x1hp = _mm256_cvtepi16_ph(x1);
00007FF64FE11B4A  vcvtw2ph    ymm0,ymm1  

                        //multiply the fp16 RF data by the fp16 b filter coefficient and store into x1_accumh                   
                        x1_accumh = _mm256_fmadd_ph(x1hp, bh, x1_accumh);
00007FF64FE11B50  vfmadd132ph ymm3,ymm2,ymm0  \\ymm3 =accumulator, ymm2 = b coefficients, ymm0 = sample data, so this does accumulator *sample data + b coefficient

00007FF64FE11B56  sub         rcx,1  
00007FF64FE11B5A  jne         main+0B30h (07FF64FE11B30h)  
00007FF64FE11B5C  vmovdqu     ymmword ptr [x1_accumh],ymm3  

                        /*if (ii == 0 && i == 0 && t == 0)
                        {
                            print256_f16(x1_accumh);        //uncommenting this fixes decoding???????
                        }*/

                    }

我对 x86 汇编非常陌生,但关键的区别似乎在于下面两条 FMA 指令的变化(前一条来自能正常工作的版本,后一条来自出错的版本):
vfmadd213ph ymm1,ymm2,ymm3      //ymm1 = rf samples, ymm2 = b coefficient, ymm3 = accumulator, so this does rf samples * b coefficients + accumulator

vfmadd132ph ymm3,ymm2,ymm0  \\ymm3 =accumulator, ymm2 = b coefficients, ymm0 = sample data, so this does accumulator *sample data + b coefficient

看起来编译器将 vfmadd213ph 更改为 vfmadd132ph,但没有相应地重新排列操作数(b 仍保留在 ymm2 中),因此它不是将 FIR 抽头系数乘以样本再加到累加器上,而是将累加器乘以样本数据,然后再加上 FIR 抽头系数。结果是 Inf,因为这个值很快就会变得巨大。

问题:我是否完全误解了 AVX 编程的工作原理,在代码里犯了某个低级错误?我能做些什么来解决这个问题吗?这是我的第一个 AVX512 程序,所以我可能做了一些非常愚蠢的事情。

visual-studio intrinsics avx512
1个回答
0
投票

该bug已被微软确认,并在VS 17.7.0中修正,可以正确编译上述代码。

© www.soinside.com 2019 - 2024. All rights reserved.