我终于拿到了支持 FP16 的 CPU,想在 VS2022 中学习 AVX512 编程。我针对 fp16 值编写了一个简单的 FIR 滤波循环。如果以 Debug 配置编译,一切正常,得到的结果与 fp32 相近,只相差一些量化误差。如果以启用优化的 Release 配置编译,所有样本的输出都会变成 Inf。更奇怪的是,只要我在 FIR 循环内打印任意一个值(我猜这限制了优化器),FIR 输出就是正确的,尽管从逻辑上讲打印对计算没有任何影响。
这是 FIR 循环:
// FIR accumulation for one output position: for each tap t, multiply the
// 16 input lanes by one filter coefficient and add the product into a running
// fp16 accumulator. The loop runs taps_samp[ii] + 1 times.
// NOTE(review): i, ii, count, barrayh, startPtr, usefp16 and taps_samp are
// defined outside this snippet; their types are assumed from usage — confirm.
__m256h x1_accumh = _mm256_setzero_ph();
for (int t = 0; t < taps_samp[ii] + 1; t++)
{
if (!usefp16)
{
// NOTE(review): the next line is the author's placeholder for the omitted
// fp32 path (which works normally); it is not valid C as written.
\\ommitted fp32 case that works normally
}
else //fp16 code path
{
// Load one 16-bit filter coefficient into the low word of an XMM register;
// count is post-incremented so each tap consumes the next coefficient.
__m128i b1 = _mm_loadu_si16((barrayh + count++));
// Broadcast that lowest 16-bit value to every 16-bit lane of a 256-bit register.
__m256h bh = _mm256_broadcastw_epi16(b1);
// Unaligned 256-bit load of the t-th 32-byte group from startPtr
// (all 16 channels of int16 RF data, per the author).
__m256i x1 = _mm256_lddqu_si256(((__m256i*)startPtr) + t);
// Convert the 16 int16 lanes to fp16.
__m256h x1hp = _mm256_cvtepi16_ph(x1);
// Fused multiply-add: x1_accumh = x1hp * bh + x1_accumh
// (RF data times the filter coefficient, accumulated across taps).
x1_accumh = _mm256_fmadd_ph(x1hp, bh, x1_accumh);
// Debug print, reached only on the very first iteration (ii, i and t all 0).
if (ii == 0 && i == 0 && t == 0)
{
print256_f16(x1_accumh); // with this call present, the optimized build produces correct output; without it, every lane ends up Inf
}
}
}
保留打印语句(即不把它注释掉)时,x1_accumh 中得到的是预期的滤波结果。把打印语句注释掉后,无论输入是什么,所有值始终为 Inf。我检查了反汇编,看起来在启用优化的情况下,_mm256_fmadd_ph 的操作数顺序是错误的。
包含打印语句的版本(结果正确):
for (int t = 0; t < taps_samp[ii] + 1; t++)
00007FF6AA2E1B1E xor edi,edi
00007FF6AA2E1B20 cdq
00007FF6AA2E1B21 sub eax,edx
00007FF6AA2E1B23 sar eax,1
00007FF6AA2E1B25 sub ecx,eax
00007FF6AA2E1B27 lea eax,[r8+1]
00007FF6AA2E1B2B shl ecx,5
00007FF6AA2E1B2E movsxd r12,ecx
00007FF6AA2E1B31 add r12,r10
00007FF6AA2E1B34 vxorps xmm3,xmm3,xmm3
00007FF6AA2E1B38 vmovdqu ymmword ptr [x1_accumh],ymm3
00007FF6AA2E1B3D test eax,eax
00007FF6AA2E1B3F jle main+0BEAh (07FF6AA2E1BEAh)
00007FF6AA2E1B45 mov r14,qword ptr [barrayh]
00007FF6AA2E1B49 nop dword ptr [rax]
{
if (!usefp16)
{
}
else //fp16 code path
{
//load into an XMM register and then broadcast
__m128i b1 = _mm_loadu_si16((barrayh + count++));
00007FF6AA2E1B50 movzx eax,word ptr [r14+r15*2]
00007FF6AA2E1B55 inc r15
00007FF6AA2E1B58 vmovd xmm0,eax
//take lowest 16 bit value and broadcast
__m256h bh = _mm256_broadcastw_epi16(b1);
00007FF6AA2E1B5C vpbroadcastw ymm2,xmm0
//load all 16 channels of int16 RF data
__m256i x1 = _mm256_lddqu_si256(((__m256i*)startPtr) + t);
00007FF6AA2E1B61 movsxd rax,edi
00007FF6AA2E1B64 shl rax,5
00007FF6AA2E1B68 vlddqu ymm0,ymmword ptr [rax+r12]
//convert all 16 channels to fp16
__m256h x1hp = _mm256_cvtepi16_ph(x1);
00007FF6AA2E1B6E vcvtw2ph ymm1,ymm0
//multiply the fp16 RF data by the fp16 b filter coefficient and store into x1_accumh
x1_accumh = _mm256_fmadd_ph(x1hp, bh, x1_accumh);
00007FF6AA2E1B74 vfmadd213ph ymm1,ymm2,ymm3 //ymm1 = rf samples, ymm2 = bh coefficient, ymm3 = accumulator, so this does rf samples * bh coefficients + accumulator
00007FF6AA2E1B7A vmovdqu ymm3,ymm1
00007FF6AA2E1B7E vmovdqu ymmword ptr [x1_accumh],ymm1
if (ii == 0 && i == 0 && t == 0)
00007FF6AA2E1B83 test rsi,rsi
00007FF6AA2E1B86 jne main+0BD3h (07FF6AA2E1BD3h)
00007FF6AA2E1B88 test r9d,r9d
00007FF6AA2E1B8B jne main+0BD3h (07FF6AA2E1BD3h)
00007FF6AA2E1B8D test edi,edi
00007FF6AA2E1B8F jne main+0BD3h (07FF6AA2E1BD3h)
{
print256_f16(x1_accumh); //uncommenting this fixes decoding???????
00007FF6AA2E1B91 vcvtph2ps zmm0,ymm1
00007FF6AA2E1B97 vmovups zmmword ptr [frac_scalar],zmm0
00007FF6AA2E1B9E xor ebx,ebx
00007FF6AA2E1BA0 vmovss xmm1,dword ptr [rbp+rbx*4+80h]
00007FF6AA2E1BA9 vcvtss2sd xmm1,xmm1,xmm1
00007FF6AA2E1BAD vmovq rdx,xmm1
00007FF6AA2E1BB2 lea rcx,[string "%f\n" (07FF6AA2E42C0h)]
00007FF6AA2E1BB9 vzeroupper
00007FF6AA2E1BBC call printf (07FF6AA2E2C60h)
00007FF6AA2E1BC1 inc rbx
00007FF6AA2E1BC4 cmp rbx,10h
00007FF6AA2E1BC8 jl main+0BA0h (07FF6AA2E1BA0h)
00007FF6AA2E1BCA vmovdqu ymm3,ymmword ptr [x1_accumh]
00007FF6AA2E1BCF mov r9d,dword ptr [rbp]
__m256 x1_accum = _mm256_setzero_ps();
__m256 x2_accum = _mm256_setzero_ps();
__m256h x1_accumh = _mm256_setzero_ph();
for (int t = 0; t < taps_samp[ii] + 1; t++)
00007FF6AA2E1BD3 mov eax,dword ptr taps_samp[rsi*4]
00007FF6AA2E1BDA inc edi
00007FF6AA2E1BDC inc eax
00007FF6AA2E1BDE cmp edi,eax
00007FF6AA2E1BE0 jl main+0B50h (07FF6AA2E1B50h)
00007FF6AA2E1BE6 mov r14,qword ptr [filtered_data]
}
}
}
这是不含打印语句的优化版本,结果不正确:
{
__m128i* startPtr = (__m128i* ) (linedata+ (ii - taps_samp[ii] / 2) * sampleSize);
00007FF64FE11B2B mov ecx,r8d
00007FF64FE11B2E nop
{
if (!usefp16)
{
}
else //fp16 code path
{
//load into an XMM register and then broadcast
__m128i b1 = _mm_loadu_si16((barrayh + count++));
00007FF64FE11B30 movzx eax,word ptr [r13+rdi*2]
00007FF64FE11B36 inc rdi
//take lowest 16 bit value and broadcast
__m256h bh = _mm256_broadcastw_epi16(b1);
//load all 16 channels of int16 RF data
__m256i x1 = _mm256_lddqu_si256(((__m256i*)startPtr) + t);
00007FF64FE11B39 vlddqu ymm1,ymmword ptr [rdx]
00007FF64FE11B3D lea rdx,[rdx+20h]
00007FF64FE11B41 vmovd xmm0,eax
00007FF64FE11B45 vpbroadcastw ymm2,xmm0
//convert all 16 channels to fp16
__m256h x1hp = _mm256_cvtepi16_ph(x1);
00007FF64FE11B4A vcvtw2ph ymm0,ymm1
//multiply the fp16 RF data by the fp16 b filter coefficient and store into x1_accumh
x1_accumh = _mm256_fmadd_ph(x1hp, bh, x1_accumh);
00007FF64FE11B50 vfmadd132ph ymm3,ymm2,ymm0 \\ymm3 =accumulator, ymm2 = b coefficients, ymm0 = sample data, so this does accumulator *sample data + b coefficient
00007FF64FE11B56 sub rcx,1
00007FF64FE11B5A jne main+0B30h (07FF64FE11B30h)
00007FF64FE11B5C vmovdqu ymmword ptr [x1_accumh],ymm3
/*if (ii == 0 && i == 0 && t == 0)
{
print256_f16(x1_accumh); //uncommenting this fixes decoding???????
}*/
}
我对 x86 汇编非常陌生,但关键的区别似乎在于 FMA 指令从
vfmadd213ph ymm1,ymm2,ymm3 //ymm1 = rf samples, ymm2 = b coefficient, ymm3 = accumulator, so this does rf samples * b coefficients + accumulator
变成了
vfmadd132ph ymm3,ymm2,ymm0 \\ymm3 =accumulator, ymm2 = b coefficients, ymm0 = sample data, so this does accumulator *sample data + b coefficient
看起来编译器把 vfmadd213ph 换成了 vfmadd132ph,却没有相应地调整操作数的顺序(b 仍然留在 ymm2 中)。因此它不再是把 FIR 抽头系数乘以样本再加到累加器上,而是把累加器乘以样本数据,再加上 FIR 抽头系数。这个值很快就会变得非常大,所以结果是 Inf。
问题:我是否完全误解了 AVX 编程的工作原理,在代码里犯了某个低级错误?我能做些什么来解决这个问题吗?这是我的第一个 AVX512 程序,所以我很可能做了一些非常愚蠢的事情。
该 bug 已被微软确认,并在 VS 17.7.0 中修复;使用该版本可以正确编译上述代码。