我有一个简单的数学库,它会被链接进一个运行在模拟器硬件(32位RTOS)上的项目,所用的编译器工具链是GCC 5.5的一个变体。项目的主要代码用Matlab编写,但核心数学运算(对数组数据调用cmath函数)已用C重写以提高性能。从Compiler Explorer上看,GCC 5.5(32位)生成的优化代码质量似乎不高(作为对照:Clang trunk,32位)。据我了解,Clang在循环优化方面做得更好。下面是一个示例代码片段:
...
void cfunctionsLog10(unsigned int n, const double* x, double* y) {
int i;
for (i = 0; i < n; i++) {
y[i] = log10(x[i]);
}
}
以下是GCC 5.5为其生成的相应汇编代码:
# GCC 5.5 output (32-bit x86, Intel syntax) for cfunctionsLog10(n, x, y).
# Register roles in this listing:
#   esi = n, ebp = x, edi = y, ebx = loop index i
# All arguments are read from the stack; the loop is a plain one-element-per-
# iteration loop with one library call per element.
cfunctionsLog10(unsigned int, double const*, double*):
# Prologue: save the four registers used below, then reserve 12 bytes.
push ebp
push edi
push esi
push ebx
sub esp, 12
# 4 pushes (16 bytes) + 12 bytes + 4-byte return address = 32, so the three
# arguments sit at esp+32 (n), esp+36 (x) and esp+40 (y).
mov esi, DWORD PTR [esp+32]
mov ebp, DWORD PTR [esp+36]
mov edi, DWORD PTR [esp+40]
# n == 0: skip the loop entirely.
test esi, esi
je .L28
# i = 0
xor ebx, ebx
.L27:
# Build the call frame: 8 bytes of padding plus the 8-byte double x[i],
# pushed as two dwords (upper half first). The padding is presumably there
# to keep the stack 16-byte aligned at the call -- confirm against the ABI.
sub esp, 8
push DWORD PTR [ebp+4+ebx*8]
push DWORD PTR [ebp+0+ebx*8]
# Note the call target is __log10_finite, not log10 as in the source.
call __log10_finite
# The result is popped from the x87 stack and stored as a double: y[i].
fstp QWORD PTR [edi+ebx*8]
add ebx, 1
# Release the 16 bytes (padding + argument) pushed for the call.
add esp, 16
# Repeat until i == n.
cmp ebx, esi
jne .L27
.L28:
# Epilogue: undo the 12-byte reservation and restore the saved registers
# in reverse order of the pushes.
add esp, 12
pop ebx
pop esi
pop edi
pop ebp
ret
而Clang生成的汇编代码如下:
# Clang output (32-bit x86, Intel syntax, AVX encodings) for
# cfunctionsLog10(n, x, y).
# Register roles: ebx = x, edi = y; esi and ebp alternate between n and the
# loop index depending on which loop is running (see below).
# Structure: a runtime overlap check selects either a 4x-unrolled loop or a
# one-element-per-iteration loop. The unrolled loop still issues four separate
# scalar calls to log10 per iteration; only the stores to y are paired into
# two 16-byte writes.
cfunctionsLog10(unsigned int, double const*, double*): # @cfunctionsLog10(unsigned int, double const*, double*)
# Prologue: save four registers and reserve 76 bytes of scratch space.
push ebp
push ebx
push edi
push esi
sub esp, 76
# 16 bytes of pushes + 76 + 4-byte return address = 96, so the arguments sit
# at esp+96 (n), esp+100 (x) and esp+104 (y).
mov esi, dword ptr [esp + 96]
# n == 0: nothing to do.
test esi, esi
je .LBB2_8
mov edi, dword ptr [esp + 104]
mov ebx, dword ptr [esp + 100]
# ebp = 0: starting index for the one-at-a-time loop at .LBB2_7.
xor ebp, ebp
# Fewer than 4 elements: go straight to the one-at-a-time loop.
cmp esi, 4
jb .LBB2_7
# Overlap check between x[0..n) and y[0..n):
# if &x[n] <= y the arrays cannot overlap -> unrolled loop.
lea eax, [ebx + 8*esi]
cmp eax, edi
jbe .LBB2_4
# otherwise, if &y[n] > x the ranges overlap -> one-at-a-time loop.
lea eax, [edi + 8*esi]
cmp eax, ebx
ja .LBB2_7
.LBB2_4:
# Set up the unrolled loop: ebp = n rounded down to a multiple of 4 (the
# trip limit), esi = index i = 0.
mov ebp, esi
xor esi, esi
and ebp, -4
.LBB2_5: # =>This Inner Loop Header: Depth=1
# [esp] is the outgoing argument slot for each call. x[i+2] is passed first;
# x[i] and x[i+1] are copied to stack slots so they survive the calls.
vmovsd xmm0, qword ptr [ebx + 8*esi + 16] # xmm0 = mem[0],zero
vmovsd qword ptr [esp], xmm0
vmovsd xmm0, qword ptr [ebx + 8*esi] # xmm0 = mem[0],zero
vmovsd xmm1, qword ptr [ebx + 8*esi + 8] # xmm1 = mem[0],zero
vmovsd qword ptr [esp + 8], xmm0 # 8-byte Spill
vmovsd qword ptr [esp + 16], xmm1 # 8-byte Spill
# Call 1: log10(x[i+2]); result parked as a 10-byte x87 value at esp+64.
call log10
fstp tbyte ptr [esp + 64] # 10-byte Folded Spill
# Call 2: log10(x[i+1]); result parked as a 10-byte x87 value at esp+16.
vmovsd xmm0, qword ptr [esp + 16] # 8-byte Reload
vmovsd qword ptr [esp], xmm0
call log10
fstp tbyte ptr [esp + 16] # 10-byte Folded Spill
# Call 3: log10(x[i]); x[i+3] is loaded beforehand and kept at esp+8.
vmovsd xmm0, qword ptr [esp + 8] # 8-byte Reload
vmovsd qword ptr [esp], xmm0
vmovsd xmm0, qword ptr [ebx + 8*esi + 24] # xmm0 = mem[0],zero
vmovsd qword ptr [esp + 8], xmm0 # 8-byte Spill
call log10
# Stage x[i+3] as the argument for call 4, then convert the three results
# obtained so far to doubles:
#   esp+56 = log10(x[i]), esp+48 = log10(x[i+1]), esp+40 = log10(x[i+2])
vmovsd xmm0, qword ptr [esp + 8] # 8-byte Reload
vmovsd qword ptr [esp], xmm0
fstp qword ptr [esp + 56]
fld tbyte ptr [esp + 16] # 10-byte Folded Reload
fstp qword ptr [esp + 48]
fld tbyte ptr [esp + 64] # 10-byte Folded Reload
fstp qword ptr [esp + 40]
# Call 4: log10(x[i+3]); result stored as a double at esp+32.
call log10
fstp qword ptr [esp + 32]
# Pack the four doubles into two 16-byte registers:
#   xmm0 = { log10(x[i]),   log10(x[i+1]) }
#   xmm1 = { log10(x[i+2]), log10(x[i+3]) }
vmovsd xmm0, qword ptr [esp + 56] # xmm0 = mem[0],zero
vmovsd xmm1, qword ptr [esp + 40] # xmm1 = mem[0],zero
vmovhps xmm0, xmm0, qword ptr [esp + 48] # xmm0 = xmm0[0,1],mem[0,1]
vmovhps xmm1, xmm1, qword ptr [esp + 32] # xmm1 = xmm1[0,1],mem[0,1]
# Write y[i+2..i+3] and y[i..i+1] with two unaligned 16-byte stores.
vmovups xmmword ptr [edi + 8*esi + 16], xmm1
vmovups xmmword ptr [edi + 8*esi], xmm0
# i += 4; loop until i reaches the rounded-down limit in ebp.
add esi, 4
cmp ebp, esi
jne .LBB2_5
# Reload n; if the unrolled loop covered every element, return. Otherwise
# fall through with ebp = first unprocessed index.
mov esi, dword ptr [esp + 96]
cmp ebp, esi
je .LBB2_8
.LBB2_7: # =>This Inner Loop Header: Depth=1
# One-at-a-time loop (ebp = index, esi = n): y[i] = log10(x[i]).
vmovsd xmm0, qword ptr [ebx + 8*ebp] # xmm0 = mem[0],zero
vmovsd qword ptr [esp], xmm0
call log10
fstp qword ptr [edi + 8*ebp]
inc ebp
cmp esi, ebp
jne .LBB2_7
.LBB2_8:
# Epilogue: release the scratch space and restore the saved registers in
# reverse order of the pushes.
add esp, 76
pop esi
pop edi
pop ebx
pop ebp
ret
由于我无法直接使用Clang,那么用AVX内联函数(intrinsics)重写C源代码是否有价值?我认为大部分性能开销来自cmath函数调用,而这些函数大多没有对应的内联函数(intrinsic)实现。
向量类库(Vector Class Library)提供了常用数学函数的内联向量化版本,其中包括log10。