此代码由.NET Core 3.0 JIT为我的手动矢量化C#代码生成:
00007FFE6C7D2103 vmovdqu xmm5,xmmword ptr [rcx]
00007FFE6C7D2107 vmovdqu xmm6,xmmword ptr [rcx+10h]
00007FFE6C7D210C vmovdqu xmm7,xmmword ptr [rcx+20h]
00007FFE6C7D2111 vmovdqu xmm8,xmmword ptr [rcx+30h]
00007FFE6C7D2116 vpand xmm9,xmm5,xmm0
00007FFE6C7D211A vpand xmm10,xmm6,xmm0
00007FFE6C7D211E vpackusdw xmm9,xmm9,xmm10
00007FFE6C7D2123 vpslldq xmm9,xmm9,1
00007FFE6C7D2129 vpand xmm10,xmm5,xmm1
00007FFE6C7D212D vpand xmm11,xmm6,xmm1
00007FFE6C7D2131 vpackusdw xmm10,xmm10,xmm11
00007FFE6C7D2136 vpsrldq xmm5,xmm5,1
00007FFE6C7D213B vpsrldq xmm6,xmm6,1
00007FFE6C7D2140 vpand xmm5,xmm5,xmm1
00007FFE6C7D2144 vpand xmm6,xmm6,xmm1
00007FFE6C7D2148 vpackusdw xmm5,xmm5,xmm6
var low = brightness( r, g, b, redMul, greenMul, blueMul );
00007FFE6C7D214D vpmulhuw xmm9,xmm9,xmm2
00007FFE6C7D2151 vpmulhuw xmm10,xmm10,xmm3
00007FFE6C7D2155 vpmulhuw xmm5,xmm5,xmm4
00007FFE6C7D2159 vpaddusw xmm6,xmm9,xmm10
00007FFE6C7D215E vpaddusw xmm5,xmm6,xmm5
00007FFE6C7D2162 vpsrlw xmm5,xmm5,8
00007FFE6C7D2167 vpand xmm6,xmm7,xmm0
00007FFE6C7D216B vpand xmm9,xmm8,xmm0
00007FFE6C7D216F vpackusdw xmm6,xmm6,xmm9
00007FFE6C7D2174 vpslldq xmm9,xmm6,1
00007FFE6C7D2179 vpand xmm6,xmm7,xmm1
00007FFE6C7D217D vpand xmm10,xmm8,xmm1
00007FFE6C7D2181 vpackusdw xmm10,xmm6,xmm10
00007FFE6C7D2186 vpsrldq xmm6,xmm7,1
00007FFE6C7D218B vpsrldq xmm7,xmm8,1
00007FFE6C7D2191 vpand xmm6,xmm6,xmm1
00007FFE6C7D2195 vpand xmm7,xmm7,xmm1
00007FFE6C7D2199 vpackusdw xmm6,xmm6,xmm7
var hi = brightness( r, g, b, redMul, greenMul, blueMul );
00007FFE6C7D219E vpmulhuw xmm7,xmm9,xmm2
00007FFE6C7D21A2 vpmulhuw xmm8,xmm10,xmm3
00007FFE6C7D21A6 vpmulhuw xmm6,xmm6,xmm4
00007FFE6C7D21AA vpaddusw xmm7,xmm7,xmm8
00007FFE6C7D21AF vpaddusw xmm6,xmm7,xmm6
00007FFE6C7D21B3 vpsrlw xmm6,xmm6,8
00007FFE6C7D21B8 vpackuswb xmm5,xmm5,xmm6
Sse2.Store( dst, bytes );
00007FFE6C7D21BC vmovdqu xmmword ptr [rdx],xmm5
src += 64;
00007FFE6C7D21C0 add rcx,40h
dst += 16;
00007FFE6C7D21C4 add rdx,10h
while( src < srcEnd )
00007FFE6C7D21C8 cmp rcx,rax
00007FFE6C7D21CB jb 00007FFE6C7D2103
以下代码由VC++ 2015在编译我手动矢量化的C++代码时生成:
{
VecInteger r, g, b;
loadRgb( src, r, g, b );
00007FF735AD11C0 vmovdqu xmm6,xmmword ptr [rcx-10h]
00007FF735AD11C5 vmovdqu xmm7,xmmword ptr [rcx-20h]
loadRgb( src + 2, r, g, b );
00007FF735AD11CA vmovdqu xmm9,xmmword ptr [rcx]
00007FF735AD11CE vmovdqu xmm8,xmmword ptr [rcx+10h]
{
VecInteger r, g, b;
loadRgb( src, r, g, b );
00007FF735AD11D3 vpand xmm3,xmm10,xmm6
00007FF735AD11D7 vpand xmm1,xmm11,xmm6
00007FF735AD11DB vpand xmm0,xmm11,xmm7
00007FF735AD11DF vpackusdw xmm1,xmm0,xmm1
00007FF735AD11E4 vpslldq xmm2,xmm1,1
const auto low = brightness( r, g, b );
00007FF735AD11E9 vpmulhuw xmm4,xmm2,xmm12
00007FF735AD11EE vpand xmm0,xmm10,xmm7
00007FF735AD11F2 vpackusdw xmm1,xmm0,xmm3
const auto low = brightness( r, g, b );
00007FF735AD11F7 vpmulhuw xmm2,xmm1,xmm13
00007FF735AD11FC vpaddusw xmm5,xmm4,xmm2
{
VecInteger r, g, b;
loadRgb( src, r, g, b );
00007FF735AD1200 vpsrldq xmm0,xmm6,1
00007FF735AD1205 vpand xmm3,xmm0,xmm10
00007FF735AD120A vpsrldq xmm1,xmm7,1
00007FF735AD120F vpand xmm2,xmm1,xmm10
00007FF735AD1214 vpackusdw xmm0,xmm2,xmm3
const auto low = brightness( r, g, b );
00007FF735AD1219 vpmulhuw xmm3,xmm0,xmm14
00007FF735AD121E vpaddusw xmm1,xmm5,xmm3
00007FF735AD1222 vpsrlw xmm6,xmm1,8
loadRgb( src + 2, r, g, b );
00007FF735AD1227 vpand xmm2,xmm11,xmm8
00007FF735AD122C vpand xmm0,xmm11,xmm9
00007FF735AD1231 vpackusdw xmm1,xmm0,xmm2
00007FF735AD1236 vpslldq xmm2,xmm1,1
const auto hi = brightness( r, g, b );
00007FF735AD123B vpmulhuw xmm4,xmm2,xmm12
loadRgb( src + 2, r, g, b );
00007FF735AD1240 vpand xmm0,xmm10,xmm9
00007FF735AD1245 vpand xmm3,xmm10,xmm8
00007FF735AD124A vpackusdw xmm1,xmm0,xmm3
const auto hi = brightness( r, g, b );
00007FF735AD124F vpmulhuw xmm2,xmm1,xmm13
00007FF735AD1254 vpaddusw xmm5,xmm4,xmm2
loadRgb( src + 2, r, g, b );
00007FF735AD1258 vpsrldq xmm1,xmm9,1
00007FF735AD125E vpand xmm2,xmm1,xmm10
00007FF735AD1263 vpsrldq xmm0,xmm8,1
00007FF735AD1269 vpand xmm3,xmm0,xmm10
00007FF735AD126E vpackusdw xmm0,xmm2,xmm3
const auto hi = brightness( r, g, b );
00007FF735AD1273 vpmulhuw xmm3,xmm0,xmm14
00007FF735AD1278 vpaddusw xmm1,xmm5,xmm3
00007FF735AD127C vpsrlw xmm2,xmm1,8
src += 4;
00007FF735AD1281 lea rcx,[rcx+40h]
const auto bytes = packus_epi16( low, hi );
00007FF735AD1285 vpackuswb xmm0,xmm6,xmm2
VecInteger* dest = (VecInteger*)destinationBytes;
while( src < srcEnd )
00007FF735AD1289 lea rax,[rcx-20h]
storeu_all( dest, bytes );
00007FF735AD128D vmovdqu xmmword ptr [rdx],xmm0
dest++;
00007FF735AD1291 lea rdx,[rdx+10h]
00007FF735AD1295 cmp rax,r8
00007FF735AD1298 jb Sse::convertToGrayscale+80h (07FF735AD11C0h)
以上两个代码段仅包含程序的主循环。如您所见,它们的指令几乎相同,但是C#版本却比C++版本慢了将近一倍(耗时约为C++的两倍)。
具体来说,用5.11亿(511M)像素进行测试时,在我的PC(AMD Ryzen 5 3600)上,C++代码耗时221 ms,而C#代码耗时410 ms。
为什么?
请参阅Why is C# twice as slow as C++ even though the generated machine code is nearly identical?以获取C#源。
C ++源代码:https://github.com/Const-me/IntelIntrinsics/blob/master/CppDemo/brightness.cpp https://github.com/Const-me/IntelIntrinsics/blob/master/CppDemo/brightness.inl
我把测量次数从1次改为3次(每次仍处理5.11亿像素)之后,测试应用程序打印的结果如下:
#1 391.1885 ms, #2 216.985 ms, #3 235.5549 ms
源代码:https://gist.github.com/Const-me/0f0c283a0b998aa9977550d85fa33958
第2次和第3次测量得到的约220 ms非常接近等效C++代码的性能。因此,C#的SIMD性能毕竟还算不错。