我有以下方法计算给定数组的点积:
static double DotProduct(uint[] vecA, uint[] vecB)
{
double dotProduct = 0;
for (var i = 0; i < vecA.Length; i++)
{
dotProduct += (vecA[i] * vecB[i]);
}
return dotProduct;
}
我想对此进行改进并利用 SIMD 和 AVX。我尝试对其进行向量化,如下所示,会引发 ArgumentOutOfRange 异常:
static double DotProduct(uint[] vecA, uint[] vecB)
{
Vector<uint> vec1 = new Vector<uint>(vecA);
Vector<uint> vec2 = new Vector<uint>(vecB);
return Vector.Dot(vec1, vec2);
}
我错过了什么?
您将需要更多代码来矢量化该函数。这是一种可能的实现,需要 FMA3 运行时支持。
/// <summary>Assuming the numbers don't exceed int.MaxValue, convert uint vector to FP64</summary>
[MethodImpl( MethodImplOptions.AggressiveInlining )]
static unsafe Vector256<double> upcast( Vector128<int> iv )
{
#if DEBUG
Debug.Assert( Sse41.TestZ( iv, Vector128.Create( unchecked((int)0x80000000) ) ) );
#endif
return Avx.ConvertToVector256Double( iv );
}
/// <summary>Load 4 uint numbers and upcast to FP64</summary>
[MethodImpl( MethodImplOptions.AggressiveInlining )]
static unsafe Vector256<double> load4( uint* rsi ) =>
upcast( Vector128.Load( (int*)rsi ) );
/// <summary>Make permutation vector for <c>vpshufb</c> to load remainder from the arrays</summary>
[MethodImpl( MethodImplOptions.AggressiveInlining )]
static unsafe Vector128<byte> makeRemainderPermute( int maskLength )
{
Debug.Assert( maskLength >= 0 && maskLength < 4 );
// https://vcsjones.dev/csharp-readonly-span-bytes-static/
const byte n = 0x80;
ReadOnlySpan<byte> remainderShuffleBuffer =
[
// Identity shuffle i.e. [a,b,c,d] -> [a,b,c,d]
0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15,
// [a,b,c,d] -> [ b, c, d, 0 ]
4,5,6,7, 8,9,10,11, 12,13,14,15, n,n,n,n,
// [a,b,c,d] -> [ c, d, 0, 0 ]
8,9,10,11, 12,13,14,15, n,n,n,n, n,n,n,n,
// [a,b,c,d] -> [ d, 0, 0, 0 ]
12,13,14,15, n,n,n,n, n,n,n,n, n,n,n,n,
];
fixed( byte* rsi = remainderShuffleBuffer )
return Vector128.Load( rsi + maskLength * 16 );
}
/// <summary>Same as <see cref="load4"/>, apply permutation on the bytes before upcasting to FP64</summary>
[MethodImpl( MethodImplOptions.AggressiveInlining )]
static unsafe Vector256<double> load4Rem( uint* rsi, Vector128<byte> perm )
{
Vector128<byte> bytes = Vector128.Load( (byte*)rsi );
Vector128<int> iv = Ssse3.Shuffle( bytes, perm ).As<byte, int>();
return upcast( iv );
}
/// <summary>Horizontal sum of FP64 AVX vector</summary>
[MethodImpl( MethodImplOptions.AggressiveInlining )]
static double hadd( Vector256<double> v )
{
Vector128<double> v2 = Sse2.Add( v.GetUpper(), v.GetLower() );
return Sse2.UnpackHigh( v2, v2 ).ToScalar() + v2.ToScalar();
}
/// <summary>Compute dot product of two vectors in FP64 precision</summary>
[MethodImpl( MethodImplOptions.NoInlining | MethodImplOptions.AggressiveOptimization )]
public static double dot( uint[] vecA, uint[] vecB )
{
int length = vecA.Length;
if( length != vecB.Length )
throw new ArgumentException();
if( length >= 4 )
{
Vector256<double> acc = Vector256<double>.Zero;
unsafe
{
fixed( uint* aFixt = vecA )
fixed( uint* bFixt = vecB )
{
uint* pa = aFixt;
uint* pb = bFixt;
uint* aLast = pa + ( length - 4 );
uint* bLast = pb + ( length - 4 );
for( ; pa < aLast; pa += 4, pb += 4 )
{
Vector256<double> a = load4( pa );
Vector256<double> b = load4( pb );
acc = Fma.MultiplyAdd( a, b, acc );
}
// Handle the remainder
{
Vector128<byte> perm = makeRemainderPermute( unchecked((int)( pa - aLast )) );
Vector256<double> a = load4Rem( aLast, perm );
Vector256<double> b = load4Rem( bLast, perm );
acc = Fma.MultiplyAdd( a, b, acc );
}
}
return hadd( acc );
}
}
else
{
double dotProduct = 0;
for( var i = 0; i < vecA.Length; i++ )
dotProduct += ( vecA[ i ] * vecB[ i ] );
return dotProduct;
}
}
通过使用多个累加器可以进一步提高性能。有关 C++ 中 FP32 点积函数的示例,请参阅那个答案。