如何向量化 2 个数组的 DotProduct 计算?

问题描述 投票:0回答:1

我有以下方法计算给定数组的点积:

static double DotProduct(uint[] vecA, uint[] vecB)
{
    double dotProduct = 0;
    for (var i = 0; i < vecA.Length; i++)
    {
        dotProduct += (vecA[i] * vecB[i]);
    }
    return dotProduct;
}

我想对此进行改进并利用 SIMD 和 AVX。我尝试对其进行向量化,如下所示,会引发 ArgumentOutOfRange 异常:

static double DotProduct(uint[] vecA, uint[] vecB)
{
    Vector<uint> vec1 = new Vector<uint>(vecA);
    Vector<uint> vec2 = new Vector<uint>(vecB);

    return Vector.Dot(vec1, vec2);
}

我错过了什么?

c# .net simd
1个回答
0
投票

您将需要更多代码来矢量化该函数。这是一种可能的实现,需要 FMA3 运行时支持。

/// <summary>Assuming the numbers don't exceed int.MaxValue, convert uint vector to FP64</summary>
[MethodImpl( MethodImplOptions.AggressiveInlining )]
static unsafe Vector256<double> upcast( Vector128<int> iv )
{
#if DEBUG
    Debug.Assert( Sse41.TestZ( iv, Vector128.Create( unchecked((int)0x80000000) ) ) );
#endif
    return Avx.ConvertToVector256Double( iv );
}

/// <summary>Load 4 uint numbers and upcast to FP64</summary>
[MethodImpl( MethodImplOptions.AggressiveInlining )]
static unsafe Vector256<double> load4( uint* rsi ) =>
    upcast( Vector128.Load( (int*)rsi ) );

/// <summary>Make permutation vector for <c>vpshufb</c> to load remainder from the arrays</summary>
[MethodImpl( MethodImplOptions.AggressiveInlining )]
static unsafe Vector128<byte> makeRemainderPermute( int maskLength )
{
    Debug.Assert( maskLength >= 0 && maskLength < 4 );

    // https://vcsjones.dev/csharp-readonly-span-bytes-static/
    const byte n = 0x80;
    ReadOnlySpan<byte> remainderShuffleBuffer =
    [
        // Identity shuffle i.e. [a,b,c,d] -> [a,b,c,d]
        0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15,
        // [a,b,c,d] -> [ b, c, d, 0 ]
        4,5,6,7, 8,9,10,11, 12,13,14,15, n,n,n,n,
        // [a,b,c,d] -> [ c, d, 0, 0 ]
        8,9,10,11, 12,13,14,15, n,n,n,n, n,n,n,n,
        // [a,b,c,d] -> [ d, 0, 0, 0 ]
        12,13,14,15, n,n,n,n, n,n,n,n, n,n,n,n,
    ];

    fixed( byte* rsi = remainderShuffleBuffer )
        return Vector128.Load( rsi + maskLength * 16 );
}

/// <summary>Same as <see cref="load4"/>, apply permutation on the bytes before upcasting to FP64</summary>
[MethodImpl( MethodImplOptions.AggressiveInlining )]
static unsafe Vector256<double> load4Rem( uint* rsi, Vector128<byte> perm )
{
    Vector128<byte> bytes = Vector128.Load( (byte*)rsi );
    Vector128<int> iv = Ssse3.Shuffle( bytes, perm ).As<byte, int>();
    return upcast( iv );
}

/// <summary>Horizontal sum of FP64 AVX vector</summary>
[MethodImpl( MethodImplOptions.AggressiveInlining )]
static double hadd( Vector256<double> v )
{
    Vector128<double> v2 = Sse2.Add( v.GetUpper(), v.GetLower() );
    return Sse2.UnpackHigh( v2, v2 ).ToScalar() + v2.ToScalar();
}

/// <summary>Compute dot product of two vectors in FP64 precision</summary>
[MethodImpl( MethodImplOptions.NoInlining | MethodImplOptions.AggressiveOptimization )]
public static double dot( uint[] vecA, uint[] vecB )
{
    int length = vecA.Length;
    if( length != vecB.Length )
        throw new ArgumentException();

    if( length >= 4 )
    {
        Vector256<double> acc = Vector256<double>.Zero;
        unsafe
        {
            fixed( uint* aFixt = vecA )
            fixed( uint* bFixt = vecB )
            {
                uint* pa = aFixt;
                uint* pb = bFixt;
                uint* aLast = pa + ( length - 4 );
                uint* bLast = pb + ( length - 4 );

                for( ; pa < aLast; pa += 4, pb += 4 )
                {
                    Vector256<double> a = load4( pa );
                    Vector256<double> b = load4( pb );
                    acc = Fma.MultiplyAdd( a, b, acc );
                }

                // Handle the remainder
                {
                    Vector128<byte> perm = makeRemainderPermute( unchecked((int)( pa - aLast )) );
                    Vector256<double> a = load4Rem( aLast, perm );
                    Vector256<double> b = load4Rem( bLast, perm );
                    acc = Fma.MultiplyAdd( a, b, acc );
                }
            }
            return hadd( acc );
        }
    }
    else
    {
        double dotProduct = 0;
        for( var i = 0; i < vecA.Length; i++ )
            dotProduct += ( vecA[ i ] * vecB[ i ] );
        return dotProduct;
    }
}

通过使用多个累加器可以进一步提高性能。有关 C++ 中 FP32 点积函数的示例,请参阅那个答案

© www.soinside.com 2019 - 2024. All rights reserved.