使用 AVX2 内在函数的单精度 3D 矢量点积累积中的不稳定误差

Question

我编写了一个函数，它在 3D 向量矩阵上运行，计算由红叉表示的每个迭代点的蓝叉表示的向量之间的差异（见图），并累积这些差异的点积自己在一个单精度浮点变量中。

下面是最小的复制器。没有余数循环，因为我总是使用 24 的倍数的大小（参见函数

test_functionality

中的第 2 行）。

#include <iostream>
#include <iomanip>
#include <memory>
#include <random>
#include <limits>

#include <immintrin.h>

//+//////////////////////////////////////////////
// data structures
//+//////////////////////////////////////////////

class v3
{
    float data_[3] = {};
    
public:
    
    float& operator()(const std::size_t index)
    {
        return data_[index];
    }
    
    const float& operator()(const std::size_t index) const
    {
        return data_[index];
    }
    
    v3 operator-(const v3& op) const
    {
        auto ret_val = v3{};
        ret_val(0) = (*this)(0) - op(0);
        ret_val(1) = (*this)(1) - op(1);
        ret_val(2) = (*this)(2) - op(2);
        
        return ret_val;
    }
};

class matrix_v3
{
    std::size_t n1_;
    std::size_t n2_;
    std::unique_ptr<v3> positions_;
    
public:
    
    matrix_v3(const std::size_t& n1, const std::size_t& n2)
        : n1_(n1), n2_(n2), positions_(new v3[n1_ * n2_])
    {
        if (n1_ < 1 || n2_ < 1)
        {
            throw std::invalid_argument("matrix_v3_ctor_invalid_sizes_error");
        }
    }
    
    std::size_t n1() const
    {
        return n1_;
    }
    
    std::size_t n2() const
    {
        return n2_;
    }
    
    v3& position(const std::size_t& i1, const std::size_t& i2)
    {
        if (n1_ <= i1 || n2_ <= i2)
        {
            throw std::invalid_argument("matrix_v3_position_invalid_indices_error");
        }
        return positions_.get()[i1 * n2_ + i2];
    }
    
    const v3& position(const std::size_t& i1, const std::size_t& i2) const
    {
        return const_cast<matrix_v3*>(this)->position(i1, i2);
    }
    
    const float* stream(const std::size_t& i1) const
    {
        if (n1_ <= i1)
        {
            throw std::invalid_argument("matrix_v3_stream_invalid_index_error");
        }
        return reinterpret_cast<float*>(positions_.get() + i1 * n2_);
    }
};

//+//////////////////////////////////////////////
// functions
//+//////////////////////////////////////////////

float auto_vec_variant(const matrix_v3& mat)
{
    // output
    auto accumulator = float{};
    
    // loop
    for (auto i1 = std::size_t{}; i1 < mat.n1() - 1; ++i1)
    {
        for (auto i2 = std::size_t{}; i2 < mat.n2(); ++i2)
        {
            auto lambda = mat.position(i1 + 1, i2) - mat.position(i1, i2);
            accumulator += lambda(0) * lambda(0) + lambda(1) * lambda(1) + lambda(2) * lambda(2);
        }
    }
    
    return accumulator;
}

float intrinsics_variant(const matrix_v3& mat)
{
    // inner loop constants
    static constexpr auto rstream_increment = 24;
    static constexpr auto increment = 8;
    const auto rstream_size_div = mat.n2() * 3; // this will always be divisible by 24 in this test!
    
    // packed accumulators
    auto accumulator_p_0 = __m256{};
    auto accumulator_p_1 = __m256{};
    auto accumulator_p_2 = __m256{};
    
    // let's go!
    for (auto i1 = std::size_t{}; i1 < mat.n1() - 1; ++i1)
    {
        // real valued streams
        const auto* stream1 = mat.stream(i1);
        const auto* stream2 = mat.stream(i1 + 1);
        
        // loop
        for (auto index = std::size_t{}; index < rstream_size_div; index += rstream_increment)
        {
            // load x(i) and x(i)_r
            const auto x_0 = _mm256_loadu_ps(stream1 + index);
            const auto x_r_0 = _mm256_loadu_ps(stream2 + index);
            
            const auto x_1 = _mm256_loadu_ps(stream1 + index + 8);
            const auto x_r_1 = _mm256_loadu_ps(stream2 + index + problem 8);
            
            const auto x_2 = _mm256_loadu_ps(stream1 + index + 16);
            const auto x_r_2 = _mm256_loadu_ps(stream2 + index + 16);
            
            // subtraction and FMA
            const auto lambda_0 = _mm256_sub_ps(x_r_0, x_0);
            const auto lambda_1 = _mm256_sub_ps(x_r_1, x_1);
            const auto lambda_2 = _mm256_sub_ps(x_r_2, x_2);
            
            // accumulate in packed register
            accumulator_p_0 = _mm256_fmadd_ps(lambda_0, lambda_0, accumulator_p_0);
            accumulator_p_1 = _mm256_fmadd_ps(lambda_1, lambda_1, accumulator_p_1);
            accumulator_p_2 = _mm256_fmadd_ps(lambda_2, lambda_2, accumulator_p_2);
        }
    }
    
    // sum up the unrolled accumulators
    const auto accumulator_p_intermediate_0 = _mm256_add_ps(accumulator_p_0, accumulator_p_1);
    const auto accumulator_p = _mm256_add_ps(accumulator_p_intermediate_0, accumulator_p_2);
    
    // horizontal sum of packed accumulator
    const auto high = _mm256_extractf128_ps(accumulator_p, 1);
    const auto low = _mm256_castps256_ps128(accumulator_p);
    const auto sum_128 = _mm_add_ps(low, high);
    const auto sum_128_highx2 = _mm_movehl_ps(sum_128, sum_128);
    const auto sum_64 = _mm_add_ps(sum_128, sum_128_highx2);
    const auto sum_64_1 = _mm_shuffle_ps(sum_64, sum_64, _MM_SHUFFLE(1,1,1,1));
    const auto sum_32 = _mm_add_ps(sum_64, sum_64_1);
    auto accumulator = _mm_cvtss_f32(sum_32);
    
    return accumulator;
}

//+//////////////////////////////////////////////
// tester
//+//////////////////////////////////////////////

bool test_functionality()
{
    const auto L = std::size_t{30};
    const auto N = std::size_t{24*4800}; // at 485 we have 0 error
    auto mat = matrix_v3(L, N);
    
    // init
    auto engine = std::mt19937_64(std::random_device()());
    auto dist = std::uniform_real_distribution<float>(-5.0f, 5.0f);
    for (auto i1 = std::size_t{}; i1 < mat.n1(); ++i1)
    {
        for (auto i2 = std::size_t{}; i2 < mat.n2(); ++i2)
        {
            auto& position = mat.position(i1, i2);
            position(0) = std::floor(dist(engine));
            position(1) = std::floor(dist(engine));
            position(2) = std::floor(dist(engine));
        }
    }
    
    // compute reference
    auto accumulator_ref = auto_vec_variant(mat);
    
    // compute test
    auto accumulator_test = intrinsics_variant(mat);
    
    // test
    const auto error = std::fabs(accumulator_ref - accumulator_test);
    auto test_passed = true;
    if (error > 1E-6)
    {
        std::cout << "Test failed.";
        test_passed = false;
    }
    else
    {
        std::cout << "Test passed.";
    }
    std::cout << " Ref, test, error: " << accumulator_ref << ", " << accumulator_test << ", " << error << "." << std::endl;
    
    return test_passed;
}

int main()
{
    // reminder
    std::cout << "Starting the test now." << std::endl;
    std::cout << std::fixed << std::setprecision(15);
    
    // conduct the test
    if (test_functionality())
    {
        return 0;
    }
    
    return 1;
}

问题：

矩阵的大小为

n1 x n2

。当我将

n2

设置为大于

24*485

的任何值时，与

intrinsics_variant

的结果相比，

auto_vec_variant

的结果有很大的偏差。误差不会随着
n2
逐渐上升，而是呈指数上升。而且，它总是一个整数。

我尝试使用 GNU 编译器 v11.1.0 和 Intel 编译器 v2021.6.0 编译和运行上述测试，并且上述行为在两种情况下都可重现。我正在使用标志“-O3 -mfma -mavx2”。

此外，以下是扩展输出（最小重现器中未显示的调试结构），包括使用累加器求和的比较

简单的C代码
使用 AVX2 内在函数的水平总和 here:

Starting the test now.
Acc 0: 6881928.000000000000000, 6884320.000000000000000, 6883600.000000000000000, 6890775.000000000000000, 6888249.000000000000000, 6910202.000000000000000, 6876891.000000000000000, 6887851.000000000000000
Acc 1: 6885906.000000000000000, 6898564.000000000000000, 6886567.000000000000000, 6890241.000000000000000, 6868671.000000000000000, 6894764.000000000000000, 6888020.000000000000000, 6865337.000000000000000
Acc 2: 6911447.000000000000000, 6919242.000000000000000, 6902696.000000000000000, 6914038.000000000000000, 6877046.000000000000000, 6892467.000000000000000, 6906790.000000000000000, 6883623.000000000000000
Horizontal sum ref, test, difference: 165389200.000000000000000, 165389232.000000000000000, -32.000000000000000
Test failed. Accumulated ref, test, error: 165340880.000000000000000, 165389232.000000000000000, 48352.000000000000000.

Starting the test now.
Acc 0: 6871382.000000000000000, 6870801.000000000000000, 6899278.000000000000000, 6895130.000000000000000, 6898295.000000000000000, 6888785.000000000000000, 6869410.000000000000000, 6909934.000000000000000
Acc 1: 6901459.000000000000000, 6898180.000000000000000, 6911085.000000000000000, 6894278.000000000000000, 6896286.000000000000000, 6883586.000000000000000, 6916996.000000000000000, 6896169.000000000000000
Acc 2: 6917097.000000000000000, 6875026.000000000000000, 6884218.000000000000000, 6895881.000000000000000, 6887871.000000000000000, 6911542.000000000000000, 6892179.000000000000000, 6921255.000000000000000
Horizontal sum ref, test, difference: 165486112.000000000000000, 165486112.000000000000000, 0.000000000000000
Test failed. Accumulated ref, test, error: 165447472.000000000000000, 165486112.000000000000000, 38640.000000000000000.

附加信息： 我有一份几乎相同的双精度代码副本，其中没有出现问题，即测试总是通过。 因此，这似乎只是单精度的问题。

问题：为什么会这样，我们如何解决？

当被问及时，我总是很乐意提供更多信息。非常感谢您的想法和建议。

使用 AVX2 内在函数的单精度 3D 矢量点积累积中的不稳定误差

问题描述投票：0回答：0

最新问题

使用 AVX2 内在函数的单精度 3D 矢量点积累积中的不稳定误差

问题描述 投票：0回答：0

最新问题

问题描述投票：0回答：0