来自这里.
我编写了一个函数,它在 3D 向量矩阵上运行,计算由红叉表示的每个迭代点的蓝叉表示的向量之间的差异(见图),并累积这些差异的点积自己在一个单精度浮点变量中。
下面是最小的复制器。没有余数循环,因为我总是使用 24 的倍数的大小(参见函数
test_functionality
中的第 2 行)。
#include <iostream>
#include <iomanip>
#include <memory>
#include <random>
#include <limits>
#include <immintrin.h>
//+//////////////////////////////////////////////
// data structures
//+//////////////////////////////////////////////
class v3
{
float data_[3] = {};
public:
float& operator()(const std::size_t index)
{
return data_[index];
}
const float& operator()(const std::size_t index) const
{
return data_[index];
}
v3 operator-(const v3& op) const
{
auto ret_val = v3{};
ret_val(0) = (*this)(0) - op(0);
ret_val(1) = (*this)(1) - op(1);
ret_val(2) = (*this)(2) - op(2);
return ret_val;
}
};
class matrix_v3
{
std::size_t n1_;
std::size_t n2_;
std::unique_ptr<v3> positions_;
public:
matrix_v3(const std::size_t& n1, const std::size_t& n2)
: n1_(n1), n2_(n2), positions_(new v3[n1_ * n2_])
{
if (n1_ < 1 || n2_ < 1)
{
throw std::invalid_argument("matrix_v3_ctor_invalid_sizes_error");
}
}
std::size_t n1() const
{
return n1_;
}
std::size_t n2() const
{
return n2_;
}
v3& position(const std::size_t& i1, const std::size_t& i2)
{
if (n1_ <= i1 || n2_ <= i2)
{
throw std::invalid_argument("matrix_v3_position_invalid_indices_error");
}
return positions_.get()[i1 * n2_ + i2];
}
const v3& position(const std::size_t& i1, const std::size_t& i2) const
{
return const_cast<matrix_v3*>(this)->position(i1, i2);
}
const float* stream(const std::size_t& i1) const
{
if (n1_ <= i1)
{
throw std::invalid_argument("matrix_v3_stream_invalid_index_error");
}
return reinterpret_cast<float*>(positions_.get() + i1 * n2_);
}
};
//+//////////////////////////////////////////////
// functions
//+//////////////////////////////////////////////
float auto_vec_variant(const matrix_v3& mat)
{
// output
auto accumulator = float{};
// loop
for (auto i1 = std::size_t{}; i1 < mat.n1() - 1; ++i1)
{
for (auto i2 = std::size_t{}; i2 < mat.n2(); ++i2)
{
auto lambda = mat.position(i1 + 1, i2) - mat.position(i1, i2);
accumulator += lambda(0) * lambda(0) + lambda(1) * lambda(1) + lambda(2) * lambda(2);
}
}
return accumulator;
}
float intrinsics_variant(const matrix_v3& mat)
{
// inner loop constants
static constexpr auto rstream_increment = 24;
static constexpr auto increment = 8;
const auto rstream_size_div = mat.n2() * 3; // this will always be divisible by 24 in this test!
// packed accumulators
auto accumulator_p_0 = __m256{};
auto accumulator_p_1 = __m256{};
auto accumulator_p_2 = __m256{};
// let's go!
for (auto i1 = std::size_t{}; i1 < mat.n1() - 1; ++i1)
{
// real valued streams
const auto* stream1 = mat.stream(i1);
const auto* stream2 = mat.stream(i1 + 1);
// loop
for (auto index = std::size_t{}; index < rstream_size_div; index += rstream_increment)
{
// load x(i) and x(i)_r
const auto x_0 = _mm256_loadu_ps(stream1 + index);
const auto x_r_0 = _mm256_loadu_ps(stream2 + index);
const auto x_1 = _mm256_loadu_ps(stream1 + index + 8);
const auto x_r_1 = _mm256_loadu_ps(stream2 + index + problem 8);
const auto x_2 = _mm256_loadu_ps(stream1 + index + 16);
const auto x_r_2 = _mm256_loadu_ps(stream2 + index + 16);
// subtraction and FMA
const auto lambda_0 = _mm256_sub_ps(x_r_0, x_0);
const auto lambda_1 = _mm256_sub_ps(x_r_1, x_1);
const auto lambda_2 = _mm256_sub_ps(x_r_2, x_2);
// accumulate in packed register
accumulator_p_0 = _mm256_fmadd_ps(lambda_0, lambda_0, accumulator_p_0);
accumulator_p_1 = _mm256_fmadd_ps(lambda_1, lambda_1, accumulator_p_1);
accumulator_p_2 = _mm256_fmadd_ps(lambda_2, lambda_2, accumulator_p_2);
}
}
// sum up the unrolled accumulators
const auto accumulator_p_intermediate_0 = _mm256_add_ps(accumulator_p_0, accumulator_p_1);
const auto accumulator_p = _mm256_add_ps(accumulator_p_intermediate_0, accumulator_p_2);
// horizontal sum of packed accumulator
const auto high = _mm256_extractf128_ps(accumulator_p, 1);
const auto low = _mm256_castps256_ps128(accumulator_p);
const auto sum_128 = _mm_add_ps(low, high);
const auto sum_128_highx2 = _mm_movehl_ps(sum_128, sum_128);
const auto sum_64 = _mm_add_ps(sum_128, sum_128_highx2);
const auto sum_64_1 = _mm_shuffle_ps(sum_64, sum_64, _MM_SHUFFLE(1,1,1,1));
const auto sum_32 = _mm_add_ps(sum_64, sum_64_1);
auto accumulator = _mm_cvtss_f32(sum_32);
return accumulator;
}
//+//////////////////////////////////////////////
// tester
//+//////////////////////////////////////////////
bool test_functionality()
{
const auto L = std::size_t{30};
const auto N = std::size_t{24*4800}; // at 485 we have 0 error
auto mat = matrix_v3(L, N);
// init
auto engine = std::mt19937_64(std::random_device()());
auto dist = std::uniform_real_distribution<float>(-5.0f, 5.0f);
for (auto i1 = std::size_t{}; i1 < mat.n1(); ++i1)
{
for (auto i2 = std::size_t{}; i2 < mat.n2(); ++i2)
{
auto& position = mat.position(i1, i2);
position(0) = std::floor(dist(engine));
position(1) = std::floor(dist(engine));
position(2) = std::floor(dist(engine));
}
}
// compute reference
auto accumulator_ref = auto_vec_variant(mat);
// compute test
auto accumulator_test = intrinsics_variant(mat);
// test
const auto error = std::fabs(accumulator_ref - accumulator_test);
auto test_passed = true;
if (error > 1E-6)
{
std::cout << "Test failed.";
test_passed = false;
}
else
{
std::cout << "Test passed.";
}
std::cout << " Ref, test, error: " << accumulator_ref << ", " << accumulator_test << ", " << error << "." << std::endl;
return test_passed;
}
int main()
{
// reminder
std::cout << "Starting the test now." << std::endl;
std::cout << std::fixed << std::setprecision(15);
// conduct the test
if (test_functionality())
{
return 0;
}
return 1;
}
问题:
矩阵的大小为
n1 x n2
。当我将n2
设置为大于24*485
的任何值时,与intrinsics_variant
的结果相比,auto_vec_variant
的结果有很大的偏差。 误差不会随着n2
逐渐上升,而是呈指数上升。而且,它总是一个整数。
我尝试使用 GNU 编译器 v11.1.0 和 Intel 编译器 v2021.6.0 编译和运行上述测试,并且上述行为在两种情况下都可重现。我正在使用标志“-O3 -mfma -mavx2”。
此外,以下是扩展输出(最小重现器中未显示的调试结构),包括使用累加器求和的比较
Starting the test now.
Acc 0: 6881928.000000000000000, 6884320.000000000000000, 6883600.000000000000000, 6890775.000000000000000, 6888249.000000000000000, 6910202.000000000000000, 6876891.000000000000000, 6887851.000000000000000
Acc 1: 6885906.000000000000000, 6898564.000000000000000, 6886567.000000000000000, 6890241.000000000000000, 6868671.000000000000000, 6894764.000000000000000, 6888020.000000000000000, 6865337.000000000000000
Acc 2: 6911447.000000000000000, 6919242.000000000000000, 6902696.000000000000000, 6914038.000000000000000, 6877046.000000000000000, 6892467.000000000000000, 6906790.000000000000000, 6883623.000000000000000
Horizontal sum ref, test, difference: 165389200.000000000000000, 165389232.000000000000000, -32.000000000000000
Test failed. Accumulated ref, test, error: 165340880.000000000000000, 165389232.000000000000000, 48352.000000000000000.
Starting the test now.
Acc 0: 6871382.000000000000000, 6870801.000000000000000, 6899278.000000000000000, 6895130.000000000000000, 6898295.000000000000000, 6888785.000000000000000, 6869410.000000000000000, 6909934.000000000000000
Acc 1: 6901459.000000000000000, 6898180.000000000000000, 6911085.000000000000000, 6894278.000000000000000, 6896286.000000000000000, 6883586.000000000000000, 6916996.000000000000000, 6896169.000000000000000
Acc 2: 6917097.000000000000000, 6875026.000000000000000, 6884218.000000000000000, 6895881.000000000000000, 6887871.000000000000000, 6911542.000000000000000, 6892179.000000000000000, 6921255.000000000000000
Horizontal sum ref, test, difference: 165486112.000000000000000, 165486112.000000000000000, 0.000000000000000
Test failed. Accumulated ref, test, error: 165447472.000000000000000, 165486112.000000000000000, 38640.000000000000000.
附加信息: 我有一份几乎相同的双精度代码副本,其中没有出现问题,即测试总是通过。 因此,这似乎只是单精度的问题。
问题:为什么会这样,我们如何解决?
当被问及时,我总是很乐意提供更多信息。非常感谢您的想法和建议。