如何正确进行基准测试

问题描述 投票:0回答:1

我正在尝试根据同一计算的查找表对计算进行基准测试,但我怀疑在发布时,编译器报告的数据不准确。

我将基准测试作为调试,然后作为发布

  • 单个随机点的查找表
  • for 循环中的整个查找表
  • 直接计算
  • 直接计算整个表格 .

'''

Lookup table took 0.1099 seconds to execute

Lookup entire table took 663.521 seconds to execute

Calculation took 0.2698 seconds to execute

Calculation of entire table took 1804.6 seconds to execute

'''

然后我再次使用 -O2 优化对所有版本进行基准测试

Lookup table took 0.0005 seconds to execute

Lookup entire table took 0.0003 seconds to execute

Calculation took 0.098 seconds to execute

Calculation of entire table took 422.066 seconds to execute

调试时,查找表快了 4/5 倍,但在发布时,快了数百倍,所以我怀疑我错误地对直接计算进行了基准测试。 这是直接计算:

igc__blade1 = blade1 >> 1u;
igc__blade1 ^= igc__blade1 >> 1u;
igc__blade1 ^= igc__blade1 >> 2u;
igc__blade1 ^= igc__blade1 >> 4u;

blade2__masked = igc__blade1 & blade2;
//__builtin_popcount replaces commented text
parity__2 = __builtin_popcount(blade2__masked & 0b11111110) & 1u;
/*parity__2 = blade2__masked;
    parity__2 ^= parity__2 >> 1u;
    parity__2 ^= parity__2 >> 2u;
    parity__2 ^= parity__2 >> 4u;
    parity__2 = parity__2 & 2u;*/

sign_of_product = 1 - 2 * parity__2;
//sign_of_product = 1 - parity__2;

这是查找表调用

sign_of_product = lookup_table_sign[row][col];

**据我了解,从 L1 缓存获取查找表大约需要 10 个时钟周期,而我的整个代码使用 15/20 条指令(

XOR
AND
,算术),(我替换了一半带有
popcount
指令)。

所以直接计算,尤其是使用-O2优化编译后,最多应该花费两倍的查表时间,而不是数百次。

我做错了什么,但我不知道是什么。

我怀疑编译器理解for循环仅以最终计算结束,并剪切整个嵌套循环,仅获取最后一个值,但我无法知道发生了什么**

这是完整代码

#include <algorithm>
#include <bitset>
#include <chrono>
#include <cmath>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <stdint.h>
#ifdef _MSC_VER
#include <intrin.h>
#define __builtin_popcount __popcnt
#endif

using namespace std;

long long REPETITIONS = 10000;

int64_t inverseGrayCode(int64_t n)
{
    n ^= n >> 1;
    n ^= n >> 2;
    n ^= n >> 4;
    n ^= n >> 8;  // unnecessary for 8 bit unsigned integers
    n ^= n >> 16; // unnecessary for 16 bit unsigned integers
    n ^= n >> 32; // unnecessary for 32 bit unsigned integers
    return n;
}

const uint8_t BITS = 6;
const uint8_t SIZE = 0b1 << (BITS);
uint8_t blades[SIZE];
uint8_t igc_blades[SIZE];
uint8_t abs_blades_product[SIZE][SIZE];
uint8_t blades_masked[SIZE][SIZE];
uint8_t parity_2[SIZE][SIZE];
int8_t sign[SIZE][SIZE];
int8_t (*lookup_table_sign)[SIZE];

#pragma region lexicographical
uint64_t _64INDIVIDUAL_BIT[SIZE];
double WEIGHT[SIZE];
void initialize_constants()
{
    uint64_t one = 1;
    for (uint64_t x = 0; x < 64; x++)
    {
        _64INDIVIDUAL_BIT[x] = one << x;
        WEIGHT[x] = pow(2.0, -(1 + (double)x));
    }
}
// Function to calculate the lexicographical weight of an integer
double lexicographical_weight(uint64_t x)
{
    double weight = 0.0;
    for (int i = 0; i < 64; i++)
    {
        weight += ((x & _64INDIVIDUAL_BIT[i]) > 0) * WEIGHT[i];
    }
    return __builtin_popcount(x) + 1 - weight;
}
bool compareByWeight(int element1, int element2)
{
    return lexicographical_weight(element1) < lexicographical_weight(element2);
}

#pragma endregion lexicographical

int8_t (*calc_blade_products(uint8_t blades[]))[SIZE]
{
    for (int i = 0; i < SIZE; i++)
    {
        igc_blades[i] = inverseGrayCode(blades[i] >> 1);
    }

    for (int row = 0; row < SIZE; row++)
    {

        for (int col = 0; col < SIZE; col++)
        {
            abs_blades_product[row][col] = blades[row] ^ blades[col];
            blades_masked[row][col] = igc_blades[row] & blades[col];
            // aux[i][j] = inverseGrayCode(blades_masked[i][j]);
            parity_2[row][col] = inverseGrayCode(blades_masked[row][col]) & 2u;
            sign[row][col] = 1 - parity_2[row][col];
        }
    }
    return sign;
}

uint8_t igc__blade1;
uint8_t blade2__masked;
uint8_t parity__2;
int8_t sign_of_product;
uint8_t blade1;
uint8_t blade2;

int main()
{
    initialize_constants();

#pragma region generate_blades_in_lexicographical_order

    for (unsigned int i = 0; i < SIZE; i++)
    {
        blades[i] = i << 1;
        // fprintf(stdout, "blade %d: %8sb\n", i, bitset<8>(blades[i]).to_string().c_str());
    }
    blades[0] = 1;

    int arraySize = sizeof(blades) / sizeof(blades[0]);
    sort(blades, blades + arraySize, compareByWeight);

#pragma endregion generate_blades_in_lexicographical_order

    // Create lookup_table
    lookup_table_sign = calc_blade_products(blades);

#pragma region benchmark
    uint8_t blade1 = rand() % 256;
    uint8_t blade2 = rand() % 256;

    // benchmark single point with table lookup
    auto start = std::chrono::high_resolution_clock::now();

    for (int rept = 0; rept < REPETITIONS; rept++)
    {
        sign_of_product = lookup_table_sign[blade1][blade2];
    }
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double, std::milli> elapsed_miliseconds = end - start;

    cout << endl;
    cout << "Lookup table took " << elapsed_miliseconds.count() << " seconds to execute" << endl;

    start = std::chrono::high_resolution_clock::now();

    // benchmark entire table with table lookup
    for (int rept = 0; rept < REPETITIONS; rept++)
    {
        for (int row = 0; row < SIZE; row++)
        {
            for (int col = 0; col < SIZE; col++)
            {
                sign_of_product = lookup_table_sign[row][col];
            }
        }
    }
    end = std::chrono::high_resolution_clock::now();
    elapsed_miliseconds = end - start;

    cout << endl;
    cout << "Lookup entire table took " << elapsed_miliseconds.count() << " seconds to execute" << endl;

    // benchmark single point with direct calculation
    start = std::chrono::high_resolution_clock::now();
    for (int rept = 0; rept < REPETITIONS; rept++)
    {
        igc__blade1 = blade1 >> 1u;
        igc__blade1 ^= igc__blade1 >> 1u;
        igc__blade1 ^= igc__blade1 >> 2u;
        igc__blade1 ^= igc__blade1 >> 4u;

        blade2__masked = igc__blade1 & blade2;
        //__builtin_popcount replaces commented text
        parity__2 = __builtin_popcount(blade2__masked & 0b11111110) & 1u;
        /*parity__2 = blade2__masked;
         parity__2 ^= parity__2 >> 1u;
         parity__2 ^= parity__2 >> 2u;
         parity__2 ^= parity__2 >> 4u;
         parity__2 = parity__2 & 2u;*/

        sign_of_product = 1 - 2 * parity__2;
        // sign_of_product = 1 - parity__2;
    }
    end = std::chrono::high_resolution_clock::now();
    elapsed_miliseconds = end - start;

    cout << endl;
    cout << "Calculation took " << elapsed_miliseconds.count() << " seconds to execute" << endl;

    // benchmark entire table with direct calculation
    start = std::chrono::high_resolution_clock::now();
    for (int rept = 0; rept < REPETITIONS; rept++)
    {
        for (int blade1 = 0; blade1 < SIZE; blade1++)
        {
            for (int blade2 = 0; blade2 < SIZE; blade2++)
            {
                igc__blade1 = blade1 >> 1u;
                igc__blade1 ^= igc__blade1 >> 1u;
                igc__blade1 ^= igc__blade1 >> 2u;
                igc__blade1 ^= igc__blade1 >> 4u;

                blade2__masked = igc__blade1 & blade2;
                //__builtin_popcount replaces commented text
                parity__2 = __builtin_popcount(blade2__masked & 0b11111110) & 1u;
                /*parity__2 = blade2__masked;
                    parity__2 ^= parity__2 >> 1u;
                    parity__2 ^= parity__2 >> 2u;
                    parity__2 ^= parity__2 >> 4u;
                    parity__2 = parity__2 & 2u;*/

                sign_of_product = 1 - 2 * parity__2;
                // sign_of_product = 1 - parity__2;
            }
        }
    }
    end = std::chrono::high_resolution_clock::now();
    elapsed_miliseconds = end - start;

    cout << endl;
    cout << "Calculation of entire table took " << elapsed_miliseconds.count() << " seconds to execute" << endl;

#pragma endregion benchmark

    return 0;
}
c++ visual-studio optimization benchmarking
1个回答
0
投票

在“Peter Cordes”评论之后,我将获得计算结果的变量标记为

volatile
,以阻止编译器忽略它们并跳转到循环的最后一个值。

它得到了更合理的结果,其中计算成本大约是表查找的两倍,尽管我无法知道幕后发生了什么,以及两种选择之间 CPU 时间的真正差异是什么。

volatile int8_t sign_of_product;

''' (通过矿石重复释放构建输出)

Lookup table took 0.2369 seconds to execute

Lookup entire table took 1526.54 seconds to execute

Calculation took 0.5109 seconds to execute

Calculation of entire table took 3309.76 seconds to execute

'''

© www.soinside.com 2019 - 2024. All rights reserved.