我想学习 SIMD 编程。现在我的代码中出现了一个有趣的现象。我只是想测量代码的运行时间:对一个给定大小的数组应用某个基本函数,先使用用 SIMD 指令编写的版本,再使用普通写法的版本,然后比较这两个实现相同功能的版本所花费的时间。但是,当数组大小为 8 时,性能提升为 1.3 倍;大小为 512 时为 3 倍;大小为 1000 时为 4 倍;大小为 4000 时为 5 倍。我不明白为什么数组大小增加时,性能提升也会随之增加。
我的代码:
/// Minimal stopwatch on top of a std::chrono-style clock.
///
/// @tparam clock_t Clock type exposing a nested `time_point` type and a
///                 static `now()` function (e.g. std::chrono::steady_clock).
template <typename clock_t>
class Timer {
public:
    using mcs_t = std::chrono::microseconds;
    using ns_t = std::chrono::nanoseconds;

    /// Constructs the timer; the out-of-class definition starts it immediately.
    Timer();

    /// Re-arms the timer by recording the current time as the new start point.
    void restart();

    /// Elapsed time since the last restart, in seconds.
    double toc();

    /// Elapsed time since the last restart, expressed as a whole number of
    /// `duration_t` ticks (e.g. `toc<ns_t>()` yields nanoseconds).
    ///
    /// This overload is what `toc()` and the callers in this file invoke as
    /// `toc<mcs_t>()` / `toc<ns_t>()`; it was previously not declared at all.
    ///
    /// @tparam duration_t A std::chrono::duration specialization.
    /// @return Tick count, truncated toward zero by duration_cast.
    template <typename duration_t>
    typename duration_t::rep toc() {
        m_stop = clock_t::now();
        return std::chrono::duration_cast<duration_t>(m_stop - m_start).count();
    }

private:
    typename clock_t::time_point m_start;  ///< Time of the last restart().
    typename clock_t::time_point m_stop;   ///< Time of the last toc<...>() call.
};
using TimerHc = Timer<std::chrono::high_resolution_clock>;
/// Constructs a timer that is already running: the start point is taken
/// at construction time via restart().
template<typename clock_t>
Timer<clock_t>::Timer()
{
    this->restart();
}
/// Resets the start point of the measurement to the clock's current time.
template<typename clock_t>
void Timer<clock_t>::restart()
{
    const typename clock_t::time_point now = clock_t::now();
    m_start = now;
}
/// Returns the elapsed time in seconds.
///
/// The value is obtained from the microsecond reading (`toc<mcs_t>()`) and
/// scaled down, so the result has microsecond granularity.
template<typename clock_t>
double Timer<clock_t>::toc()
{
    constexpr double kMicrosecondsPerSecond = 1000000.0;
    const auto elapsed_us = toc<mcs_t>();
    return static_cast<double>(elapsed_us) / kMicrosecondsPerSecond;
}
/// Fills the first `size` elements of `v` with the ramp 0.0, 0.1, 0.2, ...
///
/// @param v    Destination buffer; must have room for at least `size` doubles.
/// @param size Number of elements to write. Zero is allowed and writes nothing.
void init(double *v, size_t size) {
    // Index with size_t: an int counter is a signed/unsigned mismatch against
    // `size` and would overflow (UB) for buffers larger than INT_MAX elements.
    for (size_t i = 0; i < size; ++i) {
        v[i] = static_cast<double>(i) / 10.0;
    }
}
void sub_func_sse(double *v, int start_idx) {
__m256d vector = _mm256_loadu_pd(v + start_idx);
__m256d base = _mm256_set_pd(2.0, 2.0, 2.0, 2.0);
for (int i = 0; i < 128; ++i) {
vector = _mm256_mul_pd(vector, base);
}
_mm256_storeu_pd(v + start_idx, vector);
}
/// Scalar workload: doubles `item` in place 128 times.
///
/// @param item Value to scale; updated in place.
void sub_func(double &item) {
    int doublings_left = 128;
    while (doublings_left > 0) {
        item *= 2.0;
        --doublings_left;
    }
}
/// Benchmark driver: times the vectorised and the scalar workload over the
/// same array and prints the average time per repetition in nanoseconds.
int main() {
    const size_t size = 8;
    // One sub_func_sse call processes a single 256-bit register, i.e. 4 doubles.
    // Stepping by 8 (as before) left half of the array untouched, so the SIMD
    // path did half the work of the scalar path.
    const size_t simd_width = 4;

    double *v = new double[size];
    init(v, size);

    // Repeat the measurement and report the average, which is more stable
    // than a single run.
    // NOTE(review): with a very small `size` the timed region is only a few
    // instructions, so the cost of reading the clock likely dominates the
    // result — confirm with larger sizes before drawing conclusions.
    const int num_repeat = 2000;

    double total_time_sse = 0;
    for (int p = 0; p < num_repeat; ++p) {
        init(v, size);
        gs::TimerHc t;
        t.restart();
        size_t i = 0;
        // Full 4-wide chunks; the bound keeps the load/store inside the array.
        for (; i + simd_width <= size; i += simd_width) {
            sub_func_sse(v, static_cast<int>(i));
        }
        // Scalar tail for sizes that are not a multiple of the SIMD width.
        for (; i < size; ++i) {
            sub_func(v[i]);
        }
        total_time_sse += t.toc<gs::TimerHc::ns_t>();
    }

    double total_time = 0;
    for (int p = 0; p < num_repeat; ++p) {
        init(v, size);
        gs::TimerHc t;
        t.restart();
        for (size_t i = 0; i < size; ++i) {
            sub_func(v[i]);
        }
        total_time += t.toc<gs::TimerHc::ns_t>();
    }

    cout << "time using sse = " << total_time_sse / num_repeat << endl <<
        "time without sse = " << total_time / num_repeat << endl;

    delete[] v;  // release the buffer allocated above
    system("pause");
}