我想用sse来优化c++代码,但是遇到一个函数中有两个双循环的情况,原代码:
double eval_pef(int n, int delta, double mass, double grav, double sep,
double fcon, double *x, double *y, double *z, double *fx,
double *fy, double *fz) {
double pe, rlen, xdiff, ydiff, zdiff, vmag;
int nx, ny, dx, dy,index;
pe = 0.0;
// loop over particles
for (nx = 0; nx < n; nx++) {
for (ny = 0; ny < n; ny++) {
index =nx * n + ny;
fx[index] = 0.0;
fy[index] = 0.0;
fz[index] = -mass * grav;
// loop over displacements
for (dy = MAX(ny - delta, 0); dy < MIN(ny + delta + 1, n); dy++) {
for (dx = MAX(nx - delta, 0); dx < MIN(nx + delta + 1, n); dx++) {
// exclude self interaction
if (nx != dx || ny != dy) {
// compute reference distance
rlen =
sqrt((double)((nx - dx) * (nx - dx) + (ny - dy) * (ny - dy))) *
sep;
// compute actual distance
xdiff = x[dx * n + dy] - x[index];
ydiff = y[dx * n + dy] - y[index];
zdiff = z[dx * n + dy] - z[index];
vmag = sqrt(xdiff * xdiff + ydiff * ydiff + zdiff * zdiff);
// potential energy and force
pe += fcon * (vmag - rlen) * (vmag - rlen);
fx[index] += fcon * xdiff * (vmag - rlen) / vmag;
fy[index] += fcon * ydiff * (vmag - rlen) / vmag;
fz[index] += fcon * zdiff * (vmag - rlen) / vmag;
}
}
}
}
}
return 0.5 * pe;
}
输出:
我只将ny和dy增加2,在原始版本中,rlen可以是1或1.414,并且有16次迭代。而在sse2版本中,rlen仍然是1。新循环一定有问题。帮忙...如何解决?
这是我的 sse 版本:
double eval_pef(int n, int delta, double mass, double grav, double sep,
double fcon, double *x, double *y, double *z, double *fx,
double *fy, double *fz) {
double pe, rlen, xdiff, ydiff, zdiff, vmag;
int nx, ny, dx, dy,index;
__m128d fcon_value = _mm_set1_pd(fcon);
__m128d sep_value = _mm_set1_pd(sep);
pe = 0.0;
// loop over particles
for (nx = 0; nx < n; nx++) {
for (ny = 0; ny < n; ny+=2) {
index =nx * n + ny;
_mm_store_pd(&fx[index],_mm_set1_pd(0.0));
_mm_store_pd(&fy[index],_mm_set1_pd(0.0));
_mm_store_pd(&fz[index],_mm_set1_pd(-mass * grav));
__m128d nx_value = _mm_set1_pd(nx);
__m128d ny_value = _mm_setr_pd(ny,ny+1);
// loop over displacements
for (dx = MAX(nx - delta, 0); dx < MIN(nx + delta + 1, n); dx++) {
for (dy = MAX(ny - delta, 0); dy < MIN(ny + delta + 1, n); dy+=2) {
// exclude self interaction
__m128d dx_value = _mm_set1_pd(dx);
__m128d dy_value = _mm_setr_pd(dy,dy+1);
// compute reference distance
__m128d nx_not_equal_dx = _mm_cmpneq_pd(nx_value, dx_value);
__m128d ny_not_equal_dy = _mm_cmpneq_pd(ny_value, dy_value);
__m128d compare_result = _mm_or_pd(nx_not_equal_dx, ny_not_equal_dy);
__m128d nxdx_diff = _mm_sub_pd(nx_value,dx_value);
__m128d nydy_diff = _mm_sub_pd(ny_value,dy_value);
__m128d add_diff = _mm_add_pd(_mm_mul_sd(nxdx_diff,nxdx_diff),_mm_mul_sd(nydy_diff,nydy_diff));
__m128d rlen_value = _mm_mul_pd(_mm_sqrt_pd(add_diff),sep_value);
// __m128d rlen_value = _mm_set1_pd(sqrt((double)((nx - dx) * (nx - dx) + (ny - dy) * (ny - dy))) *
// sep);
double rlen_arr[2] ;
_mm_store_pd(rlen_arr,rlen_value);
std::cout<<"rlen[0]:" << rlen_value[0]<< '\n';
std::cout<<"rlen[1]:"<<rlen_value[1]<< '\n';
// compute actual distance
__m128d x_value_dxy = _mm_load_pd(&x[dx * n + dy]);
__m128d y_value_dxy = _mm_load_pd(&y[dx * n + dy]);
__m128d z_value_dxy = _mm_load_pd(&z[dx * n + dy]);
__m128d x_value = _mm_load_pd(&x[index]);
__m128d y_value = _mm_load_pd(&y[index]);
__m128d z_value = _mm_load_pd(&z[index]);
__m128d xdiff = _mm_sub_pd(x_value_dxy,x_value);
__m128d ydiff = _mm_sub_pd(y_value_dxy,y_value);
__m128d zdiff = _mm_sub_pd(z_value_dxy,z_value);
__m128d xx_diff = _mm_mul_pd(xdiff,xdiff);
__m128d yy_diff = _mm_mul_pd(ydiff,ydiff);
__m128d zz_diff = _mm_mul_pd(zdiff,zdiff);
__m128d xx_yy_diff = _mm_add_pd(xx_diff,yy_diff);
__m128d xx_yy_zz_diff = _mm_add_pd(xx_yy_diff,zz_diff);
__m128d vmag = _mm_sqrt_pd(xx_yy_zz_diff);
// potential energy and force
__m128d vrdiff = _mm_sub_pd(vmag,rlen_value);
__m128d pe_new = _mm_mul_pd(fcon_value,_mm_mul_pd(vrdiff,vrdiff));
__m128d zero_value = _mm_set1_pd(0.0);
pe_new = _mm_or_pd(_mm_and_pd(compare_result, pe_new),_mm_andnot_pd(compare_result, zero_value));
double result_arr[2] ;
_mm_store_pd(result_arr,pe_new);
pe+=result_arr[0]+result_arr[1];
__m128d fx_values = _mm_load_pd(&fx[index]);
__m128d fy_values = _mm_load_pd(&fy[index]);
__m128d fz_values = _mm_load_pd(&fz[index]);
__m128d factor = _mm_mul_pd(fcon_value,_mm_div_pd(vrdiff,vmag));
__m128d fx_new = _mm_add_pd(fx_values,_mm_mul_pd(factor,xdiff));
__m128d fy_new = _mm_add_pd(fy_values,_mm_mul_pd(factor,ydiff));
__m128d fz_new = _mm_add_pd(fz_values,_mm_mul_pd(factor,zdiff));
fx_new = _mm_or_pd(_mm_and_pd(compare_result, fx_new),_mm_andnot_pd(compare_result, fx_values));
fy_new = _mm_or_pd(_mm_and_pd(compare_result, fy_new),_mm_andnot_pd(compare_result, fy_values));
fz_new = _mm_or_pd(_mm_and_pd(compare_result, fz_new),_mm_andnot_pd(compare_result, fz_values));
_mm_store_pd(&fx[index],fx_new);
_mm_store_pd(&fy[index],fy_new);
_mm_store_pd(&fz[index],fz_new);
//}
}
}
}
}
return 0.5 * pe;
}
很抱歉我的原始代码很难阅读,但我尝试了另一种方法使用 sse2 来优化它。最重要的其实是如何消除if条件分支,而不是sse2的问题。
v_pe = _mm_mul_pd(v_fcon,_mm_mul_pd(v_term, v_term));
_mm_storeu_pd(pe_vals, v_pe);
if (dx == nx && dy == ny){
pe_vals[0] = 0;
}
if ((dx == nx && dy+1 == ny)|| dy + 1 == MIN(ny + delta + 1, n) ){
pe_vals[1] = 0;
}
pe += pe_vals[0] + pe_vals[1];