SSE如何修复c++中双循环的条件

问题描述 投票:0回答:1

我想用sse来优化c++代码,但是遇到一个函数中有两个双循环的情况,原代码:

double eval_pef(int n, int delta, double mass, double grav, double sep,
                double fcon, double *x, double *y, double *z, double *fx,
                double *fy, double *fz) {
  double pe, rlen, xdiff, ydiff, zdiff, vmag;
  int nx, ny, dx, dy,index;

  pe = 0.0;
  // loop over particles
    for (nx = 0; nx < n; nx++) {
      for (ny = 0; ny < n; ny++) {
        index  =nx * n + ny;
      fx[index] = 0.0;
      fy[index] = 0.0;
      fz[index] = -mass * grav;
      // loop over displacements
      for (dy = MAX(ny - delta, 0); dy < MIN(ny + delta + 1, n); dy++) {
        for (dx = MAX(nx - delta, 0); dx < MIN(nx + delta + 1, n); dx++) {
          // exclude self interaction
          if (nx != dx || ny != dy) {
            // compute reference distance
            rlen =
                sqrt((double)((nx - dx) * (nx - dx) + (ny - dy) * (ny - dy))) *
                sep;
          
            // compute actual distance
            xdiff = x[dx * n + dy] - x[index];
            ydiff = y[dx * n + dy] - y[index];
            zdiff = z[dx * n + dy] - z[index];
            vmag = sqrt(xdiff * xdiff + ydiff * ydiff + zdiff * zdiff);
  
            // potential energy and force
            pe += fcon * (vmag - rlen) * (vmag - rlen);
            fx[index] += fcon * xdiff * (vmag - rlen) / vmag;
            fy[index] += fcon * ydiff * (vmag - rlen) / vmag;
            fz[index] += fcon * zdiff * (vmag - rlen) / vmag;
          }
        }
      }
    }
  }
  return 0.5 * pe;
}

输出:

n = 2 ,δ = 2

我只将ny和dy增加2,在原始版本中,rlen可以是1或1.414,并且有16次迭代。而在sse2版本中,rlen仍然是1。新循环一定有问题。帮忙...如何解决?

这是我的 sse 版本:

double eval_pef(int n, int delta, double mass, double grav, double sep,
                double fcon, double *x, double *y, double *z, double *fx,
                double *fy, double *fz) {
  double pe, rlen, xdiff, ydiff, zdiff, vmag;
  int nx, ny, dx, dy,index;

  __m128d fcon_value = _mm_set1_pd(fcon);
  __m128d sep_value = _mm_set1_pd(sep);
  pe = 0.0;
  // loop over particles
  
    for (nx = 0; nx < n; nx++) {
      for (ny = 0; ny < n; ny+=2) {
        index  =nx * n + ny;
  
        _mm_store_pd(&fx[index],_mm_set1_pd(0.0));
        _mm_store_pd(&fy[index],_mm_set1_pd(0.0));
        _mm_store_pd(&fz[index],_mm_set1_pd(-mass * grav));


        __m128d nx_value = _mm_set1_pd(nx);
        __m128d ny_value = _mm_setr_pd(ny,ny+1);
      // loop over displacements
      for (dx = MAX(nx - delta, 0); dx < MIN(nx + delta + 1, n); dx++) {
        for (dy = MAX(ny - delta, 0); dy < MIN(ny + delta + 1, n); dy+=2) {
        
          // exclude self interaction
        __m128d dx_value = _mm_set1_pd(dx);
        __m128d dy_value = _mm_setr_pd(dy,dy+1);

        // compute reference distance
        __m128d nx_not_equal_dx = _mm_cmpneq_pd(nx_value, dx_value);
        __m128d ny_not_equal_dy = _mm_cmpneq_pd(ny_value, dy_value);
        __m128d compare_result = _mm_or_pd(nx_not_equal_dx, ny_not_equal_dy);

        __m128d nxdx_diff = _mm_sub_pd(nx_value,dx_value);
        __m128d nydy_diff = _mm_sub_pd(ny_value,dy_value);
        __m128d add_diff = _mm_add_pd(_mm_mul_sd(nxdx_diff,nxdx_diff),_mm_mul_sd(nydy_diff,nydy_diff));
        __m128d rlen_value = _mm_mul_pd(_mm_sqrt_pd(add_diff),sep_value);
        // __m128d rlen_value = _mm_set1_pd(sqrt((double)((nx - dx) * (nx - dx) + (ny - dy) * (ny - dy))) *
        //         sep);
          double rlen_arr[2] ;
            _mm_store_pd(rlen_arr,rlen_value);

            std::cout<<"rlen[0]:" << rlen_value[0]<< '\n';
            std::cout<<"rlen[1]:"<<rlen_value[1]<< '\n';
       
   
            // compute actual distance
            __m128d x_value_dxy = _mm_load_pd(&x[dx * n + dy]);
            __m128d y_value_dxy = _mm_load_pd(&y[dx * n + dy]);
            __m128d z_value_dxy = _mm_load_pd(&z[dx * n + dy]);
            __m128d x_value = _mm_load_pd(&x[index]);
            __m128d y_value = _mm_load_pd(&y[index]);
            __m128d z_value = _mm_load_pd(&z[index]);

            __m128d xdiff = _mm_sub_pd(x_value_dxy,x_value);
            __m128d ydiff = _mm_sub_pd(y_value_dxy,y_value);
            __m128d zdiff = _mm_sub_pd(z_value_dxy,z_value);

            __m128d xx_diff = _mm_mul_pd(xdiff,xdiff);
            __m128d yy_diff = _mm_mul_pd(ydiff,ydiff);
            __m128d zz_diff = _mm_mul_pd(zdiff,zdiff);
            __m128d xx_yy_diff = _mm_add_pd(xx_diff,yy_diff);
            __m128d xx_yy_zz_diff =  _mm_add_pd(xx_yy_diff,zz_diff);
            __m128d vmag = _mm_sqrt_pd(xx_yy_zz_diff);
            
            // potential energy and force
            __m128d vrdiff = _mm_sub_pd(vmag,rlen_value);
            __m128d pe_new = _mm_mul_pd(fcon_value,_mm_mul_pd(vrdiff,vrdiff));
            __m128d zero_value = _mm_set1_pd(0.0);
            pe_new = _mm_or_pd(_mm_and_pd(compare_result, pe_new),_mm_andnot_pd(compare_result, zero_value));
            double result_arr[2] ;
            _mm_store_pd(result_arr,pe_new);

            pe+=result_arr[0]+result_arr[1];

            
            __m128d fx_values = _mm_load_pd(&fx[index]);
            __m128d fy_values = _mm_load_pd(&fy[index]);
            __m128d fz_values = _mm_load_pd(&fz[index]);

            __m128d factor = _mm_mul_pd(fcon_value,_mm_div_pd(vrdiff,vmag));
            __m128d fx_new = _mm_add_pd(fx_values,_mm_mul_pd(factor,xdiff));
            __m128d fy_new = _mm_add_pd(fy_values,_mm_mul_pd(factor,ydiff));
            __m128d fz_new = _mm_add_pd(fz_values,_mm_mul_pd(factor,zdiff));

            fx_new = _mm_or_pd(_mm_and_pd(compare_result, fx_new),_mm_andnot_pd(compare_result, fx_values));
            fy_new = _mm_or_pd(_mm_and_pd(compare_result, fy_new),_mm_andnot_pd(compare_result, fy_values));
            fz_new = _mm_or_pd(_mm_and_pd(compare_result, fz_new),_mm_andnot_pd(compare_result, fz_values));

            _mm_store_pd(&fx[index],fx_new);
            _mm_store_pd(&fy[index],fy_new);
            _mm_store_pd(&fz[index],fz_new);

          //}
        }
      }
    }
  }
  return 0.5 * pe;
}

n = 2 , δ = 2 输出:

c++ for-loop simd sse sse2
1个回答
0
投票

很抱歉我的原始代码很难阅读,但我尝试了另一种方法使用 sse2 来优化它。最重要的其实是如何消除if条件分支,而不是sse2的问题。

v_pe = _mm_mul_pd(v_fcon,_mm_mul_pd(v_term, v_term));
_mm_storeu_pd(pe_vals, v_pe);
if (dx == nx && dy == ny){
     pe_vals[0] = 0;
}
if ((dx == nx && dy+1 == ny)|| dy + 1 == MIN(ny + delta + 1, n) ){
     pe_vals[1] = 0;
}
pe += pe_vals[0] + pe_vals[1];
© www.soinside.com 2019 - 2024. All rights reserved.