我有一个 CUDA 内核,基本上如下所示。
// One thread per segment: thread 'index' owns the K-wide slice
// vals[index*K .. index*K+K-1] and zeroes the first nums[index] entries
// whose matching data value reaches crit[index]; other entries keep vals.
__global__ void myOpKernel(double *vals, double *data, int *nums, double *crit, int N, int K) {
int index = blockIdx.x*blockDim.x + threadIdx.x;
if (index >= N) return; // grid may overshoot N
double _crit = crit[index];
// NOTE(review): nums[index] is re-read from global memory on every iteration,
// and adjacent threads touch addresses K elements apart (uncoalesced access).
for (int i=0; i<nums[index]; i++) {
double _res = vals[index*K + i];
if (data[index*K + i] >= _crit) { _res = 0.0; }
vals[index*K + i] = _res;
}
}
该内核根据 data[N*K] 与 crit[N] 的逐元素比较来更新 vals[N*K]:比较只在每个宽度为 K 的段的前 nums[N] 个元素上进行,当 data 大于等于对应的 crit 时,vals 被置为 0;如果 data
小于 crit
,则对应的 vals
保持不变。
例如,正在考虑的数据将如下所示
int N = 3;
int K = 5;
vals[ 0] = 1.0; data[ 0] = 5.1; crit[0] = 5.0; nums[0] = 3;
vals[ 1] = 1.0; data[ 1] = 4.9;
vals[ 2] = 1.0; data[ 2] = 3.0;
vals[ 3] = 0.0; data[ 3] = 0.0;
vals[ 4] = 0.0; data[ 4] = 0.0;
//-----------------------
vals[ 5] = 1.0; data[ 5] = 2.9; crit[1] = 3.0; nums[1] = 2;
vals[ 6] = 1.0; data[ 6] = 3.1;
vals[ 7] = 0.0; data[ 7] = 0.0;
vals[ 8] = 0.0; data[ 8] = 0.0;
vals[ 9] = 0.0; data[ 9] = 0.0;
//-----------------------
vals[10] = 1.0; data[10] = 8.1; crit[2] = 9.0; nums[2] = 5;
vals[11] = 1.0; data[11] = 7.8;
vals[12] = 1.0; data[12] = 9.1;
vals[13] = 1.0; data[13] = 200.;
vals[14] = 1.0; data[14] = -1.0;
我注意到这种操作是Top 3耗时内核之一,正在考虑基于Thrust的加速。
我到目前为止的想法如下所示。它使用 Thrust 示例中提供的
expand
(https://github.com/NVIDIA/thrust/blob/master/examples/expand.cu)。
// Elementwise functor over the flattened [N*K] arrays.
// Tuple slots: 0=vals  1=data  2=i/K (segment index)  3=i%K (local index)
//              4=nums  5=crit
// Returns 0.0 when the element lies in its segment's active prefix
// (i%K < nums) AND data >= crit; otherwise returns vals unchanged —
// exactly what myOpKernel computes.
struct myOp : public thrust::unary_function<thrust::tuple<double,double,int,int,int,double>, double> {
    __host__ __device__
    double operator() (const thrust::tuple<double,double,int,int,int, double> &t) const {
        // BUG FIX: the original first tested the SEGMENT index (slot 2, i/K)
        // against nums, which wrongly left every segment s with s >= nums[s]
        // completely untouched. Only the LOCAL index (slot 3, i%K) must be
        // compared against nums, as the reference kernel does.
        if (thrust::get<3>(t) < thrust::get<4>(t) &&   // inside active prefix
            thrust::get<1>(t) >= thrust::get<5>(t)) {  // data >= crit
            return 0.0;
        }
        return thrust::get<0>(t); // keep vals unchanged
    }
};
// Sketch of the Thrust-based replacement for myOpKernel (not compilable
// as-is: N/K and the value fills are elided).
int main() {
using namespace thrust::placeholders;
thrust::device_vector<double> vals(N*K);
thrust::device_vector<double> data(N*K);
thrust::device_vector<double> crit(N);
thrust::device_vector<int> nums(N);
thrust::device_vector<double> res(N*K);
// ... fill values ...
// NOTE(review): these two N*K temporaries (and the expand() passes below)
// could be avoided by feeding the zip iterator with
//   thrust::make_permutation_iterator(nums.begin(),
//       thrust::make_transform_iterator(thrust::counting_iterator<int>(0), _1/K))
// (and likewise for crit) — same values, no extra storage or expand() pass.
thrust::device_vector<int> nums_expand(N*K);
thrust::device_vector<double> crit_expand(N*K);
// 'expand()' does something like [1,2,3] -> [1,1,1,2,2,2,3,3,3]
expand(thrust::constant_iterator<int>(K),
thrust::constant_iterator<int>(K)+N,
nums.begin(),
nums_expand.begin());
expand(thrust::constant_iterator<int>(K),
thrust::constant_iterator<int>(K)+N,
crit.begin(),
crit_expand.begin());
// Run myOp over every element of the flattened [N*K] range; the zip iterator
// bundles (vals, data, segment index, local index, nums, crit) per element.
thrust::transform(thrust::make_zip_iterator(vals.begin(),
data.begin(),
thrust::make_transform_iterator(thrust::counting_iterator<int>(0), _1/K), // index related to N
thrust::make_transform_iterator(thrust::counting_iterator<int>(0), _1%K), // index related to K
nums_expand.begin(),
crit_expand.begin()),
thrust::make_zip_iterator(vals.end(),
data.end(),
thrust::make_transform_iterator(thrust::counting_iterator<int>(0), _1/K) + N*K,
thrust::make_transform_iterator(thrust::counting_iterator<int>(0), _1%K) + N*K,
nums_expand.end(),
crit_expand.end()),
res.begin(),
myOp());
...
}
当我在数组中使用任意值尝试使用
[N,K]
= [1000,256]
、[10000,256]
、[50000,256]
、[100000,256]
的集合时,性能已经令人满意。
但我想知道我的 Thrust 实现是否还有进一步加速的空间。我目前先把一些每段一个的值 expand 成 N*K 长度,以便在 if
判断中使用,也许可以通过 permutation_iterator
之类的方式避免这一步,但我想不出具体做法。另外,我用 _1/K
和 _1%K
来计算元素的段索引和段内局部索引,也许有更聪明的办法可以避免这些整数除法/取模运算。
至少,从代码整洁(cosmetic)的角度来看,我很乐意将
expand(...)
直接插入到 thrust::transform(...)
中,而不必定义另一个向量,例如 nums_expand
.
欢迎任何改进机会的建议。
用于比较的完整代码
//https://stackoverflow.com/questions/31955505/can-thrust-transform-reduce-work-with-2-arrays%5B/url%5D
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/reduce.h>
#include <thrust/scan.h>
#include <thrust/scatter.h>
#include <thrust/gather.h>
#include <thrust/copy.h>
#include <thrust/transform.h>
#include <thrust/functional.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/discard_iterator.h>
//#include <thrust/execution_policy.h>
#include <cstdio>
#include <iostream>
#include <iomanip>
#include <helper_timer.h>
///////// https://github.com/NVIDIA/thrust/blob/master/examples/expand.cu //////////
// expand(): replicate values according to per-element counts.
// Given counts [first1, last1) and values starting at first2, writes each
// value counts[i] times into 'output', e.g. counts [3,3,3] over [a,b,c]
// produce [a,a,a,b,b,b,c,c,c]. Returns the end of the written range.
// Taken from the Thrust examples (expand.cu).
// NOTE(review): every call allocates two temporary device_vectors and runs
// two scans plus a scatter and a gather — significant overhead if called
// repeatedly in a hot path.
template <typename InputIterator1,
typename InputIterator2,
typename OutputIterator>
OutputIterator expand(InputIterator1 first1,
InputIterator1 last1,
InputIterator2 first2,
OutputIterator output)
{
typedef typename thrust::iterator_difference<InputIterator1>::type difference_type;
difference_type input_size = thrust::distance(first1, last1);
// total output length = sum of all counts
difference_type output_size = thrust::reduce(first1, last1);
// scan the counts to obtain output offsets for each input element
thrust::device_vector<difference_type> output_offsets(input_size, 0);
thrust::exclusive_scan(first1, last1, output_offsets.begin());
// scatter the nonzero counts into their corresponding output positions
thrust::device_vector<difference_type> output_indices(output_size, 0);
thrust::scatter_if
(thrust::counting_iterator<difference_type>(0),
thrust::counting_iterator<difference_type>(input_size),
output_offsets.begin(),
first1,
output_indices.begin());
// compute max-scan over the output indices, filling in the holes
thrust::inclusive_scan
(output_indices.begin(),
output_indices.end(),
output_indices.begin(),
thrust::maximum<difference_type>());
// gather input values according to index array (output = first2[output_indices])
thrust::gather(output_indices.begin(),
output_indices.end(),
first2,
output);
// return output + output_size
thrust::advance(output, output_size);
return output;
}
/////////////////////////////////////////////////////////////////////////////////////
// Prints every element of 'vec' right-aligned in a 5-character field,
// then terminates the line.
template<typename T>
void print_vector(T& vec) {
    for (auto it = vec.begin(); it != vec.end(); ++it) {
        std::cout << std::setw(5) << *it;
    }
    std::cout << std::endl;
}
// Prints the stopwatch's elapsed time (reported in milliseconds),
// converted to seconds and averaged over 'average' runs.
void printSdkTimer(StopWatchInterface **timer, int average) {
    float elapsedMs = (float)sdkGetTimerValue(timer);
    float fAvgSeconds = (float)1.0e-3 * elapsedMs / (float)average;
    printf(" - Elapsed time: %.5f sec \n", fAvgSeconds);
}
// Elementwise functor over the flattened [N*K] arrays.
// Tuple slots: 0=vals  1=data  2=i/K (segment index)  3=i%K (local index)
//              4=nums  5=crit
// Returns 0.0 when the element lies in its segment's active prefix
// (i%K < nums) AND data >= crit; otherwise returns vals unchanged —
// exactly what myOpKernel computes.
struct myOp : public thrust::unary_function<thrust::tuple<double,double,int,int,int,double>, double> {
    __host__ __device__
    double operator() (const thrust::tuple<double,double,int,int,int, double> &t) const {
        // BUG FIX: the original first tested the SEGMENT index (slot 2, i/K)
        // against nums, which wrongly left every segment s with s >= nums[s]
        // completely untouched (with the benchmark's nums=200, segments
        // 200..N-1 were never zeroed). Only the LOCAL index (slot 3, i%K)
        // must be compared against nums, as the reference kernel does.
        if (thrust::get<3>(t) < thrust::get<4>(t) &&   // inside active prefix
            thrust::get<1>(t) >= thrust::get<5>(t)) {  // data >= crit
            return 0.0;
        }
        return thrust::get<0>(t); // keep vals unchanged
    }
};
// One thread per segment: thread 'index' owns the K-wide slice
// vals[index*K .. index*K+K-1] and zeroes the first nums[index] entries
// whose matching data value reaches crit[index]; other entries keep vals.
// Launch with at least N threads total; extra threads exit at the guard.
__global__ void myOpKernel(double *vals, double *data, int *nums, double *crit, int N, int K) {
    int index = blockIdx.x*blockDim.x + threadIdx.x;
    if (index >= N) return;
    double _crit = crit[index];
    // Hoist the trip count: the original re-read nums[index] from global
    // memory every iteration (the compiler cannot hoist it because vals/nums
    // may alias).
    int _num = nums[index];
    // size_t base avoids 32-bit overflow of index*K for very large N*K.
    size_t base = (size_t)index * (size_t)K;
    for (int i = 0; i < _num; i++) {
        // Store only when the value actually changes; the original wrote
        // vals back unconditionally every iteration.
        if (data[base + i] >= _crit) { vals[base + i] = 0.0; }
    }
}
// Benchmark driver: runs the Thrust pipeline and the raw kernel on identical
// inputs, times both, and verifies that their results agree element-wise.
int main(int argc, char **argv) {
    using namespace thrust::placeholders;
    // BUG FIX: guard argv access — the original dereferenced argv[1]/argv[2]
    // without checking argc.
    if (argc < 3) {
        fprintf(stderr, "usage: %s N K\n", argv[0]);
        return 1;
    }
    int N = atoi(argv[1]);
    int K = atoi(argv[2]);
    std::cout << "N " << N << " K " << K << std::endl;

    thrust::device_vector<double> vals(N*K);
    thrust::device_vector<double> data(N*K);
    thrust::device_vector<double> crit(N);
    thrust::device_vector<int>    nums(N);
    thrust::device_vector<double> res(N*K);

    // Arbitrary but deterministic fill pattern for the benchmark.
    for (int i=0; i<N; i++) {
        crit[i] = 101.0; // arbitrary
        nums[i] = 200;   // arbitrary number less than 256
        for (int j=0; j<K; j++) {
            vals[i*K + j] = (double)(i*K + j); // arbitrary
            data[i*K + j] = (double)(i*K + j); // arbitrary
        }
    }

    // Independent copies so the raw kernel sees identical inputs.
    thrust::device_vector<double> vals2 = vals;
    thrust::device_vector<double> data2 = data;
    thrust::device_vector<double> crit2 = crit;
    thrust::device_vector<int>    nums2 = nums;

    StopWatchInterface *timer = NULL;

    //--- 1) thrust
    // Per-segment values replicated to N*K length so myOp can read them
    // elementwise ([1,2,3] -> [1,1,1,2,2,2,3,3,3] for K=3).
    thrust::device_vector<int>    nums_expand(N*K);
    thrust::device_vector<double> crit_expand(N*K);
    expand(thrust::constant_iterator<int>(K),
           thrust::constant_iterator<int>(K)+N,
           nums.begin(),
           nums_expand.begin());
    expand(thrust::constant_iterator<int>(K),
           thrust::constant_iterator<int>(K)+N,
           crit.begin(),
           crit_expand.begin());

    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);
    thrust::transform(thrust::make_zip_iterator(vals.begin(),
                                                data.begin(),
                                                thrust::make_transform_iterator(thrust::counting_iterator<int>(0), _1/K), // for N
                                                thrust::make_transform_iterator(thrust::counting_iterator<int>(0), _1%K), // for K
                                                nums_expand.begin(),
                                                crit_expand.begin()),
                      thrust::make_zip_iterator(vals.end(),
                                                data.end(),
                                                thrust::make_transform_iterator(thrust::counting_iterator<int>(0), _1/K) + N*K,
                                                thrust::make_transform_iterator(thrust::counting_iterator<int>(0), _1%K) + N*K,
                                                nums_expand.end(),
                                                crit_expand.end()),
                      res.begin(),
                      myOp());
    // BUG FIX: synchronize BEFORE stopping the timer — the original stopped
    // the timer first and synchronized afterwards, so it could measure launch
    // overhead instead of the actual device execution time.
    cudaDeviceSynchronize();
    sdkStopTimer(&timer);
    printSdkTimer(&timer,1);
    sdkResetTimer(&timer);

    //--- 2) kernel
    double *raw_vals2 = thrust::raw_pointer_cast(vals2.data());
    double *raw_data2 = thrust::raw_pointer_cast(data2.data());
    double *raw_crit2 = thrust::raw_pointer_cast(crit2.data());
    int    *raw_nums2 = thrust::raw_pointer_cast(nums2.data());
    int Nthreads = 256;
    // BUG FIX: myOpKernel uses one thread per SEGMENT (N threads), not one
    // per element; the original launched N*K threads, so K-1 of every K
    // threads exited immediately at the bounds check.
    int Nblocks = (N - 1) / Nthreads + 1;

    sdkStartTimer(&timer);
    myOpKernel<<<Nblocks,Nthreads>>>(raw_vals2, raw_data2, raw_nums2, raw_crit2, N, K);
    cudaError_t err = cudaGetLastError(); // catch bad launch configuration
    cudaDeviceSynchronize();
    sdkStopTimer(&timer);
    printSdkTimer(&timer,1);
    if (err != cudaSuccess) {
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));
        sdkDeleteTimer(&timer);
        return 1;
    }

    // Sanity check: both implementations must agree element-wise (the
    // original never compared the results despite being "code for comparison").
    thrust::host_vector<double> h_res   = res;
    thrust::host_vector<double> h_vals2 = vals2;
    int mismatch = 0;
    for (int i = 0; i < N*K; i++) {
        if (h_res[i] != h_vals2[i]) mismatch++;
    }
    printf(" - Mismatching elements: %d\n", mismatch);

    sdkDeleteTimer(&timer);
    return 0;
}