我有一个 CUDA 内核,基本上如下所示。
// One thread per segment: thread 'index' owns the K-wide slice
// vals[index*K .. index*K+K-1] and zeroes the first nums[index] entries
// whose matching data value reaches crit[index]; other entries keep vals.
__global__ void myOpKernel(double *vals, double *data, int *nums, double *crit, int N, int K) {
int index = blockIdx.x*blockDim.x + threadIdx.x;
if (index >= N) return; // grid may overshoot N
double _crit = crit[index];
// NOTE(review): nums[index] is re-read from global memory on every iteration,
// and adjacent threads touch addresses K elements apart (uncoalesced access).
for (int i=0; i<nums[index]; i++) {
double _res = vals[index*K + i];
if (data[index*K + i] >= _crit) { _res = 0.0; }
vals[index*K + i] = _res;
}
}
该内核根据 data[N*K] 与 crit[N] 的逐元素比较来更新 vals[N*K]:比较只在每个宽度为 K 的段的前 nums[N] 个元素上进行,当 data 大于等于对应的 crit 时,vals 被置为 0;如果 data
小于 crit
,则对应的 vals
保持不变。
例如,正在考虑的数据将如下所示
int N = 3;
int K = 5;
vals[ 0] = 1.0; data[ 0] = 5.1; crit[0] = 5.0; nums[0] = 3;
vals[ 1] = 1.0; data[ 1] = 4.9;
vals[ 2] = 1.0; data[ 2] = 3.0;
vals[ 3] = 0.0; data[ 3] = 0.0;
vals[ 4] = 0.0; data[ 4] = 0.0;
//-----------------------
vals[ 5] = 1.0; data[ 5] = 2.9; crit[1] = 3.0; nums[1] = 2;
vals[ 6] = 1.0; data[ 6] = 3.1;
vals[ 7] = 0.0; data[ 7] = 0.0;
vals[ 8] = 0.0; data[ 8] = 0.0;
vals[ 9] = 0.0; data[ 9] = 0.0;
//-----------------------
vals[10] = 1.0; data[10] = 8.1; crit[2] = 9.0; nums[2] = 5;
vals[11] = 1.0; data[11] = 7.8;
vals[12] = 1.0; data[12] = 9.1;
vals[13] = 1.0; data[13] = 200.;
vals[14] = 1.0; data[14] = -1.0;
我注意到这种操作是Top 3耗时内核之一,正在考虑基于Thrust的加速。
我到目前为止的想法如下所示。它使用 Thrust 示例中提供的
expand
(https://github.com/NVIDIA/thrust/blob/master/examples/expand.cu)。
// Elementwise functor over the flattened [N*K] arrays.
// Tuple slots: 0=vals  1=data  2=i/K (segment index)  3=i%K (local index)
//              4=nums  5=crit
// Returns 0.0 when the element lies in its segment's active prefix
// (i%K < nums) AND data >= crit; otherwise returns vals unchanged —
// exactly what myOpKernel computes.
struct myOp : public thrust::unary_function<thrust::tuple<double,double,int,int,int,double>, double> {
    __host__ __device__
    double operator() (const thrust::tuple<double,double,int,int,int, double> &t) const {
        // BUG FIX: the original first tested the SEGMENT index (slot 2, i/K)
        // against nums, which wrongly left every segment s with s >= nums[s]
        // completely untouched. Only the LOCAL index (slot 3, i%K) must be
        // compared against nums, as the reference kernel does.
        if (thrust::get<3>(t) < thrust::get<4>(t) &&   // inside active prefix
            thrust::get<1>(t) >= thrust::get<5>(t)) {  // data >= crit
            return 0.0;
        }
        return thrust::get<0>(t); // keep vals unchanged
    }
};
// Sketch of the Thrust-based replacement for myOpKernel (not compilable
// as-is: N/K and the value fills are elided).
int main() {
using namespace thrust::placeholders;
thrust::device_vector<double> vals(N*K);
thrust::device_vector<double> data(N*K);
thrust::device_vector<double> crit(N);
thrust::device_vector<int> nums(N);
thrust::device_vector<double> res(N*K);
// ... fill values ...
// NOTE(review): these two N*K temporaries (and the expand() passes below)
// could be avoided by feeding the zip iterator with
//   thrust::make_permutation_iterator(nums.begin(),
//       thrust::make_transform_iterator(thrust::counting_iterator<int>(0), _1/K))
// (and likewise for crit) — same values, no extra storage or expand() pass.
thrust::device_vector<int> nums_expand(N*K);
thrust::device_vector<double> crit_expand(N*K);
// 'expand()' does something like [1,2,3] -> [1,1,1,2,2,2,3,3,3]
expand(thrust::constant_iterator<int>(K),
thrust::constant_iterator<int>(K)+N,
nums.begin(),
nums_expand.begin());
expand(thrust::constant_iterator<int>(K),
thrust::constant_iterator<int>(K)+N,
crit.begin(),
crit_expand.begin());
// Run myOp over every element of the flattened [N*K] range; the zip iterator
// bundles (vals, data, segment index, local index, nums, crit) per element.
thrust::transform(thrust::make_zip_iterator(vals.begin(),
data.begin(),
thrust::make_transform_iterator(thrust::counting_iterator<int>(0), _1/K), // index related to N
thrust::make_transform_iterator(thrust::counting_iterator<int>(0), _1%K), // index related to K
nums_expand.begin(),
crit_expand.begin()),
thrust::make_zip_iterator(vals.end(),
data.end(),
thrust::make_transform_iterator(thrust::counting_iterator<int>(0), _1/K) + N*K,
thrust::make_transform_iterator(thrust::counting_iterator<int>(0), _1%K) + N*K,
nums_expand.end(),
crit_expand.end()),
res.begin(),
myOp());
...
}
当我在数组中使用任意值尝试使用
[N,K]
= [1000,256]
、[10000,256]
、[50000,256]
、[100000,256]
的集合时,性能已经令人满意。
但我想知道我的 Thrust 实现是否还有进一步加速的空间。我目前先把一些每段一个的值 expand 成 N*K 长度,以便在 if
判断中使用,也许可以通过 permutation_iterator
之类的方式避免这一步,但我想不出具体做法。另外,我用 _1/K
和 _1%K
来计算元素的段索引和段内局部索引,也许有更聪明的办法可以避免这些整数除法/取模运算。
至少,从代码整洁(cosmetic)的角度来看,我很乐意将
expand(...)
直接插入到 thrust::transform(...)
中,而不必定义另一个向量,例如 nums_expand
.
欢迎任何改进机会的建议。
用于比较的完整代码
//https://stackoverflow.com/questions/31955505/can-thrust-transform-reduce-work-with-2-arrays%5B/url%5D
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/reduce.h>
#include <thrust/scan.h>
#include <thrust/scatter.h>
#include <thrust/gather.h>
#include <thrust/copy.h>
#include <thrust/transform.h>
#include <thrust/functional.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/discard_iterator.h>
//#include <thrust/execution_policy.h>
#include <cstdio>
#include <iostream>
#include <iomanip>
#include <helper_timer.h>
///////// https://github.com/NVIDIA/thrust/blob/master/examples/expand.cu //////////
// expand(): replicate values according to per-element counts.
// Given counts [first1, last1) and values starting at first2, writes each
// value counts[i] times into 'output', e.g. counts [3,3,3] over [a,b,c]
// produce [a,a,a,b,b,b,c,c,c]. Returns the end of the written range.
// Taken from the Thrust examples (expand.cu).
// NOTE(review): every call allocates two temporary device_vectors and runs
// two scans plus a scatter and a gather — significant overhead if called
// repeatedly in a hot path.
template <typename InputIterator1,
typename InputIterator2,
typename OutputIterator>
OutputIterator expand(InputIterator1 first1,
InputIterator1 last1,
InputIterator2 first2,
OutputIterator output)
{
typedef typename thrust::iterator_difference<InputIterator1>::type difference_type;
difference_type input_size = thrust::distance(first1, last1);
// total output length = sum of all counts
difference_type output_size = thrust::reduce(first1, last1);
// scan the counts to obtain output offsets for each input element
thrust::device_vector<difference_type> output_offsets(input_size, 0);
thrust::exclusive_scan(first1, last1, output_offsets.begin());
// scatter the nonzero counts into their corresponding output positions
thrust::device_vector<difference_type> output_indices(output_size, 0);
thrust::scatter_if
(thrust::counting_iterator<difference_type>(0),
thrust::counting_iterator<difference_type>(input_size),
output_offsets.begin(),
first1,
output_indices.begin());
// compute max-scan over the output indices, filling in the holes
thrust::inclusive_scan
(output_indices.begin(),
output_indices.end(),
output_indices.begin(),
thrust::maximum<difference_type>());
// gather input values according to index array (output = first2[output_indices])
thrust::gather(output_indices.begin(),
output_indices.end(),
first2,
output);
// return output + output_size
thrust::advance(output, output_size);
return output;
}
/////////////////////////////////////////////////////////////////////////////////////
// Prints every element of 'vec' right-aligned in a 5-character field,
// then terminates the line.
template<typename T>
void print_vector(T& vec) {
    for (auto it = vec.begin(); it != vec.end(); ++it) {
        std::cout << std::setw(5) << *it;
    }
    std::cout << std::endl;
}
// Prints the stopwatch's elapsed time (reported in milliseconds),
// converted to seconds and averaged over 'average' runs.
void printSdkTimer(StopWatchInterface **timer, int average) {
    float elapsedMs = (float)sdkGetTimerValue(timer);
    float fAvgSeconds = (float)1.0e-3 * elapsedMs / (float)average;
    printf(" - Elapsed time: %.5f sec \n", fAvgSeconds);
}
// Elementwise functor over the flattened [N*K] arrays.
// Tuple slots: 0=vals  1=data  2=i/K (segment index)  3=i%K (local index)
//              4=nums  5=crit
// Returns 0.0 when the element lies in its segment's active prefix
// (i%K < nums) AND data >= crit; otherwise returns vals unchanged —
// exactly what myOpKernel computes.
struct myOp : public thrust::unary_function<thrust::tuple<double,double,int,int,int,double>, double> {
    __host__ __device__
    double operator() (const thrust::tuple<double,double,int,int,int, double> &t) const {
        // BUG FIX: the original first tested the SEGMENT index (slot 2, i/K)
        // against nums, which wrongly left every segment s with s >= nums[s]
        // completely untouched (with the benchmark's nums=200, segments
        // 200..N-1 were never zeroed). Only the LOCAL index (slot 3, i%K)
        // must be compared against nums, as the reference kernel does.
        if (thrust::get<3>(t) < thrust::get<4>(t) &&   // inside active prefix
            thrust::get<1>(t) >= thrust::get<5>(t)) {  // data >= crit
            return 0.0;
        }
        return thrust::get<0>(t); // keep vals unchanged
    }
};
// One thread per segment: thread 'index' owns the K-wide slice
// vals[index*K .. index*K+K-1] and zeroes the first nums[index] entries
// whose matching data value reaches crit[index]; other entries keep vals.
// Launch with at least N threads total; extra threads exit at the guard.
__global__ void myOpKernel(double *vals, double *data, int *nums, double *crit, int N, int K) {
    int index = blockIdx.x*blockDim.x + threadIdx.x;
    if (index >= N) return;
    double _crit = crit[index];
    // Hoist the trip count: the original re-read nums[index] from global
    // memory every iteration (the compiler cannot hoist it because vals/nums
    // may alias).
    int _num = nums[index];
    // size_t base avoids 32-bit overflow of index*K for very large N*K.
    size_t base = (size_t)index * (size_t)K;
    for (int i = 0; i < _num; i++) {
        // Store only when the value actually changes; the original wrote
        // vals back unconditionally every iteration.
        if (data[base + i] >= _crit) { vals[base + i] = 0.0; }
    }
}
// Benchmark driver: runs the Thrust pipeline and the raw kernel on identical
// inputs, times both, and verifies that their results agree element-wise.
int main(int argc, char **argv) {
    using namespace thrust::placeholders;
    // BUG FIX: guard argv access — the original dereferenced argv[1]/argv[2]
    // without checking argc.
    if (argc < 3) {
        fprintf(stderr, "usage: %s N K\n", argv[0]);
        return 1;
    }
    int N = atoi(argv[1]);
    int K = atoi(argv[2]);
    std::cout << "N " << N << " K " << K << std::endl;

    thrust::device_vector<double> vals(N*K);
    thrust::device_vector<double> data(N*K);
    thrust::device_vector<double> crit(N);
    thrust::device_vector<int>    nums(N);
    thrust::device_vector<double> res(N*K);

    // Arbitrary but deterministic fill pattern for the benchmark.
    for (int i=0; i<N; i++) {
        crit[i] = 101.0; // arbitrary
        nums[i] = 200;   // arbitrary number less than 256
        for (int j=0; j<K; j++) {
            vals[i*K + j] = (double)(i*K + j); // arbitrary
            data[i*K + j] = (double)(i*K + j); // arbitrary
        }
    }

    // Independent copies so the raw kernel sees identical inputs.
    thrust::device_vector<double> vals2 = vals;
    thrust::device_vector<double> data2 = data;
    thrust::device_vector<double> crit2 = crit;
    thrust::device_vector<int>    nums2 = nums;

    StopWatchInterface *timer = NULL;

    //--- 1) thrust
    // Per-segment values replicated to N*K length so myOp can read them
    // elementwise ([1,2,3] -> [1,1,1,2,2,2,3,3,3] for K=3).
    thrust::device_vector<int>    nums_expand(N*K);
    thrust::device_vector<double> crit_expand(N*K);
    expand(thrust::constant_iterator<int>(K),
           thrust::constant_iterator<int>(K)+N,
           nums.begin(),
           nums_expand.begin());
    expand(thrust::constant_iterator<int>(K),
           thrust::constant_iterator<int>(K)+N,
           crit.begin(),
           crit_expand.begin());

    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);
    thrust::transform(thrust::make_zip_iterator(vals.begin(),
                                                data.begin(),
                                                thrust::make_transform_iterator(thrust::counting_iterator<int>(0), _1/K), // for N
                                                thrust::make_transform_iterator(thrust::counting_iterator<int>(0), _1%K), // for K
                                                nums_expand.begin(),
                                                crit_expand.begin()),
                      thrust::make_zip_iterator(vals.end(),
                                                data.end(),
                                                thrust::make_transform_iterator(thrust::counting_iterator<int>(0), _1/K) + N*K,
                                                thrust::make_transform_iterator(thrust::counting_iterator<int>(0), _1%K) + N*K,
                                                nums_expand.end(),
                                                crit_expand.end()),
                      res.begin(),
                      myOp());
    // BUG FIX: synchronize BEFORE stopping the timer — the original stopped
    // the timer first and synchronized afterwards, so it could measure launch
    // overhead instead of the actual device execution time.
    cudaDeviceSynchronize();
    sdkStopTimer(&timer);
    printSdkTimer(&timer,1);
    sdkResetTimer(&timer);

    //--- 2) kernel
    double *raw_vals2 = thrust::raw_pointer_cast(vals2.data());
    double *raw_data2 = thrust::raw_pointer_cast(data2.data());
    double *raw_crit2 = thrust::raw_pointer_cast(crit2.data());
    int    *raw_nums2 = thrust::raw_pointer_cast(nums2.data());
    int Nthreads = 256;
    // BUG FIX: myOpKernel uses one thread per SEGMENT (N threads), not one
    // per element; the original launched N*K threads, so K-1 of every K
    // threads exited immediately at the bounds check.
    int Nblocks = (N - 1) / Nthreads + 1;

    sdkStartTimer(&timer);
    myOpKernel<<<Nblocks,Nthreads>>>(raw_vals2, raw_data2, raw_nums2, raw_crit2, N, K);
    cudaError_t err = cudaGetLastError(); // catch bad launch configuration
    cudaDeviceSynchronize();
    sdkStopTimer(&timer);
    printSdkTimer(&timer,1);
    if (err != cudaSuccess) {
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));
        sdkDeleteTimer(&timer);
        return 1;
    }

    // Sanity check: both implementations must agree element-wise (the
    // original never compared the results despite being "code for comparison").
    thrust::host_vector<double> h_res   = res;
    thrust::host_vector<double> h_vals2 = vals2;
    int mismatch = 0;
    for (int i = 0; i < N*K; i++) {
        if (h_res[i] != h_vals2[i]) mismatch++;
    }
    printf(" - Mismatching elements: %d\n", mismatch);

    sdkDeleteTimer(&timer);
    return 0;
}