任务就像如何并行有效地设置位向量的位?,但是对于 CUDA。
考虑其中包含
N
位的位向量(N
很大,例如 4G)和 M
数字数组(M
也很大,例如 1G),每个数字都在 0..N-1
范围内,指示哪个向量的位必须设置为 1。位向量只是一个整数数组,具体为 uint32_t
。
我在全局内存上尝试了使用
atomicOr()
进行简单的实现:
// Sets bit (index % 32) of word bitset[index / 32] for every entry of
// `indices`. Uses a grid-stride loop, so any launch configuration covers
// all n_indices entries; concurrent updates of the same word are made
// safe by atomicOr.
//
//  n_indices - number of entries in `indices`
//  indices   - device array of bit positions, each in [0, n_bits)
//  n_bits    - total bit count of the bit vector (checked via assert)
//  bitset    - device array of n_bits/32 uint32_t words (the bit vector)
__global__ void BitwiseSet(const uint32_t n_indices, const uint32_t *indices,
const uint32_t n_bits, uint32_t *bitset)
{
const uint32_t stride = blockDim.x * gridDim.x;
uint32_t pos = threadIdx.x + blockDim.x * blockIdx.x;
while(pos < n_indices) {
const uint32_t idx = indices[pos];
assert(idx < n_bits);
// Word idx>>5 holds bit idx&31; OR the single-bit mask in atomically.
atomicOr(&bitset[idx >> 5], 1u << (idx & 31));
pos += stride;
}
}
它会为
4G
位和 1G
索引产生有趣的结果:
0.0383266
秒。对于排序索引 vs. 0.332674
秒。对于未排序(8.68x
改进)0.0564464
秒。对于排序索引 vs. 1.23666
秒。对于未排序(21.91x
改进)因此,设备似乎在一个 warp 内合并/联合多个
atomicOr()
操作,并且 L1 缓存可能会生效,因此当索引发生冲突时(排序索引就是这种情况),设置 32 位实际上比索引互不冲突时(未排序的情况)更快。
我们可以进一步改进已排序或未排序的情况吗?
更新:回答评论,任何解决方案都是有意义的,无论是排序还是未排序的情况,有或没有重复。未排序且有重复是更通用的情况,因此它是最令人感兴趣的。
更新2:按照向量化内存访问的建议,我实现了这个:
// Vectorized variant: each thread loads four indices per iteration as one
// uint4 (a single 128-bit load) and issues four atomicOr operations.
// NOTE(review): the reinterpret_cast assumes `indices` is 16-byte aligned —
// true for cudaMalloc'd buffers, but confirm for sub-array views.
// The up-to-three leftover indices (n_indices % 4) are handled by the
// first threads of the grid after the vectorized loop.
__global__ void BitwiseSet(const uint32_t n_indices, const uint32_t *indices, const uint32_t n_bits, uint32_t *bitset) {
const uint32_t stride = blockDim.x * gridDim.x;
const uint32_t first = threadIdx.x + blockDim.x * blockIdx.x;
const uint32_t n_quads = n_indices / 4;
const uint4 *quads = reinterpret_cast<const uint4*>(indices);
for(uint32_t q = first; q < n_quads; q += stride) {
const uint4 v = quads[q];
assert(v.x < n_bits);
assert(v.y < n_bits);
assert(v.z < n_bits);
assert(v.w < n_bits);
atomicOr(&bitset[v.x >> 5], 1u << (v.x & 31));
atomicOr(&bitset[v.y >> 5], 1u << (v.y & 31));
atomicOr(&bitset[v.z >> 5], 1u << (v.z & 31));
atomicOr(&bitset[v.w >> 5], 1u << (v.w & 31));
}
// Tail: fewer than 4 indices remain; threads 0..tail_len-1 take one each.
const uint32_t tail_begin = n_quads * 4;
const uint32_t tail_len = n_indices - tail_begin;
if(first < tail_len) {
const uint32_t idx = indices[tail_begin + first];
assert(idx < n_bits);
atomicOr(&bitset[idx >> 5], 1u << (idx & 31));
}
}
但至少在 RTX2080 上速度较慢(我现在没有带 RTX3090 的 eGPU 来测试):
0.0815998
秒。对于排序 vs. 1.39829
秒。对于未排序(17.14x
比率)这不是完整的答案,但我有太多代码无法评论。
您的代码主要受到分散原子写入的限制。
所以你很难指望最大化地利用内存总线带宽。只有当每次写入都充分利用整个缓存行(即只做合并写入)时,才能做到这一点。
但是,您可以通过使用
memcpy_async
预取数据来加快速度(可能高达 30%)。
您需要预取足够的数据来克服延迟。
// Prefetching variant: stages indices from global into shared memory via
// cuda::memcpy_async / cuda::pipeline (requires SM80+ for hardware async
// copies) so the scattered atomicOr latency overlaps with the loads.
// The in-flight prefetch count must not exceed 8 (pipeline stage limit).
//
// Fixes relative to the posted draft:
//  - template parameter typo: `myblockdim` -> `my_blockdim`
//  - shared declaration was missing its element type (uint32_t)
//  - unbalanced parentheses in the `end` computation
//  - `start` now uses the same per-block chunk size as `end`
//    (it was blockDim.x * blockIdx.x, inconsistent with the chunked `end`)
//  - priming no longer adds `start` twice (prefetch already offsets by it)
//  - prefetch is bounds-guarded so priming never reads past `end`
//  - every consumer_wait() is paired with consumer_release()
//prefetch count cannot be greater than 8!
template <int my_blockdim, int prefetchcount>
__global__ void BitwiseSet(const uint32_t n_indices, const uint32_t *indices,
const uint32_t n_bits, uint32_t *bitset)
{
constexpr uint32_t buffersize = my_blockdim * prefetchcount;
__shared__ uint32_t s_indices[buffersize];
auto pipeline = cuda::make_pipeline(); //pipeline with thread_scope_thread
//every block handles its own contiguous section of the data
const uint32_t per_block = (n_indices + gridDim.x - 1u) / gridDim.x;
const uint32_t start = blockIdx.x * per_block;
const uint32_t end = min(n_indices, start + per_block);
const uint32_t count = (end > start) ? (end - start) : 0u;
//stage one index from global to shared through the async pipeline;
//`rel` is the offset relative to this block's chunk start
const auto prefetch = [&](uint32_t rel){
//pipeline.producer_acquire(); //no-op for thread_scope_thread
if(start + rel < end) {
cuda::memcpy_async(&s_indices[rel % buffersize],
&indices[start + rel],
sizeof(uint32_t), pipeline);
}
pipeline.producer_commit(); //commit even when empty to keep stages balanced
};
//prime the pump: issue prefetchcount batches before consuming anything
for(int b = 0; b < prefetchcount; ++b) {
prefetch(threadIdx.x + uint32_t(b) * blockDim.x);
}
for(uint32_t rel = threadIdx.x; rel < count; rel += blockDim.x) {
pipeline.consumer_wait(); //wait for the oldest staged batch
//copy to a register BEFORE refilling: the next prefetch reuses this slot
const uint32_t index = s_indices[rel % buffersize];
pipeline.consumer_release();
//keep the pipeline full: stage the element prefetchcount batches ahead
prefetch(rel + buffersize);
assert(index < n_bits);
const uint32_t i_word = index >> 5;
const uint32_t i_bit = index & 31;
atomicOr(bitset + i_word, (1u << i_bit));
}
}
您可以根据需要展开它,方法是每个
memcpy_async()
执行多个 pipeline.producer_commit()
并根据需要调整其余部分。