我有两个使用 CUDA Thrust 操作设备向量中的元素的操作。哪些方法可以更高效地实现这两个功能?
将一个向量的部分值替换为另一个向量的值。示例如下:
arr1 = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
arr2 = [1, 1, 1, 2, 2, 2]
// After replacing {4, 5, 6} and {10, 11, 12} in batch = 3:
arr1 = [1, 2, 3, 1, 1, 1, 7, 8, 9, 2, 2, 2]
就我而言,我总是有
size(arr1) / size(arr2) = 2
。
就我的场景而言,我们替换 arr1 中从索引 1 * batch 和 3 * batch 开始的那几个批次的值(如上例所示)。
按交替的批次把两个向量合并为一个。同样的问题曾在《如何按交替索引合并两个向量?》中被提出过,但那是针对 R 语言的。
arr1 = [1, 2, 3, 4, 5, 6]
arr2 = [1, 1, 1, 2, 2, 2]
//After merging arr1 and arr2:
arr3 = [1, 2, 3, 1, 1, 1, 4, 5, 6, 2, 2, 2]
replace_copy_if
可能有用,但我不知道如何把它与花式迭代器(fancy iterators)结合起来。此外,有些博客显示 replace_copy_if
比 copy_if
慢。
这个操作是把
arr2
中的值分散(scatter)到 arr1
中,因此我们使用 thrust::scatter
。分散的目标索引可以用 thrust::transform_iterator
配合 thrust::counting_iterator
来计算(类似于 std::ranges::views::iota
)。对于目标批次编号 indices
以另一个 thrust::device_vector
作为输入给出的一般情况,您可以使用
const auto indices_ptr = indices.data();
const auto scatter_indices_it = thrust::make_transform_iterator(
thrust::make_counting_iterator(0),
[batch, indices_ptr]
__host__ __device__ (int idx) {
const int batch_id = idx / batch;
const int elem_id = idx % batch;
return indices_ptr[batch_id] * batch + elem_id;
});
而在您只想分散到奇数批次的特定情况下,您应该使用
const auto scatter_indices_it = thrust::make_transform_iterator(
thrust::make_counting_iterator(0),
[batch]
__host__ __device__ (int idx) {
const int batch_id = idx / batch;
const int elem_id = idx % batch;
return (batch_id * 2 + 1) * batch + elem_id;
});
scatter
操作本身很简单:
thrust::scatter(
arr2.cbegin(), arr2.cend(),
scatter_indices_it,
arr1.begin());
当然有多种可能的方法可以做到这一点。我首先想到的是将两个向量与
thrust::merge_by_key
合并,其中使用与上面的散点索引类似的方案生成键:
const auto merge_keys1_it = thrust::make_transform_iterator(
thrust::make_counting_iterator(0),
[batch]
__host__ __device__ (int idx) {
const int batch_id = idx / batch;
return batch_id * 2;
});
const auto merge_keys2_it = thrust::make_transform_iterator(
thrust::make_counting_iterator(0),
[batch]
__host__ __device__ (int idx) {
const int batch_id = idx / batch;
return batch_id * 2 + 1;
});
thrust::merge_by_key(
merge_keys1_it, merge_keys1_it + arr1.size(),
merge_keys2_it, merge_keys2_it + arr2.size(),
arr1.cbegin(),
arr2.cbegin(),
thrust::make_discard_iterator(),
arr3.begin());
这可行并且相对优雅,但对于性能来说可能并不理想。我稍后会添加一个性能更高的版本。
#include <cassert>
#include <iostream>
#include <iterator>

#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/merge.h>
#include <thrust/scatter.h>
// Prints the elements of a device vector to stdout as a comma-separated
// list followed by a newline.
//
// Fix: the original hard-coded `int` for the host buffer and the output
// iterator, silently converting every element of a non-int T — use T
// throughout so the template actually works generically.
template <typename T>
void print(const thrust::device_vector<T> &vec) {
    // Bulk-copy to the host first: streaming straight from a device_vector
    // would issue one device-to-host transfer per element printed.
    const thrust::host_vector<T> h_vec(vec);
    std::ostream_iterator<T> out_it(std::cout, ", ");
    thrust::copy(h_vec.cbegin(), h_vec.cend(), out_it);
    std::cout << '\n';
}
// Part 1: overwrite selected batches of arr1 with the contents of arr2,
// using thrust::scatter driven by a computed destination-index iterator.
void part1() {
    constexpr int h_arr1[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
    constexpr int h_arr2[] = {1, 1, 1, 2, 2, 2};
    constexpr int batch = 3;
    thrust::device_vector<int> arr1(std::cbegin(h_arr1), std::cend(h_arr1));
    thrust::device_vector<int> arr2(std::cbegin(h_arr2), std::cend(h_arr2));
    assert(arr1.size() == 2 * arr2.size());
#if 1
    // specialized version where arr2 is scattered to "uneven" batches:
    // element i of arr2 goes to slot (i % batch) of batch number 2k+1,
    // where k = i / batch.
    const auto dst_index_it = thrust::make_transform_iterator(
        thrust::make_counting_iterator(0),
        [batch] __host__ __device__(int i) {
            const int which_batch = i / batch;
            const int offset_in_batch = i % batch;
            return (which_batch * 2 + 1) * batch + offset_in_batch;
        });
#else
    // more general version where the target batch numbers for arr2 are
    // supplied in a device vector of indices
    constexpr int h_indices[] = {1, 3};
    thrust::device_vector<int> indices(std::cbegin(h_indices), std::cend(h_indices));
    const auto indices_ptr = indices.data();
    const auto dst_index_it = thrust::make_transform_iterator(
        thrust::make_counting_iterator(0),
        [batch, indices_ptr] __host__ __device__(int i) {
            const int which_batch = i / batch;
            const int offset_in_batch = i % batch;
            return indices_ptr[which_batch] * batch + offset_in_batch;
        });
#endif
    // Scatter every element of arr2 to its computed position inside arr1.
    thrust::scatter(arr2.cbegin(), arr2.cend(), dst_index_it, arr1.begin());
    print(arr1);
}
// Part 2: interleave arr1 and arr2 batch-by-batch into arr3 via
// thrust::merge_by_key. Batches of arr1 get even keys and batches of
// arr2 get odd keys, so the stable merge alternates them.
void part2() {
    constexpr int h_arr1[] = {1, 2, 3, 4, 5, 6};
    constexpr int h_arr2[] = {1, 1, 1, 2, 2, 2};
    constexpr int batch = 3;
    thrust::device_vector<int> arr1(std::cbegin(h_arr1), std::cend(h_arr1));
    thrust::device_vector<int> arr2(std::cbegin(h_arr2), std::cend(h_arr2));
    thrust::device_vector<int> arr3(arr1.size() + arr2.size());
    assert(arr1.size() == arr2.size());
    // Key 2k for every element of the k-th batch of arr1 ...
    const auto even_keys_it = thrust::make_transform_iterator(
        thrust::make_counting_iterator(0),
        [batch] __host__ __device__(int i) {
            return (i / batch) * 2;
        });
    // ... and key 2k+1 for every element of the k-th batch of arr2.
    const auto odd_keys_it = thrust::make_transform_iterator(
        thrust::make_counting_iterator(0),
        [batch] __host__ __device__(int i) {
            return (i / batch) * 2 + 1;
        });
    // Only the ordering induced by the keys matters, so the merged keys
    // themselves are discarded.
    thrust::merge_by_key(
        even_keys_it, even_keys_it + arr1.size(),
        odd_keys_it, odd_keys_it + arr2.size(),
        arr1.cbegin(),
        arr2.cbegin(),
        thrust::make_discard_iterator(),
        arr3.begin());
    print(arr3);
}
// Entry point: run both demonstrations in order.
int main() {
    part1();
    part2();
    return 0;
}