我有两个使用 CUDA Thrust 操作设备向量中的元素的操作。哪些方法可以更高效地实现这两个功能?
将一个向量的部分值替换为另一个向量的值。示例如下:
arr1 = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
arr2 = [1, 1, 1, 2, 2, 2]
// After replacing {4, 5, 6} and {10, 11, 12} in batch = 3:
arr1 = [1, 2, 3, 1, 1, 1, 7, 8, 9, 2, 2, 2]
就我而言,我总是有
size(arr1) / size(arr2) = 2
。
就我的场景而言,我们替换 arr1 中从索引 1 * batch 和 3 * batch 开始的那几个批次的值(如上例所示)。
按交替的批次把两个向量合并为一个。同样的问题曾在《如何按交替索引合并两个向量?》中被提出过,但那是针对 R 语言的。
arr1 = [1, 2, 3, 4, 5, 6]
arr2 = [1, 1, 1, 2, 2, 2]
//After merging arr1 and arr2:
arr3 = [1, 2, 3, 1, 1, 1, 4, 5, 6, 2, 2, 2]
replace_copy_if
可能有用,但我不知道如何把它与花式迭代器(fancy iterators)结合起来。此外,有些博客显示 replace_copy_if
比 copy_if
慢。
这个操作是把
arr2
中的值分散(scatter)到 arr1
中,因此我们使用 thrust::scatter
。分散的目标索引可以用 thrust::transform_iterator
配合 thrust::counting_iterator
来计算(类似于 std::ranges::views::iota
)。对于目标批次编号 indices
以另一个 thrust::device_vector
作为输入给出的一般情况,您可以使用
const auto indices_ptr = indices.data();
const auto scatter_indices_it = thrust::make_transform_iterator(
thrust::make_counting_iterator(0),
[batch, indices_ptr]
__host__ __device__ (int idx) {
const int batch_id = idx / batch;
const int elem_id = idx % batch;
return indices_ptr[batch_id] * batch + elem_id;
});
而在您只想分散到奇数批次的特定情况下,您应该使用
const auto scatter_indices_it = thrust::make_transform_iterator(
thrust::make_counting_iterator(0),
[batch]
__host__ __device__ (int idx) {
const int batch_id = idx / batch;
const int elem_id = idx % batch;
return (batch_id * 2 + 1) * batch + elem_id;
});
scatter
操作本身很简单:
thrust::scatter(
arr2.cbegin(), arr2.cend(),
scatter_indices_it,
arr1.begin());
当然有多种可能的方法可以做到这一点。我首先想到的是将两个向量与
thrust::merge_by_key
合并,其中使用与上面的散点索引类似的方案生成键:
const auto merge_keys1_it = thrust::make_transform_iterator(
thrust::make_counting_iterator(0),
[batch]
__host__ __device__ (int idx) {
const int batch_id = idx / batch;
return batch_id * 2;
});
const auto merge_keys2_it = thrust::make_transform_iterator(
thrust::make_counting_iterator(0),
[batch]
__host__ __device__ (int idx) {
const int batch_id = idx / batch;
return batch_id * 2 + 1;
});
thrust::merge_by_key(
merge_keys1_it, merge_keys1_it + arr1.size(),
merge_keys2_it, merge_keys2_it + arr2.size(),
arr1.cbegin(),
arr2.cbegin(),
thrust::make_discard_iterator(),
arr3.begin());
这可行并且相对优雅,但对于性能来说可能并不理想。我稍后会添加一个性能更高的版本。
#include <cassert>
#include <iostream>
#include <iterator>

#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/merge.h>
#include <thrust/scatter.h>
// Prints the elements of a device vector to stdout as a comma-separated
// list followed by a newline.
//
// Fix: the original hard-coded `int` for the host buffer and the output
// iterator, silently converting every element of a non-int T — use T
// throughout so the template actually works generically.
template <typename T>
void print(const thrust::device_vector<T> &vec) {
    // Bulk-copy to the host first: streaming straight from a device_vector
    // would issue one device-to-host transfer per element printed.
    const thrust::host_vector<T> h_vec(vec);
    std::ostream_iterator<T> out_it(std::cout, ", ");
    thrust::copy(h_vec.cbegin(), h_vec.cend(), out_it);
    std::cout << '\n';
}
// Part 1: overwrite selected batches of arr1 with the contents of arr2,
// using thrust::scatter driven by a computed destination-index iterator.
void part1() {
    constexpr int h_arr1[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
    constexpr int h_arr2[] = {1, 1, 1, 2, 2, 2};
    constexpr int batch = 3;
    thrust::device_vector<int> arr1(std::cbegin(h_arr1), std::cend(h_arr1));
    thrust::device_vector<int> arr2(std::cbegin(h_arr2), std::cend(h_arr2));
    assert(arr1.size() == 2 * arr2.size());
#if 1
    // specialized version where arr2 is scattered to "uneven" batches:
    // element i of arr2 goes to slot (i % batch) of batch number 2k+1,
    // where k = i / batch.
    const auto dst_index_it = thrust::make_transform_iterator(
        thrust::make_counting_iterator(0),
        [batch] __host__ __device__(int i) {
            const int which_batch = i / batch;
            const int offset_in_batch = i % batch;
            return (which_batch * 2 + 1) * batch + offset_in_batch;
        });
#else
    // more general version where the target batch numbers for arr2 are
    // supplied in a device vector of indices
    constexpr int h_indices[] = {1, 3};
    thrust::device_vector<int> indices(std::cbegin(h_indices), std::cend(h_indices));
    const auto indices_ptr = indices.data();
    const auto dst_index_it = thrust::make_transform_iterator(
        thrust::make_counting_iterator(0),
        [batch, indices_ptr] __host__ __device__(int i) {
            const int which_batch = i / batch;
            const int offset_in_batch = i % batch;
            return indices_ptr[which_batch] * batch + offset_in_batch;
        });
#endif
    // Scatter every element of arr2 to its computed position inside arr1.
    thrust::scatter(arr2.cbegin(), arr2.cend(), dst_index_it, arr1.begin());
    print(arr1);
}
// Part 2: interleave arr1 and arr2 batch-by-batch into arr3 via
// thrust::merge_by_key. Batches of arr1 get even keys and batches of
// arr2 get odd keys, so the stable merge alternates them.
void part2() {
    constexpr int h_arr1[] = {1, 2, 3, 4, 5, 6};
    constexpr int h_arr2[] = {1, 1, 1, 2, 2, 2};
    constexpr int batch = 3;
    thrust::device_vector<int> arr1(std::cbegin(h_arr1), std::cend(h_arr1));
    thrust::device_vector<int> arr2(std::cbegin(h_arr2), std::cend(h_arr2));
    thrust::device_vector<int> arr3(arr1.size() + arr2.size());
    assert(arr1.size() == arr2.size());
    // Key 2k for every element of the k-th batch of arr1 ...
    const auto even_keys_it = thrust::make_transform_iterator(
        thrust::make_counting_iterator(0),
        [batch] __host__ __device__(int i) {
            return (i / batch) * 2;
        });
    // ... and key 2k+1 for every element of the k-th batch of arr2.
    const auto odd_keys_it = thrust::make_transform_iterator(
        thrust::make_counting_iterator(0),
        [batch] __host__ __device__(int i) {
            return (i / batch) * 2 + 1;
        });
    // Only the ordering induced by the keys matters, so the merged keys
    // themselves are discarded.
    thrust::merge_by_key(
        even_keys_it, even_keys_it + arr1.size(),
        odd_keys_it, odd_keys_it + arr2.size(),
        arr1.cbegin(),
        arr2.cbegin(),
        thrust::make_discard_iterator(),
        arr3.begin());
    print(arr3);
}
// Entry point: run both demonstrations in order.
int main() {
    part1();
    part2();
    return 0;
}