Replace / merge operations on vectors with CUDA Thrust


I have two operations that manipulate elements of device vectors with CUDA Thrust. How can these two operations be implemented more efficiently?

  1. Replace some of the values of one vector with values from another vector. For example:

    arr1 = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
    arr2 = [1, 1, 1, 2, 2, 2]
    // After replacing {4, 5, 6} and {10, 11, 12} with batch = 3:
    arr1 = [1, 2, 3, 1, 1, 1, 7, 8, 9, 2, 2, 2]
    

    In my case, I always have

    size(arr1) / size(arr2) = 2

    and the values of arr2 replace the values of arr1 at the odd-numbered batches, i.e. at indices 1 * batch to 2 * batch and 3 * batch to 4 * batch in the example above.

  2. Merge two vectors at alternating indices.

    The same question was asked in "How to merge 2 vectors alternating indexes?", but for the R language.

    arr1 = [1, 2, 3, 4, 5, 6]
    arr2 = [1, 1, 1, 2, 2, 2]
    //After merging arr1 and arr2:
    arr3 = [1, 2, 3, 1, 1, 1, 4, 5, 6, 2, 2, 2]
    

    replace_copy_if might be useful, but I don't know how to combine it with fancy iterators. Also, some blogs show that replace_copy_if is slower than copy_if. For comparison, the most straightforward baseline I can think of for operation 1 is sketched right after this list.
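
For operation 1, that baseline is simply one thrust::copy per target batch (this sketch assumes arr1 and arr2 are thrust::device_vector<int> and batch is an int, as in the example above); it works, but it issues one copy call per batch and will not scale to many batches:

    // One copy per replaced batch: arr2's batch b overwrites arr1's batch (2 * b + 1).
    for (std::size_t b = 0; b < arr2.size() / batch; ++b) {
        thrust::copy(arr2.cbegin() + b * batch,
                     arr2.cbegin() + (b + 1) * batch,
                     arr1.begin() + (2 * b + 1) * batch);
    }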

Tags: c++, vector, cuda, thrust
1 Answer
  1. This operation scatters the values of arr2 into arr1, so we use thrust::scatter. The indices to scatter to can be computed with a thrust::transform_iterator on top of a thrust::counting_iterator (comparable to std::ranges::views::iota). For the general case where the batch indices are given as another input thrust::device_vector indices, you can use

    const auto indices_ptr = indices.data();
    
    const auto scatter_indices_it = thrust::make_transform_iterator(
        thrust::make_counting_iterator(0),
        [batch, indices_ptr]
        __host__ __device__ (int idx) {
            const int batch_id = idx / batch;
            const int elem_id = idx % batch;
            return indices_ptr[batch_id] * batch + elem_id;
        });
    

    In the specific case where you only want to scatter to the odd batches, you can instead use

    const auto scatter_indices_it = thrust::make_transform_iterator(
        thrust::make_counting_iterator(0),
        [batch]
        __host__ __device__ (int idx) {
            const int batch_id = idx / batch;
            const int elem_id = idx % batch;
            return (batch_id * 2 + 1) * batch + elem_id;
        });
    

    The scatter operation itself is then straightforward:

    thrust::scatter(
        arr2.cbegin(), arr2.cend(),
        scatter_indices_it,
        arr1.begin());
    
  2. There are certainly multiple possible ways of doing this. The first one that came to my mind is merging the two vectors with thrust::merge_by_key, where the keys are generated with a scheme similar to the scatter indices above:

    const auto merge_keys1_it = thrust::make_transform_iterator(
        thrust::make_counting_iterator(0),
        [batch]
        __host__ __device__ (int idx) {
            const int batch_id = idx / batch;
            return batch_id * 2;
        });
    const auto merge_keys2_it = thrust::make_transform_iterator(
        thrust::make_counting_iterator(0),
        [batch]
        __host__ __device__ (int idx) {
            const int batch_id = idx / batch;
            return batch_id * 2 + 1;
        });
    thrust::merge_by_key(
        merge_keys1_it, merge_keys1_it + arr1.size(),
        merge_keys2_it, merge_keys2_it + arr2.size(),
        arr1.cbegin(),
        arr2.cbegin(),
        thrust::make_discard_iterator(),
        arr3.begin());
    

    This works and is relatively elegant, but it is probably not ideal performance-wise. I might add a more performant version later; a single-pass alternative is sketched below.
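
    As one possible single-pass sketch (not necessarily the fastest option): every output position of arr3 can be computed independently, so the interleaving can also be expressed with thrust::tabulate over raw pointers to the inputs. This assumes the same arr1, arr2, arr3 and batch as in the full source below and additionally needs #include <thrust/tabulate.h>:

    const int *a1 = thrust::raw_pointer_cast(arr1.data());
    const int *a2 = thrust::raw_pointer_cast(arr2.data());
    thrust::tabulate(
        arr3.begin(), arr3.end(),
        [batch, a1, a2]
        __host__ __device__ (int idx) {
            const int batch_id = idx / batch;                   // which output batch
            const int elem_id = idx % batch;                    // position inside the batch
            const int src = (batch_id / 2) * batch + elem_id;   // index inside the source array
            // Even output batches come from arr1, odd ones from arr2.
            return (batch_id % 2 == 0) ? a1[src] : a2[src];
        });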

Full source code:

#include <cassert>
#include <iostream>
#include <iterator>

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>

#include <thrust/copy.h>
#include <thrust/scatter.h>
#include <thrust/merge.h>

#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/discard_iterator.h>

template <typename T>
void print(const thrust::device_vector<T> &vec) {
    thrust::host_vector<T> h_vec(vec);
    std::ostream_iterator<T> out_it(std::cout, ", ");
    thrust::copy(h_vec.cbegin(), h_vec.cend(), out_it);
    std::cout << '\n';
}

void part1() {
    constexpr int h_arr1[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
    constexpr int h_arr2[] = {1, 1, 1, 2, 2, 2};
    constexpr int batch = 3;

    thrust::device_vector<int> arr1(std::cbegin(h_arr1), std::cend(h_arr1));
    thrust::device_vector<int> arr2(std::cbegin(h_arr2), std::cend(h_arr2));
    
    assert(arr1.size() == 2 * arr2.size());

#if 1
    // specialized version where arr2 is scattered to "uneven" batches
    const auto scatter_indices_it = thrust::make_transform_iterator(
        thrust::make_counting_iterator(0),
        [batch]
        __host__ __device__ (int idx) {
            const int batch_id = idx / batch;
            const int elem_id = idx % batch;
            return (batch_id * 2 + 1) * batch + elem_id;
        });
#else
    // more general version where arr2 is scattered to batches given in indices
    constexpr int h_indices[] = {1, 3};
    thrust::device_vector<int> indices(std::cbegin(h_indices), std::cend(h_indices));
    const auto indices_ptr = indices.data();

    const auto scatter_indices_it = thrust::make_transform_iterator(
        thrust::make_counting_iterator(0),
        [batch, indices_ptr]
        __host__ __device__ (int idx) {
            const int batch_id = idx / batch;
            const int elem_id = idx % batch;
            return indices_ptr[batch_id] * batch + elem_id;
        });
#endif
    
    thrust::scatter(
        arr2.cbegin(), arr2.cend(),
        scatter_indices_it,
        arr1.begin());

    print(arr1);
}

void part2() {
    constexpr int h_arr1[] = {1, 2, 3, 4, 5, 6};
    constexpr int h_arr2[] = {1, 1, 1, 2, 2, 2};
    constexpr int batch = 3;

    thrust::device_vector<int> arr1(std::cbegin(h_arr1), std::cend(h_arr1));
    thrust::device_vector<int> arr2(std::cbegin(h_arr2), std::cend(h_arr2));
    thrust::device_vector<int> arr3(arr1.size() + arr2.size());

    assert(arr1.size() == arr2.size());

    const auto merge_keys1_it = thrust::make_transform_iterator(
        thrust::make_counting_iterator(0),
        [batch]
        __host__ __device__ (int idx) {
            const int batch_id = idx / batch;
            return batch_id * 2;
        });
    const auto merge_keys2_it = thrust::make_transform_iterator(
        thrust::make_counting_iterator(0),
        [batch]
        __host__ __device__ (int idx) {
            const int batch_id = idx / batch;
            return batch_id * 2 + 1;
        });
    
    thrust::merge_by_key(
        merge_keys1_it, merge_keys1_it + arr1.size(),
        merge_keys2_it, merge_keys2_it + arr2.size(),
        arr1.cbegin(),
        arr2.cbegin(),
        thrust::make_discard_iterator(),
        arr3.begin());

    print(arr3);
}

int main(void) {
    part1();
    part2();
}
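
The device lambdas require nvcc's extended-lambda support, so the program has to be compiled with something like (the file name is arbitrary):

    nvcc -std=c++17 --extended-lambda thrust_replace_merge.cu -o thrust_replace_merge

Running it prints:

    1, 2, 3, 1, 1, 1, 7, 8, 9, 2, 2, 2, 
    1, 2, 3, 1, 1, 1, 4, 5, 6, 2, 2, 2, 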