CUDA 排序 Z 轴 3D 数组 C++/Thrust

Question

我正在寻找沿 z 轴对大型 3D 数组进行排序。

示例数组为 X x Y x Z (1000x1000x5)

我想沿 z 轴排序，因此我会对沿 z 轴的 5 个元素执行 1000x1000 排序。

编辑更新：尝试使用下面的推力。它很实用，我会将输出存储回来，但这非常慢，因为我每次对每个 (x,y) 位置排序 5 个元素：

#include <stdio.h>
#include <stdlib.h>
#include <iostream>

#include <thrust/device_ptr.h>
#include <thrust/sort.h>
#include <thrust/gather.h>
#include <thrust/iterator/counting_iterator.h>

int main(){
int x = 1000, y = 1000, z = 5;
float*** unsorted_cube = new float** [x];

for (int i = 0; i < x; i++) 
{
    // Allocate memory blocks for 
    // rows of each 2D array 
    unsorted_cube[i] = new float* [y];

    for (int j = 0; j < y; j++) 
    {
        // Allocate memory blocks for 
        // columns of each 2D array 
        unsorted_cube[i][j] = new float[z];
    }
}


for (int i = 0; i < x; i++)
{
    for (int j = 0; j < y; j++)
    {
        unsorted_cube[i][j][0] = 4.0f;
        unsorted_cube[i][j][1] = 3.0f;
        unsorted_cube[i][j][2] = 1.0f;
        unsorted_cube[i][j][3] = 5.0f;
        unsorted_cube[i][j][4] = 2.0f;
    }
}

for (int i = 0; i < 5; i++)
{
    printf("unsorted_cube first 5 elements to sort at (0,0): %f\n", unsorted_cube[0][0][i]);
}

float* temp_input;
float* temp_output;
float* raw_ptr;
float raw_ptr_out[5];
cudaMalloc((void**)&raw_ptr, N_Size * sizeof(float));
for (int i = 0; i < x; i++)
{ 
    for (int j = 0; j < y; j++)
    {
        temp_input[0] = unsorted_cube[i][j][0];
        temp_input[1] = unsorted_cube[i][j][1];
        temp_input[2] = unsorted_cube[i][j][2];
        temp_input[3] = unsorted_cube[i][j][3];
        temp_input[4] = unsorted_cube[i][j][4];

        cudaMemcpy(raw_ptr, temp_input, 5 * sizeof(float), cudaMemcpyHostToDevice);
        thrust::device_ptr<float> dev_ptr = thrust::device_pointer_cast(raw_ptr);
        thrust::sort(dev_ptr, dev_ptr + 5);
        thrust::host_vector<float> host_vec(5);
        thrust::copy(dev_ptr, dev_ptr + 5, raw_ptr_out);

        if (i == 0 && j == 0)
        {
            for (int i = 0; i < 5; i++)
            {
                temp_output[i] = raw_ptr_out[i];
            }
            printf("sorted_cube[0,0,0] : %f\n", temp_output[0]);
            printf("sorted_cube[0,0,1] : %f\n", temp_output[1]);
            printf("sorted_cube[0,0,2] : %f\n", temp_output[2]);
            printf("sorted_cube[0,0,3] : %f\n", temp_output[3]);
            printf("sorted_cube[0,0,4] : %f\n", temp_output[4]);
        }
    }
}
}

Answer 1

假设数据采用的格式中每个 xy 平面中的值在内存中是连续的：

data[((z * y_length) + y) * x_length + x]

（这也最适合在 GPU 上合并内存访问）

#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/for_each.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/sort.h>
#include <thrust/zip_function.h>

void sort_in_z_dir(thrust::device_vector<float> &data, int x_length,
                   int y_length) {  // z_length == 5
    auto z_stride = x_length * y_length;

    thrust::for_each_n(
        thrust::make_zip_iterator(
            data.begin() + 0 * z_stride,
            data.begin() + 1 * z_stride,
            data.begin() + 2 * z_stride,
            data.begin() + 3 * z_stride,
            data.begin() + 4 * z_stride),
        z_stride,
        thrust::make_zip_function([] __host__ __device__(float a, float b,
                                                         float c, float d,
                                                         float e) {
            float local_data[] = {a, b, c, d, e};
            thrust::sort(
                thrust::seq, local_data,
                local_data + sizeof(local_data) / sizeof(local_data[0]));
            a = local_data[0];
            b = local_data[1];
            c = local_data[2];
            d = local_data[3];
            e = local_data[4];
        }));
}

这个解决方案在硬编码方面确实非常丑陋

z_length

。人们可以使用一些 C++ 模板“魔法”将

z_length

变成模板参数，但这对于这个关于 Thrust 的答案似乎有点过分了。

有关数组和元组之间接口的示例，请参阅将 std::tuple 转换为 std::array C++11 和如何将 std::array 转换为 std::tuple？。

这个解决方案的好处是，就排序算法本身而言，它应该在性能方面几乎是最佳的。我不知道

thrust::sort

 是否针对如此小的输入数组进行了优化，但是您可以按照我在评论中建议的那样用任何自己编写的排序算法替换它。

如果您希望能够使用不同的

z_length

 而没有所有这些麻烦，您可能更喜欢这个解决方案，它在全局内存中排序，这远非最佳，并且感觉有点 hacky，因为它几乎只使用 Thrust 来启动一个内核。在这里，您希望数据以相反的方式排序：

data[((x * y_length) + y) * z_length + z]

#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/for_each.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/sort.h>

void sort_in_z_dir_alternative(thrust::device_vector<float> &data, int x_length,
                               int y_length, int z_length) {
    int n_threads = x_length * y_length;
    float *ddata = thrust::raw_pointer_cast(data.data());
    thrust::for_each(thrust::make_counting_iterator(0),
                     thrust::make_counting_iterator(n_threads),
                     [ddata, z_length] __host__ __device__(int idx) {
                         thrust::sort(thrust::seq,
                                      ddata + z_length * idx,
                                      ddata + z_length * (idx + 1));
                     });
}

如果您同意

z_length

 作为模板参数，这可能是一个结合了两个世界最好的解决方案（数据格式如第一个示例）：

#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/for_each.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/sort.h>

template <int z_length>
void sort_in_z_dir_middle_ground(thrust::device_vector<float> &data,
                                 int x_length, int y_length) {
    int n_threads = x_length * y_length;  // == z_stride
    float *ddata = thrust::raw_pointer_cast(data.data());

    thrust::for_each(thrust::make_counting_iterator(0),
                     thrust::make_counting_iterator(n_threads),
                     [ddata, n_threads] __host__ __device__(int idx) {
                         float local_data[z_length];
#pragma unroll
                         for (int i = 0; i < z_length; ++i) {
                             local_data[i] = ddata[idx + i * n_threads];
                         }
                         thrust::sort(thrust::seq,
                                      local_data,
                                      local_data + z_length);
#pragma unroll
                         for (int i = 0; i < z_length; ++i) {
                             ddata[idx + i * n_threads] = local_data[i];
                         }
                     });
}

CUDA 排序 Z 轴 3D 数组 C++/Thrust

问题描述投票：0回答：1

1个回答

最新问题

CUDA 排序 Z 轴 3D 数组 C++/Thrust

问题描述 投票：0回答：1

1个回答

最新问题

问题描述投票：0回答：1