Thrust lower_bound 返回的迭代器总是指向第一个元素

Question

我编写了一个函数，用向量中最接近的值（~3500 个元素）替换二维矩阵 (1024x2048) 每一行的值。向量和每一行根据包含数据的拐点（预先计算）分为 4 个部分。 “最接近”是指同一区间内的值（数据呈波浪状，但我们可以保证区间内的唯一性）。使用的函数调用和函子如下所示（为简洁起见，省略了读取数据等）。

CUDA：11.5，推力：11.4，显卡：Geforce GTX 1650。

thrust::device_vector<float> output(1024*2048);
const thrust::device_vector<float> inputMat(1024*2048);
const thrust::device_vector<float> inputVec(3500);
auto it = thrust::max_element(inputVec.begin(), inputVec.end());
float inputVecMax = *it;
const thrust::tuple<int, int, int> vecSections;
const thrust::tuple<int, int, int> matSections;

thrust::transform(thrust::device,
  thrust::counting_iterator<int>(0),
  thrust::counting_iterator<int>(1024*2048),
  output.begin(),
  MyMap(
    inputMat.data(), 
    inputVec.data(),
    inputVecMax,
    inputVec.size(), 
    {thrust::get<0>(matSections), thrust::get<1>(matSections), thrust::get<2>(matSections)}, 
    {thrust::get<0>(vecSections), thrust::get<1>(vecSections), thrust::get<2>(vecSections)})));

struct MyMap
{
  __host__ __device__ MyMap(
    const thrust::device_ptr<const float> mat, 
    const thrust::device_ptr<const float> vec
    const float& vecMax,
    const int& vecSize, 
    const int3& matSectionIdxs, 
    const int3& vecSectionIdxs) : 
    m_mat(mat), 
    m_vec(vec),
    m_vecMax(vecMax),
    m_vecSize(vecSize),
    m_matSectionIdxs(matSectionIdxs),
    m_vecSectionIdxs(vecSectionIdxs){}
  __host__ __device__ float operator()(const int& idx) const
  {
    float valToSearch = m_mat[idx] / (1/m_vecMax);

    int col = idx % 2048;
    
    float v2 = cuda::std::numeric_limits<float>::max();
    thrust::device_ptr<const float> it;

    if (col <= m_matSectionIdxs.x)
    {
      it = thrust::lower_bound(thrust::device, m_vec, m_vec+m_vecSectionIdxs.x, valToSearch);
    }
    else if (col > m_matSectionIdxs.x && col <= m_matSectionIdxs.y)
    {
      it = thrust::lower_bound(thrust::device, m_vec+m_vecSectionIdxs.x, m_vec+m_vecSectionIdxs.y, valToSearch);
    }
    else if (col > m_matSectionIdxs.y && col <= m_matSectionIdxs.z)
    {
      it = thrust::lower_bound(thrust::device, m_vec+m_vecSectionIdxs.y, m_vec+m_vecSectionIdxs.z, valToSearch);
    }
    else
    {
      it = thrust::lower_bound(thrust::device, m_vec+m_vecSectionIdxs.z, m_vec+m_vecSize, valToSearch);
    }

    int dist = thrust::distance(m_vec, it);
    float v1 = m_vec[dist];
    if (dist+1 < m_vecSize)
    {
      v2 = m_vec[dist+1];
    }
    int index = dist + static_cast<int>(cuda::std::abs(valToSearch-v1) > cuda::std::abs(valToSearch-v2);
    return m_vec[index];
  }

  thrust::device_ptr<const float> m_mat;
  thrust::device_ptr<const float> m_vec;
  float m_vecMax;
  int m_vecSize;
  int3 m_matSectionIdxs;
  int3 m_vecSectionIdxs;
};

我已经确认了 if 语句之外的所有内容都正常工作。我的问题是

thrust::lower_bound

返回的迭代器总是指向第一个元素，因此所有 4 个

v1

和

v2

的值都是相同的。

inputVec

的值开始徘徊在 -17 到 +20 之间，但随后在第 2000 个元素附近飙升至超过 1000。

valToSearch

大部分都在900以上

我在

lower_bound

中使用了另一个函子只是为了查看它接收到的值（此处省略，因为它只是添加了一个

printf

）并且能够确认输入值是正确的。

我也试过用

lower_bound

替换

upper_bound

，只是想看看会发生什么，但我得到了相同的结果。

编辑：

感谢@paleonix 的风格建议。我更新了上面的代码以反映有关

const

、函数签名和缺少

的建议。

至于

constant_iterator

s，我最初尝试让他们成为

MyMap

的数据成员，但由于某种原因他们都评估为

。我还发现使用

m_mat[idx]

评估为

，但

*(m_mat+idx).get()

和

(m_mat+idx).get()[0]

评估正确。我想将重复的代码移出 if 语句，但我还没有找到

it

的类型应该是什么（没有找到类型名

ForwardIterator

，其他人也没有 [例如

thrust::iterator_adapter

] 工作）。对于

/ (1/vecMax)

，我只是想说明

valToSearch

是根据输入矩阵和一些输入常数计算出来的。那些省略的常量只在那个计算中使用。

编辑2：

当试图重新创建成员变量评估为

的问题时，我发现问题是我试图直接打印

m_mat[idx]

。我猜这是因为

m_mat[idx]

驻留在设备上，但

printf

需要数据在主机上。因此，如果我将

m_mat[idx]

分配给一个临时变量，然后打印该变量，它就可以正常工作。

~~当我试图将~~

it

定义为
thrust::device_ptr<float>
时，我得到了一个编译错误
error: no instance of function template "thrust::distance" matches the argument list
和
error: no instance of overloaded function "thrust::pointer<Element, Tag, Reference, Derived>::operator= [with Element=float, Tag=thrust::device_system_tag, Reference=thrust::device_reference<float>, Derived=thrust::device_ptr<float>]" matches the argument list
.

it

的正确类型是

thrust::device_ptr<const float>

.

尝试将

thrust::device

更改为

thrust::seq

，但这让我的程序崩溃了。

我添加了对

it+1

访问的检查，更正了

index = ... static_cast...

中的条件，更新了代码以提高可读性，并添加了我正在使用的 CUDA 和 Thrust 版本。

编辑3：

我写了一个更简单的程序来复制这个问题

thrust::lower_bound

:

#include <thrust/iterator/counting_iterator.h>
#include <thrust/device_vector.h>
#include <thrust/binary_search.h>
#include <thrust/transform.h>
#include <thrust/sequence.h>

struct myLowerBound
{
  __host__ __device__ myLowerBound(const thrust::device_ptr<float> input) : m_input(input) {}
  __host__ __device__ int operator()(const int& idx) const
  {
    printf("Enter\n");
    auto it = thrust::lower_bound(thrust::device, m_input, m_input + 100, 25);
    printf("after1\n");
    int dist = thrust::distance(m_input, it);
    printf("after2\n");
    printf("dist: %d\n", dist);

    return dist;
  }

  thrust::device_ptr<float> m_input;
};

int main() {
    thrust::device_vector<float> input(100);
    thrust::sequence(thrust::device, input.begin(), input.end());
    float tmp = input[25];
    printf("input[25]: %f\n", tmp);

    thrust::device_vector<int> out(input.size());
    thrust::transform(
        thrust::device,
        thrust::counting_iterator<int>(0),
        thrust::counting_iterator<int>(input.size()),
        out.begin(),
        myLowerBound(input.data()));

    int out4 = out[4];
    printf("out4: %d\n", out4);
}

输出为：

...
Enter
input[25]: 25.000000
out4: 0

Thrust lower_bound 返回的迭代器总是指向第一个元素

问题描述投票：0回答：0

最新问题

Thrust lower_bound 返回的迭代器总是指向第一个元素

问题描述 投票：0回答：0

最新问题

问题描述投票：0回答：0