如何在cudaTextureObject_t中使用CUDA tex1DFetch？

Question

当我注意到它们被弃用时，我正在使用纹理引用，我尝试更新我的测试函数以使用tex1Dfetch处理'new'无绑定纹理对象，但是无法产生相同的结果。

我正在探索使用纹理内存来加速我的aho-corasick实现;我能够让tex1D()使用纹理参考，但是，我注意到它们已被弃用并决定使用纹理对象。

当我尝试以任何方式使用结果时，我对内核有一些非常奇怪的行为;我可以毫无问题地做results[tidx] = tidx;，但results[tidx] = temp + 1;只返回temp的值而不是temp * 3或任何其他涉及temp的数值测试。

我没有看到这种行为的逻辑原因，文档示例看起来足够相似，我无法看到我出错的地方。

我已经读过CUDA tex1Dfetch（）错误的行为和新的CUDA纹理对象 - 在2D情况下得到错误的数据但似乎与我遇到的问题无关。

以防它有所作为;我正在使用CUDA 10.0版，V10.0.130和Nvidia GTX 980ti。

#include <iostream>

__global__ void test(cudaTextureObject_t tex ,int* results){
    int tidx = threadIdx.y * blockDim.x + threadIdx.x;
    unsigned temp = tex1Dfetch<unsigned>(tex, threadIdx.x);
    results[tidx] = temp * 3;
}

int main(){
    int *host_arr;
    const int host_arr_size = 8;

    // Create and populate host array
    std::cout << "Host:" << std::endl;
    cudaMallocHost(&host_arr, host_arr_size*sizeof(int));
    for (int i = 0; i < host_arr_size; ++i){
        host_arr[i] = i * 2;
        std::cout << host_arr[i] << std::endl;
    }

    // Create resource description
    struct cudaResourceDesc resDesc;
    resDesc.resType = cudaResourceTypeLinear;
    resDesc.res.linear.devPtr = &host_arr;
    resDesc.res.linear.sizeInBytes = host_arr_size*sizeof(unsigned);
    resDesc.res.linear.desc = cudaCreateChannelDesc<unsigned>();
    // Create texture description
    struct cudaTextureDesc texDesc;
    texDesc.readMode = cudaReadModeElementType;
    // Create texture
    cudaTextureObject_t tex;
    cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);

    // Allocate results array
    int * result_arr;
    cudaMalloc(&result_arr, host_arr_size*sizeof(unsigned));

    // launch test kernel
    test<<<1, host_arr_size>>>(tex, result_arr);

    // fetch results
    std::cout << "Device:" << std::endl;
    cudaMemcpy(host_arr, result_arr, host_arr_size*sizeof(unsigned), cudaMemcpyDeviceToHost);
    // print results
    for (int i = 0; i < host_arr_size; ++i){
        std::cout << host_arr[i] << std::endl;
    }

    // Tidy Up
    cudaDestroyTextureObject(tex);
    cudaFreeHost(host_arr);
    cudaFree(result_arr);
}

我期望上面的工作类似于下面（它确实有效）：


texture<int, 1, cudaReadModeElementType> tex_ref;
cudaArray* cuda_array;

__global__ void test(int* results){
    const int tidx = threadIdx.x;
    results[tidx] = tex1D(tex_ref, tidx) * 3;
}

int main(){
    int *host_arr;
    int host_arr_size = 8;

    // Create and populate host array
    cudaMallocHost((void**)&host_arr, host_arr_size * sizeof(int));
    for (int i = 0; i < host_arr_size; ++i){
        host_arr[i] = i * 2;
        std::cout << host_arr[i] << std::endl;
    }

    // bind to texture
    cudaChannelFormatDesc cuDesc = cudaCreateChannelDesc <int >();
    cudaMallocArray(&cuda_array, &cuDesc, host_arr_size);
    cudaMemcpyToArray(cuda_array, 0, 0, host_arr , host_arr_size * sizeof(int), cudaMemcpyHostToDevice);
    cudaBindTextureToArray(tex_ref , cuda_array);
    // Allocate results array
    int * result_arr;
    cudaMalloc((void**)&result_arr, host_arr_size*sizeof(int));

    // launch kernel
    test<<<1, host_arr_size>>>(result_arr);

    // fetch results
    cudaMemcpy(host_arr, result_arr, host_arr_size * sizeof(int), cudaMemcpyDeviceToHost);
    // print results
    for (int i = 0; i < host_arr_size; ++i){
        std::cout << host_arr[i] << std::endl;
    }

    // Tidy Up
    cudaUnbindTexture(tex_ref);
    cudaFreeHost(host_arr);
    cudaFreeArray(cuda_array);
    cudaFree(result_arr);
}

预期成绩：

Host:
0
2
4
6
8
10
12
14
Device:
0
6
12
18
24
30
36
42

实际结果：

Host:
0
2
4
6
8
10
12
14
Device:
0
2
4
6
8
10
12
14

有谁知道究竟出了什么问题？

Answer 1

CUDA API函数调用返回错误代码。您想要检查这些错误代码。特别是当某些地方出现问题时......

您使用相同的数组来存储初始数组数据以及从设备接收结果。由于您没有有效的纹理对象，因此内核启动失败并显示非法地址错误。您没有有效的纹理对象，因为纹理对象的创建失败。内核启动后的第一个API调用就是cudaMemcpy()来获取结果。由于在内核启动期间出现错误，cudaMemcpy()将失败，返回最近的错误而不是执行复制。因此，host_arr缓冲区的内容不会改变，您最终会再次显示原始输入数据。

在qazxsw poi（强调我的）中解释了创建纹理对象失败的原因：

如果cudaResourceDesc :: resType设置为cudaResourceTypeLinear，则cudaResourceDesc :: res :: linear :: devPtr必须设置为有效的设备指针，该指针与cudaDeviceProp :: textureAlignment对齐。 [...]

纹理对象无法引用主机内存。代码中的问题在于：

documentation

您需要在设备内存中分配缓冲区，例如，使用qazxsw poi，在那里复制数据，并创建引用该设备缓冲区的纹理对象。

此外，您的resDesc.res.linear.devPtr = &host_arr;未正确初始化。在您的情况下，只需将其初始化为零就足够了：

cudaMalloc()

如何在cudaTextureObject_t中使用CUDA tex1DFetch？

问题描述投票：0回答：1

1个回答

最新问题

如何在cudaTextureObject_t中使用CUDA tex1DFetch？

问题描述 投票：0回答：1

1个回答

最新问题

问题描述投票：0回答：1