如何在 PyCUDA 中使用共享内存?LogicError: cuModuleLoadDataEx failed: 遇到非法内存访问

问题描述 投票:0回答:1

我试图了解如何使用 PyCuda 处理共享内存。运行此代码来翻转输入向量:

import pycuda.driver as drv
import pycuda.gpuarray as gpuarray
import pycuda.autoinit
from pycuda.compiler import SourceModule
import numpy as np

# Reproduction script from the question: tries to reverse a length-n int32
# vector on the GPU, staging elements in a dynamic shared-memory array.

# Number of elements in the test vector.
n = 20

# Random host input and a zero-filled output buffer of the same shape.
input = np.random.randint(10, size=(n))
output = np.zeros_like(input)

# Cast both host arrays to int32 so they match the kernel's `int*` parameters.
input = input.astype(np.int32)
output = output.astype(np.int32)

# Compile the kernel. Each in-range thread stores its input element at the
# mirrored slot `sData[blockDim.x - 1 - threadIdx.x]`, and every thread then
# writes `sData[threadIdx.x]` to the mirrored block position in `out`.
mod = SourceModule(
'''
  __global__ void flipVectorSM(int* in, int* out, int n) {
    extern __shared__ int sData[];
    int inOffSet = blockDim.x * blockIdx.x;
    int index = inOffSet + threadIdx.x;
    if (index < n) {
        sData[blockDim.x - 1 - threadIdx.x] = in[index];
        __syncthreads();
    }
    int outOffSet = blockDim.x * (gridDim.x - 1 - blockIdx.x);
    int outIndex = outOffSet + threadIdx.x;
    out[outIndex] = sData[threadIdx.x];
  }
'''
)

flip = mod.get_function('flipVectorSM')
# Launch a single block of 4 threads, so only the first 4 of the n = 20
# elements are touched by this configuration.
# NOTE(review): PyCUDA's `shared` argument is presumably a size in BYTES, not
# an element count -- confirm against the PyCUDA docs. The kernel indexes
# sData[0 .. blockDim.x - 1] as int, so 4 threads would need
# 4 * sizeof(int) = 16 bytes rather than 4.
# NOTE(review): the traceback in the question is raised while loading the
# module, not at this launch; possibly a leftover error from an earlier
# launch in the same CUDA context -- verify by restarting the runtime.
flip(drv.In(input), drv.InOut(output), np.int32(n), block=(4, 1, 1), grid=(1, 1), shared=4)

我收到此错误:

---------------------------------------------------------------------------
LogicError                                Traceback (most recent call last)
<ipython-input-114-5b681ffa31fc> in <cell line: 15>()
     13 output = output.astype(np.int32)
     14 
---> 15 mod = SourceModule(
     16 '''
     17   __global__ void flipVectorSM(int* in, int* out, int n) {

/usr/local/lib/python3.10/dist-packages/pycuda/compiler.py in __init__(self, source, nvcc, options, keep, no_extern_c, arch, code, cache_dir, include_dirs)
    367         from pycuda.driver import module_from_buffer
    368 
--> 369         self.module = module_from_buffer(cubin)
    370 
    371         self._bind_module()

LogicError: cuModuleLoadDataEx failed: an illegal memory access was encountered - 

我参考了一段关于全局内存和共享内存的示例代码;同样的内核用 CUDA C 编写时可以正常运行。我该如何解决这个问题?

python cuda gpu gpgpu pycuda
1个回答
0
投票

好的,这个配置可以工作,谢谢。

import pycuda.driver as drv
import pycuda.gpuarray as gpuarray
import pycuda.autoinit
from pycuda.compiler import SourceModule
import numpy as np

# Reverse a length-n int32 vector on the GPU, staging each block's tile in
# dynamically sized shared memory.

# Number of elements in the test vector.
n = 20
# Threads per block; also the number of ints each block stages in shared memory.
threads_per_block = 32
# Ceiling division so that every element is covered by some thread.
num_blocks = (n + threads_per_block - 1) // threads_per_block

# Host buffers: random int32 input and a zero-filled int32 output.
input = np.random.randint(10, size=n).astype(np.int32)
output = np.zeros_like(input)

mod = SourceModule(
'''
  __global__ void flipVectorSM(int* in, int* out, int n) {
    extern __shared__ int sData[];
    int tid = threadIdx.x;
    int inOffSet = blockDim.x * blockIdx.x;
    // Number of valid elements in this block's tile (the last tile may be
    // partial; a tile entirely past the end yields count <= 0).
    int count = min((int)blockDim.x, n - inOffSet);
    if (tid < count) {
        // Store the tile reversed inside shared memory.
        sData[count - 1 - tid] = in[inOffSet + tid];
    }
    // Barrier must be reached by every thread of the block, so it lives
    // outside the bounds check.
    __syncthreads();
    if (tid < count) {
        // The tile covering in[inOffSet .. inOffSet + count) lands at
        // out[n - inOffSet - count .. n - inOffSet).
        out[n - inOffSet - count + tid] = sData[tid];
    }
  }
'''
)

flip = mod.get_function('flipVectorSM')
# `shared` is a size in bytes: one int per thread of the block.
shared_bytes = threads_per_block * input.dtype.itemsize
flip(
    drv.In(input),
    drv.InOut(output),
    np.int32(n),
    block=(threads_per_block, 1, 1),
    grid=(num_blocks, 1),
    shared=shared_bytes,
)

print("Input vector:")
print(input)
print("\nOutput vector:")
print(output)
© www.soinside.com 2019 - 2024. All rights reserved.