我正在使用 cuPy 在 Python 脚本中调用原始 CUDA 内核。我能够在 Python 脚本中加载简单的独立 CUDA 内核,但如果我的 CUDA 内核需要函数指针作为参数,我不知道要使用的语法。如何将函数指针传递给设备函数?
需要函数指针作为参数的示例 CUDA 内核:
extern "C"
{
// substraction
__device__ float substractValues(float a, float b)
{
float value = a - b;
return value;
}
typedef float(*FuncPtrSubstraction)(float, float, float, float, float);
__device__ FuncPtrSubstraction d_ptrSubstraction = substractValues;
// addition
__device__ float addValues(float a, float b)
{
float sum = a + b;
return sum;
}
typedef float(*FuncPtrAddition)(float, float, float, float, float);
__device__ FuncPtrAddition d_ptrAddition = addValues;
// main Kernel
__global__ void applyMatricesOperation(float (*funcPtrOperation)(float, float),
const float* A,
const float* B,
float* C,
int rows,
int cols)
{
int col = blockIdx.x * blockDim.x + threadIdx.x;
int row = blockIdx.y * blockDim.y + threadIdx.y;
if (col < cols && row < rows)
{
int index = row * cols + col;
C[index] = funcPtrOperation(A[index] , B[index]); // A[index] + B[index]; OR A[index] - B[index]
printf("computed...\n");
}
}
}
更新:我尝试使用cuPy的
RawModule()
加载我的CUDA代码,然后使用get_global()
在设备端获取我已经定义的函数指针。代码正在运行但是我在尝试读取结果时遇到异常
cudaErrorIllegalAddress:遇到非法内存访问
import cupy as cp
# Define the CUDA kernel code
kernel_code = """
extern "C" {
__device__ float addValues(float a, float b)
{
float sum = a + b;
return sum;
}
typedef float(*FuncPtrAddition)(float, float);
__device__ FuncPtrAddition d_ptrAddition = addValues;
__global__ void addMatrices(float (*funcPtrAddValues)(float, float),
const float* A,
const float* B,
float* C,
int rows,
int cols)
{
int col = blockIdx.x * blockDim.x + threadIdx.x;
int row = blockIdx.y * blockDim.y + threadIdx.y;
printf("computed...");
if (col < cols && row < rows)
{
int index = row * cols + col;
C[index] = funcPtrAddValues(A[index], B[index]);
}
}
}
"""
# Create a RawModule from the kernel code
raw_module = cp.RawModule(code=kernel_code)
# Load the addMatrices kernel from the module
addMatrices_kernel = raw_module.get_function("addMatrices")
# Define the dimensions for your matrix
rows, cols = 3, 3
# Allocate device memory for input and output arrays
A = cp.random.rand(rows, cols).astype(cp.float32)
B = cp.random.rand(rows, cols).astype(cp.float32)
C = cp.empty((rows, cols), dtype=cp.float32)
# Define grid and block dimensions
block_dim = (3, 3)
grid_dim = (rows // block_dim[0], cols // block_dim[1])
# Launch the kernel, passing the function pointer as an argument
funcPtr = raw_module.get_global("d_ptrAddition")
addMatrices_kernel(grid_dim, block_dim, (funcPtr, A, B, C, rows, cols))
# Copy the result back to the host
result = C.get()
print(result)
@talonmies 是对的,你不能将函数指针作为内核参数传递。
但是,如果您可以确保这些内核和设备功能保存在同一个 CUmodule 中(用 CuPy 的术语来说,将它们保存在同一个
RawModule
中)并使用 C++17 来编译它们,模板就可以来救援!
import cupy as cp
code = r"""
__device__ int a_plus_b(const int& a, const int& b) {
return a + b;
}
__device__ int a_minus_b(const int& a, const int& b) {
return a - b;
}
template<auto Func>
__global__ void my_kernel(const int* a, const int* b, int* out, int N) {
unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < N) {
out[tid] = Func(a[tid], b[tid]);
}
}
"""
mod = cp.RawModule(code=code,
options=("-std=c++17",),
name_expressions=('my_kernel<a_plus_b>', 'my_kernel<a_minus_b>'))
plus_ker = mod.get_function('my_kernel<a_plus_b>')
minus_ker = mod.get_function('my_kernel<a_minus_b>')
a = cp.arange(5, dtype=cp.int32)
b = cp.arange(5, dtype=cp.int32) + 5
c = cp.empty_like(a)
plus_ker((1,), (5,), (a, b, c, 5))
print(f"{c=}")
minus_ker((1,), (5,), (a, b, c, 5))
print(f"{c=}")
输出:
c=array([ 5, 7, 9, 11, 13], dtype=int32)
c=array([-5, -5, -5, -5, -5], dtype=int32)