如何在 Python 中将函数指针传递给 cuPy Raw 内核？

Question

我正在使用 cuPy 在 Python 脚本中调用原始 CUDA 内核。我能够在 Python 脚本中加载简单的独立 CUDA 内核，但如果我的 CUDA 内核需要函数指针作为参数，我不知道要使用的语法。如何将函数指针传递给设备函数？

需要函数指针作为参数的示例 CUDA 内核：

extern "C" 
{
    // substraction
    __device__ float substractValues(float a, float b)
    {
        float value = a - b;
        return value;
    }
    typedef float(*FuncPtrSubstraction)(float, float, float, float, float);
    __device__ FuncPtrSubstraction d_ptrSubstraction = substractValues;

    // addition
    __device__ float addValues(float a, float b)
    {
        float sum = a + b;
        return sum;
    }
    typedef float(*FuncPtrAddition)(float, float, float, float, float);
    __device__ FuncPtrAddition d_ptrAddition = addValues;

    // main Kernel
    __global__ void applyMatricesOperation(float (*funcPtrOperation)(float, float),
        const float* A, 
        const float* B, 
        float* C, 
        int rows, 
        int cols) 
    {
        int col = blockIdx.x * blockDim.x + threadIdx.x;
        int row = blockIdx.y * blockDim.y + threadIdx.y;

        if (col < cols && row < rows) 
        {
            int index = row * cols + col;
            C[index] = funcPtrOperation(A[index] , B[index]); // A[index] + B[index]; OR A[index] - B[index]
            printf("computed...\n");
        }
    }
}

更新：我尝试使用cuPy的

RawModule()

加载我的CUDA代码，然后使用

get_global()

在设备端获取我已经定义的函数指针。代码正在运行但是我在尝试读取结果时遇到异常

cudaErrorIllegalAddress：遇到非法内存访问

import cupy as cp

# Define the CUDA kernel code
kernel_code = """
extern "C" {
    __device__ float addValues(float a, float b)
    {
        float sum = a + b;
        return sum;
    }
    typedef float(*FuncPtrAddition)(float, float);
    __device__ FuncPtrAddition d_ptrAddition = addValues;

    __global__ void addMatrices(float (*funcPtrAddValues)(float, float),
        const float* A,
        const float* B,
        float* C,
        int rows,
        int cols)
    {
        int col = blockIdx.x * blockDim.x + threadIdx.x;
        int row = blockIdx.y * blockDim.y + threadIdx.y;
        printf("computed...");

        if (col < cols && row < rows) 
        {
            int index = row * cols + col;
            C[index] = funcPtrAddValues(A[index], B[index]);
        }
    }
}
"""

# Create a RawModule from the kernel code
raw_module = cp.RawModule(code=kernel_code)

# Load the addMatrices kernel from the module
addMatrices_kernel = raw_module.get_function("addMatrices")

# Define the dimensions for your matrix
rows, cols = 3, 3

# Allocate device memory for input and output arrays
A = cp.random.rand(rows, cols).astype(cp.float32)
B = cp.random.rand(rows, cols).astype(cp.float32)
C = cp.empty((rows, cols), dtype=cp.float32)

# Define grid and block dimensions
block_dim = (3, 3)
grid_dim = (rows // block_dim[0], cols // block_dim[1])

# Launch the kernel, passing the function pointer as an argument
funcPtr = raw_module.get_global("d_ptrAddition")
addMatrices_kernel(grid_dim, block_dim, (funcPtr, A, B, C, rows, cols))

# Copy the result back to the host
result = C.get()
print(result)

Answer 1

@talonmies 是对的，你不能将函数指针作为内核参数传递。

但是，如果您可以确保这些内核和设备功能保存在同一个 CUmodule 中（用 CuPy 的术语来说，将它们保存在同一个

RawModule

中）并使用 C++17 来编译它们，模板就可以来救援！

import cupy as cp


code = r"""
__device__ int a_plus_b(const int& a, const int& b) {
    return a + b;
}

__device__ int a_minus_b(const int& a, const int& b) {
    return a - b;
}

template<auto Func>
__global__ void my_kernel(const int* a, const int* b, int* out, int N) {
    unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        out[tid] = Func(a[tid], b[tid]);
    }
}
"""

mod = cp.RawModule(code=code,
                   options=("-std=c++17",),
                   name_expressions=('my_kernel<a_plus_b>', 'my_kernel<a_minus_b>'))
plus_ker = mod.get_function('my_kernel<a_plus_b>')
minus_ker = mod.get_function('my_kernel<a_minus_b>')

a = cp.arange(5, dtype=cp.int32)
b = cp.arange(5, dtype=cp.int32) + 5
c = cp.empty_like(a)
plus_ker((1,), (5,), (a, b, c, 5))
print(f"{c=}")
minus_ker((1,), (5,), (a, b, c, 5))
print(f"{c=}")

输出：

c=array([ 5,  7,  9, 11, 13], dtype=int32)
c=array([-5, -5, -5, -5, -5], dtype=int32)

如何在 Python 中将函数指针传递给 cuPy Raw 内核？

问题描述投票：0回答：1

1个回答

最新问题

如何在 Python 中将函数指针传递给 cuPy Raw 内核？

问题描述 投票：0回答：1

1个回答

最新问题

问题描述投票：0回答：1