如何在 Python 中将函数指针传递给 cuPy Raw 内核?

问题描述 投票:0回答:1

我正在使用 cuPy 在 Python 脚本中调用原始 CUDA 内核。我能够在 Python 脚本中加载简单的独立 CUDA 内核,但如果我的 CUDA 内核需要函数指针作为参数,我不知道要使用的语法。如何将函数指针传递给设备函数?

需要函数指针作为参数的示例 CUDA 内核:

extern "C" 
{
    // substraction
    __device__ float substractValues(float a, float b)
    {
        float value = a - b;
        return value;
    }
    typedef float(*FuncPtrSubstraction)(float, float, float, float, float);
    __device__ FuncPtrSubstraction d_ptrSubstraction = substractValues;

    // addition
    __device__ float addValues(float a, float b)
    {
        float sum = a + b;
        return sum;
    }
    typedef float(*FuncPtrAddition)(float, float, float, float, float);
    __device__ FuncPtrAddition d_ptrAddition = addValues;

    // main Kernel
    __global__ void applyMatricesOperation(float (*funcPtrOperation)(float, float),
        const float* A, 
        const float* B, 
        float* C, 
        int rows, 
        int cols) 
    {
        int col = blockIdx.x * blockDim.x + threadIdx.x;
        int row = blockIdx.y * blockDim.y + threadIdx.y;

        if (col < cols && row < rows) 
        {
            int index = row * cols + col;
            C[index] = funcPtrOperation(A[index] , B[index]); // A[index] + B[index]; OR A[index] - B[index]
            printf("computed...\n");
        }
    }
}

更新:我尝试使用cuPy

RawModule()
加载我的CUDA代码,然后使用
get_global()
在设备端获取我已经定义的函数指针。代码正在运行但是我在尝试读取结果时遇到异常

cudaErrorIllegalAddress:遇到非法内存访问

import cupy as cp

# Define the CUDA kernel code
kernel_code = """
extern "C" {
    __device__ float addValues(float a, float b)
    {
        float sum = a + b;
        return sum;
    }
    typedef float(*FuncPtrAddition)(float, float);
    __device__ FuncPtrAddition d_ptrAddition = addValues;

    __global__ void addMatrices(float (*funcPtrAddValues)(float, float),
        const float* A,
        const float* B,
        float* C,
        int rows,
        int cols)
    {
        int col = blockIdx.x * blockDim.x + threadIdx.x;
        int row = blockIdx.y * blockDim.y + threadIdx.y;
        printf("computed...");

        if (col < cols && row < rows) 
        {
            int index = row * cols + col;
            C[index] = funcPtrAddValues(A[index], B[index]);
        }
    }
}
"""

# Create a RawModule from the kernel code
raw_module = cp.RawModule(code=kernel_code)

# Load the addMatrices kernel from the module
addMatrices_kernel = raw_module.get_function("addMatrices")

# Define the dimensions for your matrix
rows, cols = 3, 3

# Allocate device memory for input and output arrays
A = cp.random.rand(rows, cols).astype(cp.float32)
B = cp.random.rand(rows, cols).astype(cp.float32)
C = cp.empty((rows, cols), dtype=cp.float32)

# Define grid and block dimensions
block_dim = (3, 3)
grid_dim = (rows // block_dim[0], cols // block_dim[1])

# Launch the kernel, passing the function pointer as an argument
funcPtr = raw_module.get_global("d_ptrAddition")
addMatrices_kernel(grid_dim, block_dim, (funcPtr, A, B, C, rows, cols))

# Copy the result back to the host
result = C.get()
print(result)
python cuda cupy
1个回答
0
投票

@talonmies 是对的,你不能将函数指针作为内核参数传递。

但是,如果您可以确保这些内核和设备功能保存在同一个 CUmodule 中(用 CuPy 的术语来说,将它们保存在同一个

RawModule
中)并使用 C++17 来编译它们,模板就可以来救援!

import cupy as cp


code = r"""
__device__ int a_plus_b(const int& a, const int& b) {
    return a + b;
}

__device__ int a_minus_b(const int& a, const int& b) {
    return a - b;
}

template<auto Func>
__global__ void my_kernel(const int* a, const int* b, int* out, int N) {
    unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        out[tid] = Func(a[tid], b[tid]);
    }
}
"""

mod = cp.RawModule(code=code,
                   options=("-std=c++17",),
                   name_expressions=('my_kernel<a_plus_b>', 'my_kernel<a_minus_b>'))
plus_ker = mod.get_function('my_kernel<a_plus_b>')
minus_ker = mod.get_function('my_kernel<a_minus_b>')

a = cp.arange(5, dtype=cp.int32)
b = cp.arange(5, dtype=cp.int32) + 5
c = cp.empty_like(a)
plus_ker((1,), (5,), (a, b, c, 5))
print(f"{c=}")
minus_ker((1,), (5,), (a, b, c, 5))
print(f"{c=}")

输出:

c=array([ 5,  7,  9, 11, 13], dtype=int32)
c=array([-5, -5, -5, -5, -5], dtype=int32)
© www.soinside.com 2019 - 2024. All rights reserved.