Intercepting the CUDA memory-management APIs under CUDA 11.7 fails with an invalid device context (201) error

Problem description

I implemented an interception library that hooks the driver APIs cuMemAlloc() and cuGetProcAddress() and forwards them to the real driver. I then set LD_LIBRARY_PATH to the directory containing the interception library and ran a matrix multiplication program written with the runtime API. When the program calls cudaMalloc(), the runtime internally calls the driver-level API cuGetProcAddress() to obtain the address of cuMemAlloc() and then calls cuMemAlloc() through that pointer. So in my hooked cuGetProcAddress I replace the cuMemAlloc address it hands back with the address of my own library's cuMemAlloc wrapper. The interception itself works, but after my wrapper calls the original library function obtained via dlsym, a 201 error is returned. How can I solve this? Is my interception process correct? My code is below.

  • Hook library
#include <stdio.h>
#include <dlfcn.h>
#include <string.h>
#include <stdint.h>

typedef enum cudaError_enum {
    CUDA_SUCCESS                              = 0,
    //...
    //...Not copied completely from my code
    //...
    CUDA_ERROR_UNKNOWN                        = 999
} CUresult;

typedef unsigned long long CUdeviceptr_v2;
typedef CUdeviceptr_v2 CUdeviceptr;
typedef uint64_t cuuint64_t;
char *cuda_filename = "libcuda.so.515.65.01";


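/* Replacement for the driver's cuMemAlloc: log the call, then look up and
 * call the real implementation from the driver library. */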
CUresult cuMemAlloc(CUdeviceptr *dptr, size_t bytesize){
        printf("hijacking cuMemAlloc!\n");
        CUresult (*hello)(CUdeviceptr *, size_t);
        CUresult ret;
        void *table = NULL;

        table = dlopen(cuda_filename, RTLD_NOW | RTLD_NODELETE);
        if (!table) {
                printf("Error can't find library %s", cuda_filename);
        }
        hello = (CUresult (*)(CUdeviceptr *, size_t))dlsym(table, "cuMemAlloc");
        if (!hello){
                printf("can't find function cuMemAlloc");
        }
        ret = hello(dptr, bytesize);


        return ret;
}


CUresult cuGetProcAddress(const char *symbol, void **pfn, int cudaVersion, cuuint64_t flags){
        //printf("hijacking cuGetProcAddress!\n");
        CUresult (*hello)(const char *, void **, int, cuuint64_t);
        CUresult ret;
        void *table = NULL;

        table = dlopen(cuda_filename, RTLD_NOW | RTLD_NODELETE);
        if (!table) {
                printf("Error can't find library %s", cuda_filename);
        }
        hello = (CUresult (*)(const char *, void **, int, cuuint64_t))dlsym(table, "cuGetProcAddress");
        if (!hello){
                printf("can't find function cuGetProcAddress");
        }
        ret = hello(symbol, pfn, cudaVersion, flags);
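        /* Swap the entry points returned by the driver for this library's own wrappers. */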
        if (!strcmp(symbol, "cuGetProcAddress"))
                *pfn = cuGetProcAddress;
        if (!strcmp(symbol, "cuMemAlloc"))
                *pfn = cuMemAlloc;

        return ret;
}

  • Compile and export
gcc hook.c -fPIC -shared -ldl -o libcuda.so.1
export LD_LIBRARY_PATH=$PWD
  • Runtime API application
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <sys/time.h>
#include <stdio.h>
#include <math.h>

const int Row=2048;
const int Col=2048;

__global__
void matrix_mul_gpu(int *M, int* N, int* P, int width)
{
    int i = threadIdx.x + blockDim.x * blockIdx.x;
    int j = threadIdx.y + blockDim.y * blockIdx.y;

    int sum = 0;
    for(int k=0;k<width;k++)
    {
        int a = M[j*width+k];
        int b = N[k*width+i];
        sum += a*b;
    }
    P[j*width+i] = sum;
}

int main()
{
    cudaError_t cuda_err = cudaSuccess;
    printf("func start \n");
    int *A = (int *)malloc(sizeof(int) * Row * Col);
    int *B = (int *)malloc(sizeof(int) * Row * Col);
    int *C = (int *)malloc(sizeof(int) * Row * Col);
    //malloc device memory
    int *d_dataA, *d_dataB, *d_dataC;
    printf("before cudaMalloc()\n");
    cuda_err = cudaMalloc((void**)&d_dataA, sizeof(int) * Row * Col);
    //cuda_err = cudaGetLastError();
    if (cudaSuccess != cuda_err)
    {
        fprintf(stderr, "(%s:%s:%d)", __FILE__, __FUNCTION__, __LINE__);
        fprintf(stderr, "%s\n", cudaGetErrorString(cuda_err));
        printf("cuda_err is %d\n", cuda_err);
        exit(1);
    }
    printf("after cudaMalloc()\n");
    cudaMalloc((void**)&d_dataB, sizeof(int) * Row * Col);
    cudaMalloc((void**)&d_dataC, sizeof(int) * Row * Col);
    //set value
    for (int i = 0; i < Row * Col; i++) {
        A[i] = 90;
        B[i] = 10;
    }

    cudaMemcpy(d_dataA, A, sizeof(int) * Row * Col, cudaMemcpyHostToDevice);
    cudaMemcpy(d_dataB, B, sizeof(int) * Row * Col, cudaMemcpyHostToDevice);
    dim3 threadPerBlock(16, 16);
    dim3 blockNumber((Col + threadPerBlock.x - 1) / threadPerBlock.x, (Row + threadPerBlock.y - 1) / threadPerBlock.y);
    matrix_mul_gpu<<<blockNumber, threadPerBlock>>>(d_dataA, d_dataB, d_dataC, Col);
    cudaDeviceSynchronize();
    cudaMemcpy(C, d_dataC, sizeof(int) * Row * Col, cudaMemcpyDeviceToHost);
    free(A);
    free(B);
    free(C);
    cudaFree(d_dataA);
    cudaFree(d_dataB);
    cudaFree(d_dataC);

    return 0;
}

  • Error output
nvcc matri.cu -o matri.out
./matri.out

func start
before cudaMalloc()
hijacking cuMemAlloc!
(matri.cu:main:40)invalid device context
cuda_err is 201
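
To check whether the pointer handed out by cuGetProcAddress actually matches what dlsym returns, a small standalone probe can compare the two. This is only a sketch and assumes the same driver file name used by the hook library above; build it with gcc and -ldl:

#include <stdio.h>
#include <dlfcn.h>
#include <stdint.h>

int main(void)
{
        /* Open the real driver library directly, bypassing the hook. */
        void *lib = dlopen("libcuda.so.515.65.01", RTLD_NOW);
        if (!lib) {
                printf("dlopen failed: %s\n", dlerror());
                return 1;
        }
        int (*getproc)(const char *, void **, int, uint64_t) =
                (int (*)(const char *, void **, int, uint64_t))dlsym(lib, "cuGetProcAddress");
        void *from_getproc = NULL;
        int ret = -1;
        if (getproc)
                ret = getproc("cuMemAlloc", &from_getproc, 11070, 0);
        /* Compare the versioned entry point with the plain and _v2 symbols. */
        printf("cuGetProcAddress(\"cuMemAlloc\", 11070) -> %d, pfn = %p\n", ret, from_getproc);
        printf("dlsym(\"cuMemAlloc\")    = %p\n", dlsym(lib, "cuMemAlloc"));
        printf("dlsym(\"cuMemAlloc_v2\") = %p\n", dlsym(lib, "cuMemAlloc_v2"));
        return 0;
}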
1 Answer

I ran into the same problem. I then tried hooking cuMemAlloc_v2 instead of cuMemAlloc, and it seems to work. But I don't know why it works. Haha
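
For reference, a minimal sketch of that change applied to the wrapper from the question (it assumes the same CUresult/CUdeviceptr typedefs and cuda_filename as the hook library above); the only difference is that dlsym looks up the versioned cuMemAlloc_v2 symbol rather than the plain cuMemAlloc:

/* Sketch: same wrapper as in the question, but forwarding to cuMemAlloc_v2. */
CUresult cuMemAlloc(CUdeviceptr *dptr, size_t bytesize){
        printf("hijacking cuMemAlloc!\n");
        CUresult (*real_alloc)(CUdeviceptr *, size_t);
        void *table = dlopen(cuda_filename, RTLD_NOW | RTLD_NODELETE);
        if (!table) {
                printf("Error: can't find library %s\n", cuda_filename);
                return CUDA_ERROR_UNKNOWN;
        }
        /* Look up the versioned symbol instead of the legacy cuMemAlloc. */
        real_alloc = (CUresult (*)(CUdeviceptr *, size_t))dlsym(table, "cuMemAlloc_v2");
        if (!real_alloc) {
                printf("can't find function cuMemAlloc_v2\n");
                return CUDA_ERROR_UNKNOWN;
        }
        return real_alloc(dptr, bytesize);
}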
