我实现了一个拦截库,拦截驱动程序 API
cuMemAlloc()
和 cuGetProcAddress()
然后转发它。然后我将 LD_LIBRARY_PATH
设置为截取库的路径并运行用运行时 API 编写的矩阵乘法程序。当程序调用cudaMalloc ()
时,内部会调用驱动层APIcuGetProcAddress()
获取cuMemAlloc()
的函数地址来调用cuMemAlloc()
,所以我拦截了cuGetProcAddress,将其获取的cuMemAlloc函数地址改为cuMemAlloc
我的拦截库的函数地址,此时可以拦截该函数,但是通过dlsym
调用原库函数后返回201错误。我怎么解决这个问题?我的拦截过程是否正确?
我的代码如下
#include <stdio.h>
#include <dlfcn.h>
#include <string.h>
#include <stdint.h>
typedef enum cudaError_enum {
CUDA_SUCCESS = 0,
//...
//...Not copied completely from my code
//...
CUDA_ERROR_UNKNOWN = 999
} CUresult;
typedef unsigned long long CUdeviceptr_v2;
typedef CUdeviceptr_v2 CUdeviceptr;
typedef uint64_t cuuint64_t;
char *cuda_filename = "libcuda.so.515.65.01";
CUresult cuMemAlloc(CUdeviceptr *dptr, size_t bytesize){
printf("hijacking cuMemAlloc!\n");
CUresult (*hello)(CUdeviceptr *, size_t);
CUresult ret;
void *table = NULL;
table = dlopen(cuda_filename, RTLD_NOW | RTLD_NODELETE);
if (!table) {
printf("Error can't find library %s", cuda_filename);
}
hello = (CUresult (*)(CUdeviceptr *, size_t))dlsym(table, "cuMemAlloc");
if (!hello){
printf("can't find function cuMemAlloc");
}
ret = hello(dptr, bytesize);
return ret;
}
CUresult cuGetProcAddress(const char *symbol, void **pfn, int cudaVersion, cuuint64_t flags){
//printf("hijacking cuGetProcAddress!\n");
CUresult (*hello)(const char *, void **, int, cuuint64_t);
CUresult ret;
void *table = NULL;
table = dlopen(cuda_filename, RTLD_NOW | RTLD_NODELETE);
if (!table) {
printf("Error can't find library %s", cuda_filename);
}
hello = (CUresult (*)(const char *, void **, int, cuuint64_t))dlsym(table, "cuGetProcAddress");
if (!hello){
printf("can't find function cuGetProcAddress");
}
ret = hello(symbol, pfn, cudaVersion, flags);
if (!strcmp(symbol, "cuGetProcAddress"))
*pfn = cuGetProcAddress;
if (!strcmp(symbol, "cuMemAlloc"))
*pfn = cuMemAlloc;
return ret;
}
gcc hook.c -fPIC -shared -ldl -o libcuda.so.1
export LD_LIBRARY_PATH=$PWD
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <sys/time.h>
#include <stdio.h>
#include <math.h>
const int Row=2048;
const int Col=2048;
__global__
void matrix_mul_gpu(int *M, int* N, int* P, int width)
{
int i = threadIdx.x + blockDim.x * blockIdx.x;
int j = threadIdx.y + blockDim.y * blockIdx.y;
int sum = 0;
for(int k=0;k<width;k++)
{
int a = M[j*width+k];
int b = N[k*width+i];
sum += a*b;
}
P[j*width+i] = sum;
}
int main()
{
cudaError_t cuda_err = cudaSuccess;
printf("func start \n");
int *A = (int *)malloc(sizeof(int) * Row * Col);
int *B = (int *)malloc(sizeof(int) * Row * Col);
int *C = (int *)malloc(sizeof(int) * Row * Col);
//malloc device memory
int *d_dataA, *d_dataB, *d_dataC;
printf("before cudaMalloc()\n");
cuda_err = cudaMalloc((void**)&d_dataA, sizeof(int) *Row*Col);
//cuda_err = cudaGetLastError();
if (cudaSuccess != cuda_err)
{
fprintf(stderr, "(%s:%s:%d)", __FILE__, __FUNCTION__, __LINE__);
fprintf(stderr, "%s\n", cudaGetErrorString(cuda_err));
printf("cuda_err is %d\n", cuda_err);
exit(1);
}
printf("after cudaMalloc()\n");
cudaMalloc((void**)&d_dataB, sizeof(int) *Row*Col);
cudaMalloc((void**)&d_dataC, sizeof(int) *Row*Col);
//set value
for (int i = 0; i < Row*Col; i++) {
A[i] = 90;
B[i] = 10;
}
cudaMemcpy(d_dataA, A, sizeof(int) * Row * Col, cudaMemcpyHostToDevice);
cudaMemcpy(d_dataB, B, sizeof(int) * Row * Col, cudaMemcpyHostToDevice);
dim3 threadPerBlock(16, 16);
dim3 blockNumber((Col+threadPerBlock.x-1)/ threadPerBlock.x, (Row+threadPerBlock.y-1)/ threadPerBlock.y );
matrix_mul_gpu << <blockNumber, threadPerBlock >> > (d_dataA, d_dataB, d_dataC, Col);
cudaDeviceSynchronize();
cudaMemcpy(C, d_dataC, sizeof(int) * Row * Col, cudaMemcpyDeviceToHost);
free(A);
free(B);
free(C);
cudaFree(d_dataA);
cudaFree(d_dataB);
cudaFree(d_dataC);
return 0;
}
nvcc matri.cu -o matri.out
./matri.out
func start
before cudaMalloc()
hijacking cuMemAlloc!
(matri.cu:main:40)invalid device context
cuda_err is 201
我也遇到了同样的问题。 然后我尝试挂钩 cuMemAlloc_v2 而不是 cuMemAlloc,它似乎有效。 但我不知道为什么它有效。 哈哈