让我们考虑一下CUDA's Mersenne Twister for an arbitrary number of threads上的CUDA代码,并假设我想将它转换为pyCUDA
应用程序。
我知道我可以使用ctypes
和CDLL
,即
cudart = CDLL("/usr/local/cuda/lib64/libcudart.so")
使用cudart
例程。
但是,我还需要分配一个curandStateMtgp32
数组,其定义在curand_mtgp32.h
中,或者调用
curandMakeMTGP32Constants(mtgp32dc_params_fast_11213, devKernelParams);
并使用mtgp32dc_params_fast_11213
,其定义在curand_mtgp32_host.h
。
如何处理CUDA
中的pyCUDA
类型定义和值?
我参考设备端API解决了这个问题,如下所示:
.dll
:用于设置Mersenne Twister Generator的MTGP32Setup()
和用于生成随机数的MTGP32Generation()
;ctypes
调用了上面的函数。.dll
的源代码
// --- Generate random numbers with cuRAND's Mersenne Twister
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <time.h>
#include <cuda.h>
#include <curand_kernel.h>
/* include MTGP host helper functions */
#include <curand_mtgp32_host.h>
#define BLOCKSIZE 256
#define GRIDSIZE 64
curandStateMtgp32 *devMTGPStates;
/********************/
/* CUDA ERROR CHECK */
/********************/
// --- Credit to http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api
void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
if (code != cudaSuccess)
{
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) { exit(code); }
}
}
void gpuErrchk(cudaError_t ans) { gpuAssert((ans), __FILE__, __LINE__); }
/*************************/
/* CURAND ERROR CHECKING */
/*************************/
static const char *_curandGetErrorEnum(curandStatus_t error)
{
switch (error)
{
case CURAND_STATUS_SUCCESS:
return "CURAND_SUCCESS";
case CURAND_STATUS_VERSION_MISMATCH:
return "CURAND_STATUS_VERSION_MISMATCH";
case CURAND_STATUS_NOT_INITIALIZED:
return "CURAND_STATUS_NOT_INITIALIZED";
case CURAND_STATUS_ALLOCATION_FAILED:
return "CURAND_STATUS_ALLOCATION_FAILED";
case CURAND_STATUS_TYPE_ERROR:
return "CURAND_STATUS_TYPE_ERROR";
case CURAND_STATUS_OUT_OF_RANGE:
return "CURAND_STATUS_OUT_OF_RANGE";
case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";
case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";
case CURAND_STATUS_LAUNCH_FAILURE:
return "CURAND_STATUS_LAUNCH_FAILURE";
case CURAND_STATUS_PREEXISTING_FAILURE:
return "CURAND_STATUS_PREEXISTING_FAILURE";
case CURAND_STATUS_INITIALIZATION_FAILED:
return "CURAND_STATUS_INITIALIZATION_FAILED";
case CURAND_STATUS_ARCH_MISMATCH:
return "CURAND_STATUS_ARCH_MISMATCH";
case CURAND_STATUS_INTERNAL_ERROR:
return "CURAND_STATUS_INTERNAL_ERROR";
}
return "<unknown>";
}
inline void __curandSafeCall(curandStatus_t err, const char *file, const int line)
{
if (CURAND_STATUS_SUCCESS != err) {
fprintf(stderr, "CURAND error in file '%s', line %d, error: %s \nterminating!\n", __FILE__, __LINE__, \
_curandGetErrorEnum(err)); \
assert(0); \
}
}
void curandSafeCall(curandStatus_t err) { __curandSafeCall(err, __FILE__, __LINE__); }
/*******************/
/* iDivUp FUNCTION */
/*******************/
__host__ __device__ int iDivUp(int a, int b) { return ((a % b) != 0) ? (a / b + 1) : (a / b); }
/*********************/
/* GENERATION KERNEL */
/*********************/
__global__ void generate_kernel(curandStateMtgp32 * __restrict__ state, float * __restrict__ result, const int N)
{
int tid = threadIdx.x + blockIdx.x * blockDim.x;
for (int k = tid; k < N; k += blockDim.x * gridDim.x)
result[k] = curand_uniform(&state[blockIdx.x]);
}
extern "C" {
/**************************/
/* MERSENNE TWISTER SETUP */
/**************************/
__declspec(dllexport)
void MTGP32Setup() {
// --- Setup the pseudorandom number generator
gpuErrchk(cudaMalloc(&devMTGPStates, GRIDSIZE * sizeof(curandStateMtgp32)));
mtgp32_kernel_params *devKernelParams; gpuErrchk(cudaMalloc(&devKernelParams, sizeof(mtgp32_kernel_params)));
curandSafeCall(curandMakeMTGP32Constants(mtgp32dc_params_fast_11213, devKernelParams));
curandSafeCall(curandMakeMTGP32KernelState(devMTGPStates, mtgp32dc_params_fast_11213, devKernelParams, GRIDSIZE, time(NULL)));
}
/*******************************/
/* MERSENNE TWISTER GENERATION */
/*******************************/
__declspec(dllexport)
void MTGP32Generation(float * __restrict__ devResults, const int N) {
// --- Generate pseudo-random sequence and copy to the host
generate_kernel << <GRIDSIZE, BLOCKSIZE >> > (devMTGPStates, devResults, N);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
}
} //
PyCUDA
来电的源代码
import os
import sys
import numpy as np
import ctypes
from ctypes import *
import pycuda.driver as drv
import pycuda.gpuarray as gpuarray
import pycuda.autoinit
lib = cdll.LoadLibrary('D:\\Project\\cuRAND\\mersenneTwisterDLL\\x64\\Release\\mersenneTwisterDLL.dll')
N = 10
d_x = gpuarray.zeros((N, 1), dtype = np.float32)
lib.MTGP32Setup()
lib.MTGP32Generation(ctypes.cast(d_x.ptr, POINTER(c_float)), N)
print(d_x)
主机端API可以用类似于Calling host functions in PyCUDA的方式处理。