我用 CUDA 在 MATLAB 的 MEX 文件中编写了一段代码。该函数用于计算矩阵与矩阵的乘法:当我输入大小分别为 M×N 和 N×K 的两个矩阵时,得到的 M×K 结果矩阵全为 0,但我不知道原因。有人可以帮助我吗? 我使用的是 MATLAB R2022b、Visual Studio 2019 和 CUDA 11.2。 代码如下。 通过
system('nvcc -c naiveMatMul.cu -ccbin "E:\Program Files (x86)\vs2019\VC\Tools\MSVC\14.29.30133\bin"')
和mex naiveMatMulCuda.cpp naiveMatMul.obj -lcudart -L"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.2\lib\x64"
编译代码
/*naiveMatMul.h*/
#ifndef __NAIVEMATMUL_H__
#define __NAIVEMATMUL_H__
/*
 * Host entry point for the GPU matrix product C = A * B.
 *
 * A, B and C are host pointers; the definition copies A and B to the device,
 * launches the multiplication kernel and copies the result back into C.
 * The kernel indexes its operands as A[row * N + i], B[i * K + col] and
 * C[row * K + col], i.e. row-major with A being M x N, B being N x K and
 * C being M x K.
 * NOTE(review): MATLAB stores matrices column-major, so a MEX caller has to
 * account for the layout difference - confirm at the call site.
 */
extern void naiveMatMulCu(float* A,float* B,float* C,int M,int N,int K);
#endif
/*naiveMatMul.cu*/
#include <cstdio>

#include"naiveMatMul.h"
#define BLOCK_SIZE 16
//#include"mex.h"
/*
 * Naive dense matrix multiplication kernel: C = A * B, one thread per
 * element of C.
 *
 * All matrices are indexed row-major: A is M x N, B is N x K, C is M x K.
 * Expects a 2D launch in which the x dimension covers the K columns of C and
 * the y dimension covers its M rows; threads that fall outside C do nothing.
 * Uses no shared memory.
 */
__global__ void naiveMatMul(float *A,float *B, float *C, int M, int N, int K)
{
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (col < K && row < M)
    {
        // Accumulate in float. An int accumulator truncates the running sum
        // after every addition, which produces all zeros whenever the
        // individual products are smaller than 1.
        float sum = 0.0f;
        // The dot product runs over the shared (inner) dimension N, not M.
        for (int i = 0; i < N; i++)
        {
            sum += A[row * N + i] * B[i * K + col];
        }
        C[row * K + col] = sum;
    }
}
/*
 * Host wrapper that computes C = A * B on the GPU.
 *
 * A (M x N), B (N x K) and C (M x K) are host buffers, indexed row-major by
 * the kernel. The function allocates device buffers, uploads A and B,
 * launches naiveMatMul on a grid of BLOCK_SIZE x BLOCK_SIZE thread blocks
 * and downloads the result into C.
 *
 * If any CUDA call fails, the error string is written to stderr, C is left
 * untouched and all device buffers are released.
 */
void naiveMatMulCu(float* A,float* B,float* C,int M,int N,int K){
    // An empty result needs no work, and a zero-sized grid is an invalid
    // launch configuration.
    if (M <= 0 || K <= 0)
        return;

    const size_t elemsC = static_cast<size_t>(M) * static_cast<size_t>(K);

    // An empty inner dimension makes every dot product an empty sum.
    if (N <= 0) {
        for (size_t idx = 0; idx < elemsC; ++idx)
            C[idx] = 0.0f;
        return;
    }

    const size_t bytesA = static_cast<size_t>(M) * static_cast<size_t>(N) * sizeof(float);
    const size_t bytesB = static_cast<size_t>(N) * static_cast<size_t>(K) * sizeof(float);
    const size_t bytesC = elemsC * sizeof(float);

    // Null-initialised so the cleanup below is valid after a partial failure.
    float *deviceA = nullptr, *deviceB = nullptr, *deviceC = nullptr;

    // malloc memory
    cudaError_t err = cudaMalloc(&deviceA, bytesA);
    if (err == cudaSuccess)
        err = cudaMalloc(&deviceB, bytesB);
    if (err == cudaSuccess)
        err = cudaMalloc(&deviceC, bytesC);

    // CPU(A,B) --> GPU(A,B)
    if (err == cudaSuccess)
        err = cudaMemcpy(deviceA, A, bytesA, cudaMemcpyHostToDevice);
    if (err == cudaSuccess)
        err = cudaMemcpy(deviceB, B, bytesB, cudaMemcpyHostToDevice);

    // compute
    if (err == cudaSuccess) {
        unsigned int grid_rows = (M + BLOCK_SIZE - 1) / BLOCK_SIZE;
        unsigned int grid_cols = (K + BLOCK_SIZE - 1) / BLOCK_SIZE;
        dim3 dimGrid(grid_cols, grid_rows);
        dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
        naiveMatMul<<<dimGrid, dimBlock>>>(deviceA, deviceB, deviceC, M, N, K);
        // A kernel launch returns nothing; configuration errors surface here.
        err = cudaGetLastError();
    }

    // GPU(C)-->CPU(C)
    // The blocking copy also reports faults raised while the kernel was running.
    if (err == cudaSuccess)
        err = cudaMemcpy(C, deviceC, bytesC, cudaMemcpyDeviceToHost);

    if (err != cudaSuccess)
        fprintf(stderr, "naiveMatMulCu: CUDA error: %s\n", cudaGetErrorString(err));

    // free memory (cudaFree on a null pointer is a no-op)
    cudaFree(deviceA);
    cudaFree(deviceB);
    cudaFree(deviceC);
}
/*naiveMatMulCuda.cpp*/
#include"mex.h"
#include"naiveMatMul.h"
void mexFunction(int nlhs,mxArray *plhs[],int nrhs,mxArray *prhs[]){
// error detection
if(nrhs != 2)
mexErrMsgTxt("Invalid number of inputs");
if(nlhs != 1)
mexErrMsgTxt("Invalid number of outputs");
if(!mxIsSingle(prhs[0]) ||!mxIsSingle(prhs[1]))
mexErrMsgTxt("Input data must be single");
// matrix size
int numRowsA = (int)mxGetM(prhs[0]);
int numColsA = (int)mxGetN(prhs[0]);
int numRowsB = (int)mxGetM(prhs[1]);
int numColsB = (int)mxGetN(prhs[1]);
if( numColsA != numRowsB )
mexErrMsgTxt("Input data size is wrong");
// get A,B data
float *A = (float*)mxGetData(prhs[0]);
float *B = (float*)mxGetData(prhs[1]);
plhs[0] = mxCreateNumericMatrix(numRowsA,
numColsB,
mxSINGLE_CLASS,
mxREAL);
float* C = (float*)mxGetData(plhs[0]);
//compute
naiveMatMulCu(A,B,C, numRowsA,numColsA,numColsB);
}
我想知道为什么输出全为 0,并希望得到正确的代码。