在 MATLAB MEX 文件中调用 CUDA 代码实现矩阵乘法,但结果全为 0。有人知道原因吗?

问题描述 投票:0回答:1

我在 MATLAB MEX 文件中调用 CUDA 代码编写了一个函数,用于计算矩阵与矩阵的乘法。当我输入大小分别为 M×N 和 N×K 的两个矩阵时,得到的 M×K 结果矩阵全为 0,但我不知道原因。有人可以帮助我吗? 我使用 MATLAB R2022b、Visual Studio 2019 和 CUDA 11.2。 代码如下。 通过

% Step 1: compile the CUDA source into an object file (naiveMatMul.obj) with
% nvcc, pointing -ccbin at the MSVC toolchain directory used as host compiler.
system('nvcc -c naiveMatMul.cu -ccbin "E:\Program Files (x86)\vs2019\VC\Tools\MSVC\14.29.30133\bin')
% Step 2: build the MEX binary from the C++ gateway and the object file above,
% linking the CUDA runtime (cudart) from the CUDA v11.2 x64 library directory.
mex naiveMatMulCuda.cpp naiveMatMul.obj -lcudart -L"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.2\lib\x64"

编译代码
/*naiveMatMul.h*/
#ifndef __NAIVEMATMUL_H__
#define __NAIVEMATMUL_H__
// Host-side entry point implemented in naiveMatMul.cu.
//
// A, B and C are host pointers: the implementation copies A (M*N floats) and
// B (N*K floats) to the device, launches the multiplication kernel, and copies
// M*K floats back into C. The kernel indexes all three buffers as row-major
// (A[row * N + i], B[i * K + col], C[row * K + col]), and is intended to
// produce the product of an M x N matrix and an N x K matrix.
extern void naiveMatMulCu(float* A,float* B,float* C,int M,int N,int K);
#endif
/*naiveMatMul.cu*/
#include"naiveMatMul.h"
#include <cstdio>
#define BLOCK_SIZE 16
//#include"mex.h"
// Naive matrix-multiply kernel: C = A * B, one thread per element of C.
//
// Layout (all buffers are dense, row-major, in device memory):
//   A is M x N, element (r, i) stored at A[r * N + i]
//   B is N x K, element (i, c) stored at B[i * K + c]
//   C is M x K, element (r, c) stored at C[r * K + c]
//
// Launch: any 2D grid of 2D blocks that supplies at least K threads along x
// (columns of C) and at least M threads along y (rows of C). Threads that fall
// outside the M x K output do nothing. No shared memory is used.
__global__ void naiveMatMul(float *A,float *B, float *C, int M, int N, int K)
{
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    // The grid is rounded up to whole blocks, so guard the ragged edge.
    if (row >= M || col >= K)
    {
        return;
    }

    // Accumulate in float: an integer accumulator would truncate every
    // partial sum (and yields all zeros for inputs in [0, 1)).
    float sum = 0.0f;

    // Walk the shared dimension N (columns of A / rows of B).
    for (int i = 0; i < N; ++i)
    {
        sum += A[row * N + i] * B[i * K + col];
    }
    C[row * K + col] = sum;
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool naiveMatMulCudaOk(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
    {
        return true;
    }
    std::fprintf(stderr, "naiveMatMulCu: %s failed: %s\n",
                 what, cudaGetErrorString(status));
    return false;
}

// Host wrapper: computes C = A * B on the GPU.
//
// Parameters (all host pointers, dense row-major float data):
//   A  input,  M x N
//   B  input,  N x K
//   C  output, M x K (overwritten on success)
//   M, N, K  matrix dimensions
//
// Behaviour:
//   * If M or K is not positive the output is empty and nothing is done.
//   * If N is not positive the product is an empty sum, so C is zero-filled.
//   * Every CUDA call is checked; on failure a message is written to stderr,
//     C is left as the caller provided it, and all device buffers that were
//     allocated are still released.
void naiveMatMulCu(float* A,float* B,float* C,int M,int N,int K){
    // Empty output: a zero-sized grid would be an invalid launch configuration.
    if (M <= 0 || K <= 0)
    {
        return;
    }

    const size_t countC = static_cast<size_t>(M) * static_cast<size_t>(K);

    // Empty inner dimension: every element of C is an empty sum.
    if (N <= 0)
    {
        for (size_t idx = 0; idx < countC; ++idx)
        {
            C[idx] = 0.0f;
        }
        return;
    }

    // Compute byte counts in size_t so large matrices do not overflow int.
    const size_t bytesA = sizeof(float) * static_cast<size_t>(M) * static_cast<size_t>(N);
    const size_t bytesB = sizeof(float) * static_cast<size_t>(N) * static_cast<size_t>(K);
    const size_t bytesC = sizeof(float) * countC;

    // Null-initialised so the unconditional cudaFree calls below are safe
    // even when an allocation failed part-way through.
    float *deviceA = nullptr;
    float *deviceB = nullptr;
    float *deviceC = nullptr;

    // Allocate device memory and upload the inputs; stop at the first failure.
    bool ok = naiveMatMulCudaOk(cudaMalloc(&deviceA, bytesA), "cudaMalloc(A)")
           && naiveMatMulCudaOk(cudaMalloc(&deviceB, bytesB), "cudaMalloc(B)")
           && naiveMatMulCudaOk(cudaMalloc(&deviceC, bytesC), "cudaMalloc(C)")
           && naiveMatMulCudaOk(cudaMemcpy(deviceA, A, bytesA, cudaMemcpyHostToDevice),
                                "cudaMemcpy(A, host to device)")
           && naiveMatMulCudaOk(cudaMemcpy(deviceB, B, bytesB, cudaMemcpyHostToDevice),
                                "cudaMemcpy(B, host to device)");

    if (ok)
    {
        // One thread per element of C; round the grid up to whole blocks.
        unsigned int grid_rows = (M + BLOCK_SIZE - 1) / BLOCK_SIZE;
        unsigned int grid_cols = (K + BLOCK_SIZE - 1) / BLOCK_SIZE;
        dim3 dimGrid(grid_cols, grid_rows);
        dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
        naiveMatMul<<<dimGrid, dimBlock>>>(deviceA, deviceB, deviceC, M, N, K);

        // Kernel launches return no status; launch-configuration errors must
        // be fetched explicitly.
        ok = naiveMatMulCudaOk(cudaGetLastError(), "naiveMatMul kernel launch");
    }

    if (ok)
    {
        // Blocking copy: waits for the kernel and surfaces any error raised
        // while it was executing.
        naiveMatMulCudaOk(cudaMemcpy(C, deviceC, bytesC, cudaMemcpyDeviceToHost),
                          "cudaMemcpy(C, device to host)");
    }

    // Release device memory on every path (cudaFree(nullptr) is a no-op).
    cudaFree(deviceA);
    cudaFree(deviceB);
    cudaFree(deviceC);
}
/*naiveMatMulCuda.cpp*/
#include"mex.h"
#include"naiveMatMul.h"
void mexFunction(int nlhs,mxArray *plhs[],int nrhs,mxArray *prhs[]){
    // error detection
    if(nrhs != 2)
        mexErrMsgTxt("Invalid number of inputs");
    if(nlhs != 1)
        mexErrMsgTxt("Invalid number of outputs");
    if(!mxIsSingle(prhs[0]) ||!mxIsSingle(prhs[1]))
        mexErrMsgTxt("Input data must be single");
    // matrix size
    int numRowsA = (int)mxGetM(prhs[0]);
    int numColsA = (int)mxGetN(prhs[0]);
    int numRowsB = (int)mxGetM(prhs[1]);
    int numColsB = (int)mxGetN(prhs[1]);
    if( numColsA != numRowsB ) 
        mexErrMsgTxt("Input data size is wrong");
    // get A,B data
    float *A = (float*)mxGetData(prhs[0]);
    float *B = (float*)mxGetData(prhs[1]);
    plhs[0] = mxCreateNumericMatrix(numRowsA,
                                    numColsB,
                                    mxSINGLE_CLASS,
                                    mxREAL);
    float* C = (float*)mxGetData(plhs[0]);
    //compute
    naiveMatMulCu(A,B,C, numRowsA,numColsA,numColsB);
}

我想知道为什么输出是0,并得到正确的代码。

matlab cuda mex matlab-compiler cuda-gdb
1个回答
0
投票

请阅读文档

mxGetData
:

使用

mxGetData
仅获取非数字数组的数据元素。

您应该改用

mxGetSingles
来代替。

© www.soinside.com 2019 - 2024. All rights reserved.