因此,我必须使用 CUDA 的共享内存来分析此矩阵向量乘法,但是,我在 Visual Studio 2022 上执行代码时遇到问题,我收到错误“没有重载函数“atomicAdd”的实例对参数进行数学运算”清单”。
关于它的含义以及如何解决它有什么想法吗?我将非常感激。
#ifndef __CUDACC__
#define __CUDACC__
#endif
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <cuda.h>
#include "cuda_runtime.h"
#include <iostream>
#define TILE_SIZE 16
#define BLOCK_SIZE 256
__global__ void matvec(double* A, double* B, double* C, int n)
{
__shared__ double s_A[TILE_SIZE][TILE_SIZE];
__shared__ double s_B[TILE_SIZE];
int bx = blockIdx.x;
int tx = threadIdx.x;
int i = bx * blockDim.x + tx;
if (i < n) {
s_B[tx] = B[i];
for (int j = 0; j < n; j += TILE_SIZE) {
s_A[tx][j + threadIdx.y] = A[(i * n) + j + threadIdx.y];
}
}
__syncthreads();
if (i < n) {
double tmp = 0.0;
for (int j = 0; j < n; j += TILE_SIZE) {
tmp += s_A[threadIdx.x][j + threadIdx.y] * s_B[j + threadIdx.y];
}
atomicAdd(&C[i], tmp);
}
}
int main()
{
int n = 5000;
double* A, * B, * C;
double* d_A, * d_B, * d_C;
A = (double*)malloc(n * n * sizeof(double));
B = (double*)malloc(n * sizeof(double));
C = (double*)malloc(n * sizeof(double));
for (int i = 0; i < n * n; i++) {
A[i] = rand() / (double)RAND_MAX;
}
for (int i = 0; i < n; i++) {
B[i] = rand() / (double)RAND_MAX;
C[i] = 0.0;
}
cudaMalloc((void**)&d_A, n * n * sizeof(double));
cudaMalloc((void**)&d_B, n * sizeof(double));
cudaMalloc((void**)&d_C, n * sizeof(double));
cudaMemcpy(d_A, A, n * n * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(d_B, B, n * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(d_C, C, n * sizeof(double), cudaMemcpyHostToDevice);
dim3 dimBlock(TILE_SIZE, BLOCK_SIZE / TILE_SIZE, 1);
dim3 dimGrid((n + BLOCK_SIZE - 1) / BLOCK_SIZE, 1, 1);
matvec << <dimGrid, dimBlock >> > (d_A, d_B, d_C, n);
cudaMemcpy(C, d_C, n * sizeof(double), cudaMemcpyDeviceToHost);
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
free(A);
free(B);
free(C);
return 0;
}
atomicAdd
double
在计算能力 5.0 的设备上不可用。
如果您有 6.x (Pascal) 或更高版本的设备,您需要告诉编译器针对特定架构进行编译,因为默认目标架构可能仍低于 6.0,因此编译器不会看到定义。最简单的方法是添加例如
-arch=sm_60
至 nvcc
命令。
如果您在旧设备上需要此功能,编程指南展示了如何使用atomicCAS自行实现。
#if __CUDA_ARCH__ < 600
__device__ double atomicAdd(double* address, double val)
{
unsigned long long int* address_as_ull =
(unsigned long long int*)address;
unsigned long long int old = *address_as_ull, assumed;
do {
assumed = old;
old = atomicCAS(address_as_ull, assumed,
__double_as_longlong(val +
__longlong_as_double(assumed)));
// Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
} while (assumed != old);
return __longlong_as_double(old);
}
#endif