atomicAdd() 显示错误:没有重载函数“atomicAdd”的实例对参数列表进行数学计算

问题描述 投票:0回答:1

因此,我必须使用 CUDA 的共享内存来分析此矩阵向量乘法,但是,我在 Visual Studio 2022 上执行代码时遇到问题,我收到错误“没有重载函数“atomicAdd”的实例对参数进行数学运算”清单”。

关于它的含义以及如何解决它有什么想法吗?我将非常感激。

#ifndef __CUDACC__  
#define __CUDACC__
#endif
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <cuda.h>
#include "cuda_runtime.h"
#include <iostream>

#define TILE_SIZE 16
#define BLOCK_SIZE 256

__global__ void matvec(double* A, double* B, double* C, int n)
{
    __shared__ double s_A[TILE_SIZE][TILE_SIZE];
    __shared__ double s_B[TILE_SIZE];

    int bx = blockIdx.x;
    int tx = threadIdx.x;
    int i = bx * blockDim.x + tx;

    if (i < n) {
        s_B[tx] = B[i];
        for (int j = 0; j < n; j += TILE_SIZE) {
            s_A[tx][j + threadIdx.y] = A[(i * n) + j + threadIdx.y];
        }
    }
    __syncthreads();

    if (i < n) {
        double tmp = 0.0;
        for (int j = 0; j < n; j += TILE_SIZE) {
            tmp += s_A[threadIdx.x][j + threadIdx.y] * s_B[j + threadIdx.y];
        }
        atomicAdd(&C[i], tmp);
    }
}

int main()
{
    int n = 5000;
    double* A, * B, * C;
    double* d_A, * d_B, * d_C;

    A = (double*)malloc(n * n * sizeof(double));
    B = (double*)malloc(n * sizeof(double));
    C = (double*)malloc(n * sizeof(double));

    for (int i = 0; i < n * n; i++) {
        A[i] = rand() / (double)RAND_MAX;
    }
    for (int i = 0; i < n; i++) {
        B[i] = rand() / (double)RAND_MAX;
        C[i] = 0.0;
    }

    cudaMalloc((void**)&d_A, n * n * sizeof(double));
    cudaMalloc((void**)&d_B, n * sizeof(double));
    cudaMalloc((void**)&d_C, n * sizeof(double));

    cudaMemcpy(d_A, A, n * n * sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, B, n * sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(d_C, C, n * sizeof(double), cudaMemcpyHostToDevice);

    dim3 dimBlock(TILE_SIZE, BLOCK_SIZE / TILE_SIZE, 1);
    dim3 dimGrid((n + BLOCK_SIZE - 1) / BLOCK_SIZE, 1, 1);

    matvec << <dimGrid, dimBlock >> > (d_A, d_B, d_C, n);

    cudaMemcpy(C, d_C, n * sizeof(double), cudaMemcpyDeviceToHost);

    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    free(A);
    free(B);
    free(C);

    return 0;
}

c++ matrix vector cuda matrix-multiplication
1个回答
1
投票

atomicAdd
double
在计算能力 5.0 的设备上不可用。

如果您有 6.x (Pascal) 或更高版本的设备,您需要告诉编译器针对特定架构进行编译,因为默认目标架构可能仍低于 6.0,因此编译器不会看到定义。最简单的方法是添加例如

-arch=sm_60
nvcc
命令。

如果您在旧设备上需要此功能,编程指南展示了如何使用atomicCAS自行实现。

#if __CUDA_ARCH__ < 600
__device__ double atomicAdd(double* address, double val)
{
    unsigned long long int* address_as_ull =
                              (unsigned long long int*)address;
    unsigned long long int old = *address_as_ull, assumed;

    do {
        assumed = old;
        old = atomicCAS(address_as_ull, assumed,
                        __double_as_longlong(val +
                               __longlong_as_double(assumed)));

    // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
    } while (assumed != old);

    return __longlong_as_double(old);
}
#endif
© www.soinside.com 2019 - 2024. All rights reserved.