在CPU或GPU上执行每个块的部分结果的最终减少的CUDA有什么区别?

问题描述 投票:-1回答:1

刚开始在CUDA中学习时,我发现在线代码指定最终结果在CPU而不是GPU上运行。但是,经过几次浏览后,我不知道要这样做的代码行。在GPU中运行的代码是什么样的?任何人都可以指出或详细说明吗?

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <numeric>
#include <stdlib.h>
#include <stdio.h>
#include <cuda.h>

#define BLOCK_SIZE 32
#define N 512

__global__ void reduceKernel(float* d_out, float* d_in);

int main(void) {
    // Bytes for the N-element input, and for the one-partial-sum-per-block
    // output (N / BLOCK_SIZE floats). Same value as size / BLOCK_SIZE, but
    // written so the unit (floats, not bytes) is explicit.
    size_t size = N * sizeof(float);
    size_t size_o = (N / BLOCK_SIZE) * sizeof(float);

    float h_in[N];                 // host input array
    float h_out[N / BLOCK_SIZE];   // host copy of per-block partial sums

    float* d_in = NULL;
    float* d_out = NULL;

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaError_t err;

    // Fill the input with 1.0f so the expected final reduction is exactly N.
    for (int i = 0; i < N; i++) {
        h_in[i] = 1.0f;
    }

    // Every CUDA API call returns a status; report each one instead of
    // silently ignoring failures (an early sticky error makes every later
    // call fail mysteriously).
    err = cudaMalloc((void**)&d_in, size);
    printf("Alloc d_in: %s\n", cudaGetErrorString(err));
    err = cudaMemcpy(d_in, h_in, size, cudaMemcpyHostToDevice);
    printf("Copy h_in to device: %s\n", cudaGetErrorString(err));

    err = cudaMalloc((void**)&d_out, size_o);
    printf("Alloc d_out: %s\n", cudaGetErrorString(err));

    // One block per BLOCK_SIZE-element chunk; requires N % BLOCK_SIZE == 0.
    int grid_size = N / BLOCK_SIZE;
    printf("Grid Size is: %d\n", grid_size);
    printf("Block Size is: %d\n", BLOCK_SIZE);

    dim3 threadsPerBlock(BLOCK_SIZE);
    dim3 blocks(grid_size);

    // Time only the kernel: record stop immediately after the launch, then
    // block on the stop event. (cudaEventElapsedTime requires both events
    // to have completed; the original code never synchronized on stop.)
    cudaEventRecord(start);
    reduceKernel << <blocks, threadsPerBlock >> > (d_out, d_in);
    err = cudaGetLastError();         // catches bad launch configurations
    printf("Launch Kernel: %s\n", cudaGetErrorString(err));
    cudaEventRecord(stop);
    // Wait for GPU to finish before accessing results on host
    err = cudaEventSynchronize(stop); // also surfaces async execution errors
    printf("Run Kernel: %s\n", cudaGetErrorString(err));

    printf("Input Array: \n");
    for (int n = 0; n < N; n++) {
        printf("%f ", h_in[n]);
    }
    printf("\n");

    err = cudaMemcpy(h_out, d_out, size_o, cudaMemcpyDeviceToHost);
    printf("Copy h_out off device: %s\n", cudaGetErrorString(err));

    float milliseconds = 0;   // CUDA event timing result
    cudaEventElapsedTime(&milliseconds, start, stop);
    printf("Elapsed time was: %f milliseconds \n", milliseconds);

    // Per-block partial sums produced on the GPU.
    for (int n = 0; n < grid_size; n++) {
        printf("%f ", h_out[n]);
    }
    printf("\n");

    // The FINAL reduction happens here, on the CPU: the grid_size partial
    // sums copied back from the device are summed by the host in this loop.
    // (A GPU version would instead launch a second reduction kernel over
    // d_out until one value remains.)
    float final_reduction = 0.0f;
    for (int i = 0; i < grid_size; i++) {
        final_reduction += h_out[i];
    }
    printf("And the final reduction is: %f\n", final_reduction);

    // Release device memory and timing events.
    cudaFree(d_in);
    cudaFree(d_out);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return 0;
}

__global__ void reduceKernel(float* d_out, float* d_in) {
    // Sums one BLOCK_SIZE-wide chunk of d_in per block and writes the
    // partial result to d_out[blockIdx.x].
    // Preconditions: the grid exactly covers the input (no bounds check on
    // d_in), and blockDim.x is a power of two (required by the halving loop).
    unsigned int lane = threadIdx.x;                           // index within this block
    unsigned int globalIdx = blockIdx.x * blockDim.x + lane;   // index into d_in

    __shared__ float sdata[BLOCK_SIZE];

    // Stage this block's chunk in shared memory before reducing.
    sdata[lane] = d_in[globalIdx];
    __syncthreads();

    // Tree reduction: each step folds the upper half onto the lower half,
    // halving the number of active threads until one value remains.
    for (unsigned int stride = blockDim.x >> 1; stride > 0; stride >>= 1) {
        if (lane < stride) {
            sdata[lane] += sdata[lane + stride];
        }
        __syncthreads();   // all adds of this step must land before the next
    }

    // Thread 0 publishes the block's partial sum to global memory.
    if (lane == 0) {
        d_out[blockIdx.x] = sdata[0];
    }
}
cuda reduction
1个回答
0
投票

考虑阅读这段代码:`for (int i = 0; i < grid_size; i++) { final_reduction += h_out[i]; }` —— 这就是"最终减少(final reduction)"所在的位置:它是一个普通的主机端循环,把从设备拷回的各块部分和(`h_out`)在 CPU 上累加。如果想在 GPU 上完成最终归约,可以不拷回部分和,而是对 `d_out` 再启动一次归约内核,反复进行直到只剩一个值,最后只把这一个 float 拷回主机。

© www.soinside.com 2019 - 2024. All rights reserved.