为什么 cudaHostAlloc 比 cudaMalloc 慢得多?

问题描述 投票:0回答:0

我用 200 万个浮点数(下面的片段)做了一个简单的测试。

`cudaMalloc` 比使用 `cudaHostAllocMapped` 标志(映射内存)的 `cudaHostAlloc` 快大约 3 倍。当元素数量超过 200 万时,差距会进一步拉大,`cudaMalloc` 会快 10-20 倍。为什么会这样?

    // Times two mapped, page-locked host allocations (cudaHostAllocMapped) and
    // the lookup of their device-side pointers, bracketed by a pair of CUDA
    // events, then prints the elapsed time in milliseconds.
    //
    // Every runtime call is checked: a failure is reported on stderr together
    // with the failing expression, and execution continues so the surrounding
    // code keeps its original control flow.
    //
    // NOTE(review): CUDA events are recorded on the (default) stream, while the
    // allocation calls block the host thread; the figure printed here is only a
    // proxy for host-side cost. Consider cross-checking with a host timer.
    #define ALLOC_TIMING_CHECK(call)                                           \
        do {                                                                   \
            cudaError_t status_ = (call);                                      \
            if (status_ != cudaSuccess) {                                      \
                fprintf(stderr, "CUDA call failed (line %d): %s -> %s\n",      \
                        __LINE__, #call, cudaGetErrorString(status_));         \
            }                                                                  \
        } while (0)

    cudaEvent_t start, stop;
    ALLOC_TIMING_CHECK(cudaEventCreate(&start));
    ALLOC_TIMING_CHECK(cudaEventCreate(&stop));

    ALLOC_TIMING_CHECK(cudaEventRecord(start));

    // Input buffer: host allocation mapped into the device address space.
    float* data_h = NULL;
    ALLOC_TIMING_CHECK(cudaHostAlloc(&data_h, sizeof(float) * COUNT, cudaHostAllocMapped));

    // Output buffer: allocated the same way as the input buffer.
    float* data_h_out = NULL;
    ALLOC_TIMING_CHECK(cudaHostAlloc(&data_h_out, sizeof(float) * COUNT, cudaHostAllocMapped));

    // Device-side pointers for the two mapped host buffers (flags must be 0).
    float* data_d = NULL;
    ALLOC_TIMING_CHECK(cudaHostGetDevicePointer(&data_d, data_h, 0));

    float* data_d_out = NULL;
    ALLOC_TIMING_CHECK(cudaHostGetDevicePointer(&data_d_out, data_h_out, 0));

    ALLOC_TIMING_CHECK(cudaEventRecord(stop));
    // Block until the stop event has completed so the elapsed time is valid.
    ALLOC_TIMING_CHECK(cudaEventSynchronize(stop));

    float milliseconds = 0;
    ALLOC_TIMING_CHECK(cudaEventElapsedTime(&milliseconds, start, stop));
    printf("milliseconds allocate = %f\n", milliseconds);

    #undef ALLOC_TIMING_CHECK

    // Times two device allocations made with cudaMalloc, bracketed by a pair of
    // CUDA events, then prints the elapsed time in milliseconds.
    //
    // Every runtime call is checked: a failure is reported on stderr together
    // with the failing expression, and execution continues so the surrounding
    // code keeps its original control flow.
    //
    // NOTE(review): CUDA events are recorded on the (default) stream, while
    // cudaMalloc blocks the host thread; the figure printed here is only a
    // proxy for host-side cost. Consider cross-checking with a host timer.
    #define ALLOC_TIMING_CHECK(call)                                           \
        do {                                                                   \
            cudaError_t status_ = (call);                                      \
            if (status_ != cudaSuccess) {                                      \
                fprintf(stderr, "CUDA call failed (line %d): %s -> %s\n",      \
                        __LINE__, #call, cudaGetErrorString(status_));         \
            }                                                                  \
        } while (0)

    cudaEvent_t start, stop;
    ALLOC_TIMING_CHECK(cudaEventCreate(&start));
    ALLOC_TIMING_CHECK(cudaEventCreate(&stop));

    ALLOC_TIMING_CHECK(cudaEventRecord(start));

    // Input buffer in device memory.
    float* data_d = NULL;
    ALLOC_TIMING_CHECK(cudaMalloc(&data_d, sizeof(float) * COUNT));

    // Output buffer in device memory, same size as the input buffer.
    float* data_d_out = NULL;
    ALLOC_TIMING_CHECK(cudaMalloc(&data_d_out, sizeof(float) * COUNT));

    ALLOC_TIMING_CHECK(cudaEventRecord(stop));
    // Block until the stop event has completed so the elapsed time is valid.
    ALLOC_TIMING_CHECK(cudaEventSynchronize(stop));

    float milliseconds = 0;
    ALLOC_TIMING_CHECK(cudaEventElapsedTime(&milliseconds, start, stop));

    printf("milliseconds allocate = %f\n", milliseconds);

    #undef ALLOC_TIMING_CHECK
c++ cuda allocation
© www.soinside.com 2019 - 2024. All rights reserved.