我用 200 万个浮点数做了一个简单的测试(代码片段见下)。结果显示,`cudaMalloc` 比使用 `cudaHostAllocMapped` 标志的映射式 `cudaHostAlloc` 快大约 3 倍。当元素数量超过 200 万时,差距会进一步扩大,`cudaMalloc` 要快 10–20 倍。为什么会这样?
// Times the allocation of two mapped, page-locked host buffers of COUNT floats
// (cudaHostAllocMapped) plus the lookup of their device-side pointers, and
// prints the elapsed time reported by a pair of CUDA events.
//
// Every runtime call is checked. A failure is reported on stderr; control flow
// is intentionally left unchanged so the surrounding code is not affected.
#ifndef ALLOC_TIMING_CHECK
#define ALLOC_TIMING_CHECK(call)                                              \
    do {                                                                      \
        cudaError_t alloc_timing_err_ = (call);                               \
        if (alloc_timing_err_ != cudaSuccess) {                               \
            fprintf(stderr, "CUDA error %s:%d: %s failed: %s\n",              \
                    __FILE__, __LINE__, #call,                                \
                    cudaGetErrorString(alloc_timing_err_));                   \
        }                                                                     \
    } while (0)
#endif

cudaEvent_t start, stop;
ALLOC_TIMING_CHECK(cudaEventCreate(&start));
ALLOC_TIMING_CHECK(cudaEventCreate(&stop));

// The events bracket host-side API calls only; no kernel is launched here.
// NOTE(review): a host wall-clock timer may measure these blocking calls more
// directly than events do -- confirm which number is actually wanted.
ALLOC_TIMING_CHECK(cudaEventRecord(start));

// Pointers start out null so a failed call leaves a recognisable value
// instead of an indeterminate one.
float* data_h = 0;
ALLOC_TIMING_CHECK(cudaHostAlloc((void**)&data_h, sizeof(float) * COUNT, cudaHostAllocMapped));
float* data_h_out = 0;
ALLOC_TIMING_CHECK(cudaHostAlloc((void**)&data_h_out, sizeof(float) * COUNT, cudaHostAllocMapped));

// Device-visible pointers for the two mapped host buffers (flags must be 0).
float* data_d = 0;
ALLOC_TIMING_CHECK(cudaHostGetDevicePointer((void**)&data_d, data_h, 0));
float* data_d_out = 0;
ALLOC_TIMING_CHECK(cudaHostGetDevicePointer((void**)&data_d_out, data_h_out, 0));

ALLOC_TIMING_CHECK(cudaEventRecord(stop));
// Wait for the stop event to complete before reading the elapsed time.
ALLOC_TIMING_CHECK(cudaEventSynchronize(stop));

float milliseconds = 0;
ALLOC_TIMING_CHECK(cudaEventElapsedTime(&milliseconds, start, stop));
printf("milliseconds allocate = %f\n", milliseconds);
对比(使用 `cudaMalloc` 的版本):
// Times the allocation of two device buffers of COUNT floats with cudaMalloc
// and prints the elapsed time reported by a pair of CUDA events.
//
// Every runtime call is checked. A failure is reported on stderr; control flow
// is intentionally left unchanged so the surrounding code is not affected.
#ifndef ALLOC_TIMING_CHECK
#define ALLOC_TIMING_CHECK(call)                                              \
    do {                                                                      \
        cudaError_t alloc_timing_err_ = (call);                               \
        if (alloc_timing_err_ != cudaSuccess) {                               \
            fprintf(stderr, "CUDA error %s:%d: %s failed: %s\n",              \
                    __FILE__, __LINE__, #call,                                \
                    cudaGetErrorString(alloc_timing_err_));                   \
        }                                                                     \
    } while (0)
#endif

cudaEvent_t start, stop;
ALLOC_TIMING_CHECK(cudaEventCreate(&start));
ALLOC_TIMING_CHECK(cudaEventCreate(&stop));

// The events bracket host-side API calls only; no kernel is launched here.
// NOTE(review): a host wall-clock timer may measure these blocking calls more
// directly than events do -- confirm which number is actually wanted.
ALLOC_TIMING_CHECK(cudaEventRecord(start));

// Pointers start out null so a failed allocation leaves a recognisable value
// instead of an indeterminate one.
float* data_d = 0;
ALLOC_TIMING_CHECK(cudaMalloc((void**)&data_d, sizeof(float) * COUNT));
float* data_d_out = 0;
ALLOC_TIMING_CHECK(cudaMalloc((void**)&data_d_out, sizeof(float) * COUNT));

ALLOC_TIMING_CHECK(cudaEventRecord(stop));
// Wait for the stop event to complete before reading the elapsed time.
ALLOC_TIMING_CHECK(cudaEventSynchronize(stop));

float milliseconds = 0;
ALLOC_TIMING_CHECK(cudaEventElapsedTime(&milliseconds, start, stop));
printf("milliseconds allocate = %f\n", milliseconds);