CUDA 内核在 cudaMallocManaged 内存上运行时速度慢 10 倍,即使预取也是如此

问题描述(投票:0 · 回答:2)
#include <cuda_runtime.h>

#include <chrono>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <random>
#include <string>
using namespace std;

// Simple wall-clock stopwatch for benchmarking.
// Uses std::chrono::steady_clock (monotonic) instead of system_clock:
// system_clock may jump backwards/forwards under NTP adjustment, which
// can corrupt (even negate) measured intervals.
class MyTimer {
    std::chrono::time_point<std::chrono::steady_clock> start;

public:
    // Begin (or restart) timing from "now".
    void startCounter() {
        start = std::chrono::steady_clock::now();
    }

    // Elapsed time since startCounter(), in whole nanoseconds.
    int64_t getCounterNs() {
        return std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::steady_clock::now() - start).count();
    }

    // Elapsed time since startCounter(), in whole milliseconds.
    int64_t getCounterMs() {
        return std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now() - start).count();
    }

    // Elapsed time in milliseconds, with sub-millisecond (ns) precision.
    double getCounterMsPrecise() {
        return std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::steady_clock::now() - start).count()
                / 1000000.0;
    }
};

// Trivial kernel; launched first in main(), presumably as a warm-up so
// one-time CUDA context creation does not pollute the timed sections —
// TODO confirm intent with the author.
__global__
void HelloWorld()
{
  printf("Hello world\n");
}

// Volatile sink for timer readings: stores to it cannot be optimized
// away, so each timed region is actually measured even when the value
// is not printed.
volatile double dummy = 0;

// Element-wise product: output[i] = x[i] * y[i] for every i in [0, N).
// Uses a grid-stride loop, so any <<<grid, block>>> configuration
// (even a single thread) covers all N elements.
__global__
void multiply(int N, float* __restrict__ output, const float* __restrict__ x, const float* __restrict__ y)
{
  const int firstIdx = blockIdx.x * blockDim.x + threadIdx.x;
  const int gridSize = gridDim.x * blockDim.x;

  for (int i = firstIdx; i < N; i += gridSize)
    output[i] = x[i] * y[i];
}


// Abort with a readable message if a CUDA runtime call failed.
static void cudaCheck(cudaError_t err, const char* what)
{
  if (err != cudaSuccess) {
    fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(err));
    exit(1);
  }
}

// Benchmark: separately time cudaMalloc, H2D copy, kernel, and D2H copy
// for N = 4M floats, then verify the GPU product against a CPU reference.
int main()
{
  MyTimer timer;
  srand(time(NULL));

  // Warm-up launch: forces CUDA context creation so the timed sections
  // below do not absorb that large one-time cost.
  HelloWorld<<<1,1>>>();
  cudaCheck(cudaGetLastError(), "HelloWorld launch");

  timer.startCounter();
  int N = 2000 * 2000;
  float* h_a = new float[N];
  float* h_b = new float[N];
  float* h_c = new float[N];
  float* h_res = new float[N];
  for (int i = 0; i < N; i++) {
    h_a[i] = float(rand() % 1000000) / (rand() % 1000 + 1);
    h_b[i] = float(rand() % 1000000) / (rand() % 1000 + 1);
    h_c[i] = h_a[i] * h_b[i];  // CPU reference for verification
  }
  dummy = timer.getCounterMsPrecise();

  timer.startCounter();
  float *d_a, *d_b, *d_c;
  cudaCheck(cudaMalloc(&d_a, N * sizeof(float)), "cudaMalloc d_a");
  cudaCheck(cudaMalloc(&d_b, N * sizeof(float)), "cudaMalloc d_b");
  cudaCheck(cudaMalloc(&d_c, N * sizeof(float)), "cudaMalloc d_c");
  dummy = timer.getCounterMsPrecise();
  cout << "cudaMalloc cost = " << dummy << "\n";

  timer.startCounter();
  cudaCheck(cudaMemcpy(d_a, h_a, N * sizeof(float), cudaMemcpyHostToDevice), "H2D d_a");
  cudaCheck(cudaMemcpy(d_b, h_b, N * sizeof(float), cudaMemcpyHostToDevice), "H2D d_b");
  cudaCheck(cudaDeviceSynchronize(), "sync after H2D");
  dummy = timer.getCounterMsPrecise();
  cout << "H2D copy cost = " << dummy << "\n";

  timer.startCounter();
  constexpr int GRID_DIM = 256;
  constexpr int BLOCK_DIM = 256;
  multiply<<<GRID_DIM, BLOCK_DIM>>>(N, d_c, d_a, d_b);
  cudaCheck(cudaGetLastError(), "multiply launch");
  cudaCheck(cudaDeviceSynchronize(), "sync after multiply");
  dummy = timer.getCounterMsPrecise();
  cout << "kernel cost = " << dummy << "\n";

  timer.startCounter();
  cudaCheck(cudaMemcpy(h_res, d_c, N * sizeof(float), cudaMemcpyDeviceToHost), "D2H d_c");
  cudaCheck(cudaDeviceSynchronize(), "sync after D2H");
  dummy = timer.getCounterMsPrecise();
  // Print the captured value: the original called getCounterMsPrecise()
  // a second time here, reporting a slightly later timestamp than the
  // other sections do.
  cout << "D2H copy cost = " << dummy << "\n";

  // A single float multiply rounds identically on host and device (no
  // FMA contraction is possible for one multiply), so exact equality is
  // a valid check here.
  for (int i = 0; i < N; i++) if (h_res[i] != h_c[i]) {
    cout << "error\n";
    exit(1);
  }

  // Release device and host buffers (the original leaked all seven).
  cudaFree(d_a);
  cudaFree(d_b);
  cudaFree(d_c);
  delete[] h_a;
  delete[] h_b;
  delete[] h_c;
  delete[] h_res;

  return 0;
}

如果我用普通的

cudaMalloc
,结果是

Hello world
cudaMalloc cost = 0.599463
H2D copy cost = 5.16785
kernel cost = 0.109068
D2H copy cost = 7.18768

但是如果我使用

cudaMallocManaged
,它就变成了

Hello world
cudaMalloc cost = 0.116722
H2D copy cost = 8.26673
kernel cost = 1.70356
D2H copy cost = 6.8841

为什么会有这么大的性能下降?代码已经手动将内存复制到设备端,所以它不应该与常规

cudaMalloc-ed
设备内存完全相同吗?

c++ optimization cuda malloc gpu
2个回答
2
投票

使用托管内存时,“预取”并不意味着使用

cudaMemcpy
。我不建议将
cudaMemcpy
与托管内存一起使用。你不会在任何培训材料中找到这样的建议,而且它不一定会按照你的想法去做。

要在按需分页托管内存(也称为 统一内存,或 UM)机制中预取数据,您实际上应该使用

cudaMemPrefetchAsync
。当我这样做时,我观察到这两种情况在性能上没有显着差异。为了进行明智的比较,我不得不对您的代码进行一些重构:

$ cat t2230.cu
#include <cuda_runtime.h>
#include <string>
#include <chrono>
#include <random>
#include <iostream>
using namespace std;

// Simple wall-clock stopwatch for benchmarking.
// Uses std::chrono::steady_clock (monotonic) instead of system_clock:
// system_clock may jump backwards/forwards under NTP adjustment, which
// can corrupt (even negate) measured intervals.
class MyTimer {
    std::chrono::time_point<std::chrono::steady_clock> start;

public:
    // Begin (or restart) timing from "now".
    void startCounter() {
        start = std::chrono::steady_clock::now();
    }

    // Elapsed time since startCounter(), in whole nanoseconds.
    int64_t getCounterNs() {
        return std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::steady_clock::now() - start).count();
    }

    // Elapsed time since startCounter(), in whole milliseconds.
    int64_t getCounterMs() {
        return std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now() - start).count();
    }

    // Elapsed time in milliseconds, with sub-millisecond (ns) precision.
    double getCounterMsPrecise() {
        return std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::steady_clock::now() - start).count()
                / 1000000.0;
    }
};

// Trivial kernel, launched once before timing begins; serves to trigger
// CUDA context creation so that cost stays out of the measured regions.
__global__
void HelloWorld()
{
  printf("Hello world\n");
}

volatile double dummy = 0;

// Element-wise multiply via a grid-stride loop: each thread handles
// indices start, start+stride, start+2*stride, ..., so any launch
// configuration covers all N elements.
__global__
void multiply(int N, float* __restrict__ output, const float* __restrict__ x, const float* __restrict__ y)
{
  int start = blockIdx.x * blockDim.x + threadIdx.x;
  int stride = blockDim.x * gridDim.x;

  for (int i = start; i < N; i += stride) {
    output[i] = x[i] * y[i];
  }
}


// Benchmark driver, compiled two ways for an apples-to-apples comparison:
//   default        : cudaMalloc + explicit cudaMemcpy (H2D and D2H)
//   -DUSE_MANAGED  : cudaMallocManaged + cudaMemPrefetchAsync
int main()
{
  MyTimer timer;
  srand(time(NULL));
  // Warm-up launch: triggers CUDA context creation before timing starts.
  HelloWorld<<<1,1>>>();
  int N = 2000 * 2000;
  timer.startCounter();
  float *d_a, *d_b, *d_c;
#ifdef USE_MANAGED
  // Unified-memory path: allocate managed buffers, fill them on the
  // host, then prefetch the pages to the GPU so the kernel does not take
  // demand-paging faults on first touch.
  cudaMallocManaged(&d_a, N * sizeof(float));
  cudaMallocManaged(&d_b, N * sizeof(float));
  cudaMallocManaged(&d_c, N * sizeof(float));
  for (int i = 0; i < N; i++) {
    d_a[i] = float(rand() % 1000000) / (rand() % 1000 + 1);
    d_b[i] = float(rand() % 1000000) / (rand() % 1000 + 1);
    d_c[i] = 0.f;
  }
  // Third argument (0) is the destination device id, not a stream.
  cudaMemPrefetchAsync(d_a, N*sizeof(float), 0);
  cudaMemPrefetchAsync(d_b, N*sizeof(float), 0);
  cudaMemPrefetchAsync(d_c, N*sizeof(float), 0);
#else
  // Explicit-copy path: host staging buffers + cudaMalloc + cudaMemcpy.
  float* h_a = new float[N];
  float* h_b = new float[N];
  float* h_res = new float[N];
  for (int i = 0; i < N; i++) {
    h_a[i] = float(rand() % 1000000) / (rand() % 1000 + 1);
    h_b[i] = float(rand() % 1000000) / (rand() % 1000 + 1);
  }
  cudaMalloc(&d_a, N * sizeof(float));
  cudaMalloc(&d_b, N * sizeof(float));
  cudaMalloc(&d_c, N * sizeof(float));
  cudaMemcpy(d_a, h_a, N * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(d_b, h_b, N * sizeof(float), cudaMemcpyHostToDevice);
#endif
  // Both paths end their timed "alloc + data movement" region here.
  cudaDeviceSynchronize();
  dummy = timer.getCounterMsPrecise();
  cout << "alloc/H2D cost = " << dummy << "\n";
  constexpr int GRID_DIM = 80;
  constexpr int BLOCK_DIM = 1024;

  timer.startCounter();
  multiply<<<GRID_DIM, BLOCK_DIM>>>(N, d_c, d_a, d_b);
  cudaDeviceSynchronize();
  dummy = timer.getCounterMsPrecise();
  cout << "kernel cost = " << dummy << "\n";
  // Default to verifying through the managed pointers (host access
  // migrates the pages back under demand paging); the explicit build
  // redirects these to the host copies below.
  float *res = d_c;
  float *a = d_a;
  float *b = d_b;
#ifndef USE_MANAGED
  timer.startCounter();
  cudaMemcpy(h_res, d_c, N * sizeof(float), cudaMemcpyDeviceToHost);
  cudaDeviceSynchronize();
  dummy = timer.getCounterMsPrecise();
  cout << "D2H copy cost = " << timer.getCounterMsPrecise() << "\n";
  res = h_res;
  a = h_a;
  b = h_b;
#endif

  // Verify the GPU result against the recomputed host product.
  for (int i = 0; i < N; i++) if (res[i] != (a[i]*b[i])) {
    cout << "error\n";
    exit(1);
  }
  return 0;
}
$ nvcc -o t2230 t2230.cu
$ CUDA_VISIBLE_DEVICES="0" ./t2230
Hello world
alloc/H2D cost = 453.012
kernel cost = 0.109507
D2H copy cost = 8.04054
$ nvcc -o t2230 t2230.cu -DUSE_MANAGED
$ CUDA_VISIBLE_DEVICES="0" ./t2230
Hello world
alloc/H2D cost = 411.502
kernel cost = 0.101654
$

(V100,CUDA 11.4)

请注意,这假设您处于请求分页的 UM 制度中。如果您不在按需分页机制中(例如,目前在 Maxwell 或 Kepler 设备上,或在 Windows 上,或在 Jetson 上),那么您将不会使用

cudaMemPrefetchAsync
,并且数据迁移与内核启动密不可分。还要注意
CUDA_VISIBLE_DEVICES
的使用。在多 GPU 系统中,UM 可以具有多种不同的行为,具体取决于系统拓扑以及系统中的 GPU。这会使同类比较变得困难。

最后,我没有将数据预取回主机;如果你想比较那个活动,上面的说明已经给出了相应的指导。


-1
投票

使用托管内存时,cpu和gpu之间有一个底层的交换机制。尤其是第一次运行内核时。如果多次运行内核,执行时间将恢复正常。

© www.soinside.com 2019 - 2024. All rights reserved.