cudaMallocManaged 是否在设备上分配内存?

问题描述 投票:0回答:2

我使用统一内存来简化对 CPU 和 GPU 上数据的访问。据我所知,cudaMallocManaged 应该在设备上分配内存。我写了一个简单的代码来检查:

#define TYPE float
#define BDIMX 16
#define BDIMY 16
#include <cuda.h>
#include <cstdio>
#include <iostream>
// Stage a dimx x dimy input through a shared-memory tile and write each
// element scaled by 1.3f to g_output.
// Expects a 2D launch with blockDim == (BDIMX, BDIMY). Threads outside the
// data (grid tail when sizes are not tile multiples) are guarded out; the
// original had no guard and would read/write out of bounds in that case.
__global__ void kernel(TYPE *g_output, TYPE *g_input, const int dimx, const int dimy)
{
  __shared__ float s_data[BDIMY][BDIMX];
  int ix = blockIdx.x * blockDim.x + threadIdx.x;  // global x (column)
  int iy = blockIdx.y * blockDim.y + threadIdx.y;  // global y (row)
  int tx = threadIdx.x;  // x-index into the shared-memory tile
  int ty = threadIdx.y;  // y-index into the shared-memory tile
  bool in_bounds = (ix < dimx) && (iy < dimy);

  if (in_bounds) {
    s_data[ty][tx] = g_input[iy * dimx + ix];
  }
  // The barrier must be reached by every thread of the block, so it sits
  // outside the divergent bounds check.
  __syncthreads();
  if (in_bounds) {
    // 1.3f, not 1.3: a double literal would silently promote the whole
    // expression to double arithmetic.
    g_output[iy * dimx + ix] = s_data[ty][tx] * 1.3f;
  }
}


// Launch the tiled-copy kernel on two managed buffers.
// NOTE: the input is deliberately NOT touched from the host — the whole
// point of the experiment is to avoid CPU page faults so the managed pages
// should reside in device memory. The kernel therefore reads uninitialized
// (but validly allocated) data; only the memory traffic matters here.
int main(){
  const int size_x = 16, size_y = 16;
  const size_t bytes = (size_t)size_x * size_y * sizeof(TYPE);

  // Integer ceil-division — no need for the double round-trip.
  dim3 numTB((size_x + BDIMX - 1) / BDIMX, (size_y + BDIMY - 1) / BDIMY);
  dim3 tbSize(BDIMX, BDIMY);

  float *a = NULL, *a_out = NULL;
  if (cudaMallocManaged((void**)&a, bytes) != cudaSuccess ||
      cudaMallocManaged((void**)&a_out, bytes) != cudaSuccess) {
    std::cerr << "cudaMallocManaged failed" << std::endl;
    return 1;
  }

  kernel <<<numTB, tbSize>>>(a_out, a, size_x, size_y);
  cudaError_t err = cudaGetLastError();            // launch-config errors
  if (err == cudaSuccess) err = cudaDeviceSynchronize();  // execution errors
  if (err != cudaSuccess) {
    std::cerr << "CUDA error: " << cudaGetErrorString(err) << std::endl;
  }

  // Original code leaked both managed allocations.
  cudaFree(a);
  cudaFree(a_out);
  return err == cudaSuccess ? 0 : 1;
}

所以我甚至没有访问 CPU 上的数据以避免任何页面错误,因此内存应该位于设备内存上。但是,当我在此代码上运行 nvprof 时,我得到以下结果:

  invocations                               Metric Name                        Metric Description         Min         Max         Avg
Device "Tesla K40c (0)"
Kernel: kernel(float*, float*, int, int)
        1                   local_load_transactions                   Local Load Transactions           0           0           0
        1                  local_store_transactions                  Local Store Transactions           0           0           0
        1                  shared_load_transactions                  Shared Load Transactions           8           8           8
        1                 shared_store_transactions                 Shared Store Transactions           8           8           8
        1                          gld_transactions                  Global Load Transactions           8           8           8
        1                          gst_transactions                 Global Store Transactions           8           8           8
        1                  sysmem_read_transactions           System Memory Read Transactions          32          32          32
        1                 sysmem_write_transactions          System Memory Write Transactions          34          34          34
        1                    tex_cache_transactions                Texture Cache Transactions           0           0           0
        1                    dram_read_transactions           Device Memory Read Transactions           0           0           0
        1                   dram_write_transactions          Device Memory Write Transactions           0           0           0

显然数组是在系统内存而不是设备内存上分配的。我在这里缺少什么?

cuda gpu
2个回答
3
投票

托管内存确实在 GPU 上分配物理内存。您可以通过对代码执行以下操作来确认是否属于这种情况:

#include <iostream>

// Print the device's current free and total memory, in bytes.
void report_gpu_mem()
{
    size_t bytes_free = 0;
    size_t bytes_total = 0;
    cudaMemGetInfo(&bytes_free, &bytes_total);
    std::cout << "Free = " << bytes_free
              << " Total = " << bytes_total << std::endl;
}

// Probe whether cudaMallocManaged backs allocations with device memory:
// report free GPU memory around two 16 MiB managed allocations and their
// frees. No host or device access is performed, so any change in the free
// figure comes from the allocator itself.
int main()
{
    float *a = NULL, *a_out = NULL;
    const size_t sz = (size_t)1 << 24; // 16 MiB

    report_gpu_mem();
    // Original code ignored every return code; a failed allocation would
    // have made the later frees report meaningless numbers.
    if (cudaMallocManaged((void**)&a, sz) != cudaSuccess) return 1;
    report_gpu_mem();
    if (cudaMallocManaged((void**)&a_out, sz) != cudaSuccess) return 1;
    report_gpu_mem();
    if (cudaFree(a) != cudaSuccess) return 1;
    report_gpu_mem();
    if (cudaFree(a_out) != cudaSuccess) return 1;
    report_gpu_mem();
    return cudaDeviceReset();
}

现在为两个托管分配中的每一个分配 16Mb,然后释放它们。不会发生主机或设备访问,因此不应触发传输或同步。该大小足够大,应该超过 GPU 内存管理器的最小粒度并触发可见可用内存的变化。编译并运行它会执行以下操作:

$ nvcc -arch=sm_52 sleepy.cu 
$ CUDA_VISIBLE_DEVICES="0" ./a.out 
Free = 4211929088 Total = 4294770688
Free = 4194869248 Total = 4294770688
Free = 4178092032 Total = 4294770688
Free = 4194869248 Total = 4294770688
Free = 4211654656 Total = 4294770688

GPU 上的物理可用内存在每次分配/释放时明显增加和减少 16Mb。


0
投票

新的 cuda 版本中发生了变化。 在我的 RTX 3070 和 cuda 12.3 中,代码结果为

Free = 8173191168 Total = 8361017344
Free = 8173191168 Total = 8361017344
Free = 8173191168 Total = 8361017344
Free = 8173191168 Total = 8361017344
Free = 8173191168 Total = 8361017344

如果我通过一次简单的赋值来使用已分配的内存,

#include <iostream>

// Print the device's current free and total memory, in bytes.
void report_gpu_mem()
{
    size_t avail = 0;
    size_t capacity = 0;
    cudaMemGetInfo(&avail, &capacity);
    std::cout << "Free = " << avail << " Total = " << capacity << std::endl;
}

// Touch the first element of `a` from the device so the managed
// allocation is actually instantiated/resident on the GPU.
__global__ void usememory(float* a) {
    a[0] = 1.0f;
}


// Same memory probe as the accepted answer, but the first managed buffer is
// touched from a kernel so its pages are actually populated on the device
// before measuring. On newer CUDA versions managed memory is materialized
// lazily, so only the touched buffer should show up in the free figure.
int main()
{
    float *a = NULL, *a_out = NULL;
    const size_t sz = (size_t)1 << 24; // 16 MiB

    report_gpu_mem();
    if (cudaMallocManaged((void**)&a, sz) != cudaSuccess) return 1;

    usememory<<<1, 1>>>(a);
    // The launch is asynchronous: without synchronizing here, the next
    // cudaMemGetInfo can run before the kernel has faulted the pages onto
    // the device, which is exactly the quantity being measured.
    cudaError_t err = cudaGetLastError();            // launch-config errors
    if (err == cudaSuccess) err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        std::cout << "kernel failed: " << cudaGetErrorString(err) << std::endl;
        return 1;
    }
    report_gpu_mem();

    if (cudaMallocManaged((void**)&a_out, sz) != cudaSuccess) return 1;
    report_gpu_mem();
    if (cudaFree(a) != cudaSuccess) return 1;
    report_gpu_mem();
    if (cudaFree(a_out) != cudaSuccess) return 1;
    report_gpu_mem();
    return cudaDeviceReset();
}

我得到了

Free = 8173191168 Total = 8361017344
Free = 8173191168 Total = 8361017344
Free = 8173191168 Total = 8361017344
Free = 8041070592 Total = 8361017344
Free = 8041070592 Total = 8361017344

好像有lazy copy之类的东西?而且我不知道为什么当数组被释放时gpu内存会变少。

© www.soinside.com 2019 - 2024. All rights reserved.