为什么cudaFree有这么高的开销?(是内存清零引起的吗?)

问题描述 投票:0回答:1

我观察到cudaFree操作的耗时长得令人难以接受。为了验证这一点,我编写了一个简单的微基准测试程序来测量cudaFree的开销,结果显示出同样的趋势。

第一步:cudaMallocManaged。它并不需要很长时间,实际上单次cudaMallocManaged调用的耗时小于1ms。

第二步:init。为了初始化已分配的内存区域,会调用init函数。它的耗时同样小于1毫秒。

第三步:cudaFree。开销就出现在这里,它需要很长时间。另外,内存越大,耗时越长(几乎呈线性关系)。

问题是:“为什么cudaFree会有这么大的开销?是出于安全考虑,它会用零填充整个内存区域吗?还是它经过了其他某些关键路径?”

这里是代码和测量结果。预先谢谢你:) !!!

10 int getMilliCount(){ 11 timeb tb; 12 ftime(&tb); 13 int nCount = tb.millitm + (tb.time & 0xfffff) * 1000; 14 return nCount; 15 } 16 17 int getTimeDiff(int baseTime){ 18 int diff = getMilliCount() - baseTime; 19 return diff; 20 } 21 22 __global__ void init(int* x, size_t bytes_){ 23 int num_ = bytes_/sizeof(int); 24 for (int i=0; i<num_; i++){ 25 x[i] = i; 26 } 27 } 28 29 int main(){ 30 printf("sizeof(size_t): %zu\n", sizeof(size_t)); 31 printf("sizeof(unsigned int): %zu\n", sizeof(unsigned int)); 32 printf("sizeof(int): %zu\n", sizeof(int)); 33 printf("sizeof(long): %zu\n", sizeof(long)); 34 35 std::ofstream myfile; 36 myfile.open("output3.csv"); 37 myfile<<"operation, num_bytes, start, end, duration\n"; 38 int baseTime = getMilliCount(); 39 int* dptr; 40 int ts1 = 0; 41 int ts2 = 0; 42 size_t KB = 1024; // start from 1KB 43 int num_trial_ = 1; 44 for (int j=10; j<25; j++){ 45 size_t num_bytes_ = KB<<j; 46 for (int i=0; i<num_trial_; i++){ 47 // measuring cudaMallocManaged 48 ts1 = getTimeDiff(baseTime); >> 49 cudaMallocManaged((void**)&dptr, num_bytes_); 50 ts2 = getTimeDiff(baseTime); 51 myfile<<"cudaMallocManaged, "<<num_bytes_/(1024*1024)<<","<<ts1<<","<<ts2<<","<<ts2-ts1<<"\n"; 52 //printf("cudaMallocManaged, memory_size:%zuMB, start:%d, end:%d, duration:%d\n", num_bytes_/(1024*1024), ts1, ts2, ts2-ts1); 53 printf("cudaMallocManaged, memory_size:%zuMB, duration:%d\n", num_bytes_/(1024*1024), ts2-ts1); 54 55 // measuring initialization 56 ts1 = getTimeDiff(baseTime); >> 57 init<<<1,1>>>(dptr, num_bytes_); 58 ts2 = getTimeDiff(baseTime); 59 myfile<<"initialization, "<<num_bytes_/(1024*1024)<<","<<ts1<<","<<ts2<<","<<ts2-ts1<<"\n"; 60 //printf("init, memory_size:%zuMB, start:%d, end:%d, duration:%d\n", num_bytes_/(1024*1024), ts1, ts2, ts2-ts1); 61 printf("init, memory_size:%zuMB, duration:%d\n", num_bytes_/(1024*1024), ts2-ts1); 62 63 // measuring cudaFree 64 ts1 = getTimeDiff(baseTime); >> 65 cudaFree(dptr); 66 ts2 = getTimeDiff(baseTime); 67 
myfile<<"cudaFree, "<<num_bytes_/(1024*1024)<<","<<ts1<<","<<ts2<<","<<ts2-ts1<<"\n"; 68 //printf("cudaFree, memory_size:%zuMB, start:%d, end:%d, duration:%d\n", num_bytes_/(1024*1024), ts1, ts2, ts2-ts1); 69 printf("cudaFree, memory_size:%zuMB, duration:%d\n", num_bytes_/(1024*1024), ts2-ts1); 70 sleep(1); 71 printf("\n"); 72 } 73 } 74 myfile.close(); 75 return 1; 76 }

结果

cudaMallocManaged, memory_size:1MB, duration:360 init, memory_size:1MB, duration:0 cudaFree, memory_size:1MB, **duration:2** cudaMallocManaged, memory_size:2MB, duration:1 init, memory_size:2MB, duration:0 cudaFree, memory_size:2MB, **duration:4** cudaMallocManaged, memory_size:4MB, duration:0 init, memory_size:4MB, duration:0 cudaFree, memory_size:4MB, **duration:9** cudaMallocManaged, memory_size:8MB, duration:0 init, memory_size:8MB, duration:0 cudaFree, memory_size:8MB, **duration:18** cudaMallocManaged, memory_size:16MB, duration:0 init, memory_size:16MB, duration:0 cudaFree, memory_size:16MB, **duration:34** cudaMallocManaged, memory_size:32MB, duration:0 init, memory_size:32MB, duration:0 cudaFree, memory_size:32MB, **duration:69** cudaMallocManaged, memory_size:64MB, duration:0 init, memory_size:64MB, duration:0 cudaFree, memory_size:64MB, **duration:132** cudaMallocManaged, memory_size:128MB, duration:0 init, memory_size:128MB, duration:0 cudaFree, memory_size:128MB, **duration:241** cudaMallocManaged, memory_size:256MB, duration:0 init, memory_size:256MB, duration:0 cudaFree, memory_size:256MB, **duration:476** cudaMallocManaged, memory_size:512MB, duration:0 init, memory_size:512MB, duration:0 cudaFree, memory_size:512MB, **duration:984** cudaMallocManaged, memory_size:1024MB, duration:0 init, memory_size:1024MB, duration:0 cudaFree, memory_size:1024MB, **duration:1910** cudaMallocManaged, memory_size:2048MB, duration:0 init, memory_size:2048MB, duration:1 cudaFree, memory_size:2048MB, **duration:3830** cudaMallocManaged, memory_size:4096MB, duration:0 init, memory_size:4096MB, duration:0 cudaFree, memory_size:4096MB, **duration:7715** cudaMallocManaged, memory_size:8192MB, duration:0 init, memory_size:8192MB, duration:0 cudaFree, memory_size:8192MB, **duration:0** cudaMallocManaged, memory_size:16384MB, duration:0 init, memory_size:16384MB, duration:0 cudaFree, memory_size:16384MB, **duration:0**

    奇怪的是,对于8192MB和16384MB,不仅cudaMallocManaged和init,就连cudaFree显示的耗时也都是0ms。
  • 请赐教

我观察到cudaFree操作的耗时长得令人难以接受。为了验证这一点,我编写了一个简单的微基准测试程序来测量cudaFree的开销,结果显示出同样的趋势。第一步:...

cuda profiling system
1个回答
0
投票
为什么cudaFree会有这么大的开销?
© www.soinside.com 2019 - 2024. All rights reserved.