我观察到cudaFree操作需要花费长时间才能接受。为了验证这一点,我编写了一个简单的微基准测试代码来测试cudaFree开销。它显示出类似的趋势。
第一步:cudaMallocManaged
不需要那么长时间。实际上,仅对于cudaMallocManaged,它小于1ms。
第二步:init
要初始化分配的内存区域,将调用init函数。而且它也不会少于1毫秒。
第三步:cudaFree
现在,出现开销。 需要很多时间。还有一件事情是更大的内存大小,需要更长的时间。(漂亮地线性排列)
[[问题是“为什么cudaFree会有这么大的开销?由于安全性问题,它会用零填充整个内存区域吗?或者它经过的任何其他关键路径?”
这里是代码和测量结果。预先谢谢你:) !!! 10 int getMilliCount(){
11 timeb tb;
12 ftime(&tb);
13 int nCount = tb.millitm + (tb.time & 0xfffff) * 1000;
14 return nCount;
15 }
16
17 int getTimeDiff(int baseTime){
18 int diff = getMilliCount() - baseTime;
19 return diff;
20 }
21
22 __global__ void init(int* x, size_t bytes_){
23 int num_ = bytes_/sizeof(int);
24 for (int i=0; i<num_; i++){
25 x[i] = i;
26 }
27 }
28
29 int main(){
30 printf("sizeof(size_t): %zu\n", sizeof(size_t));
31 printf("sizeof(unsigned int): %zu\n", sizeof(unsigned int));
32 printf("sizeof(int): %zu\n", sizeof(int));
33 printf("sizeof(long): %zu\n", sizeof(long));
34
35 std::ofstream myfile;
36 myfile.open("output3.csv");
37 myfile<<"operation, num_bytes, start, end, duration\n";
38 int baseTime = getMilliCount();
39 int* dptr;
40 int ts1 = 0;
41 int ts2 = 0;
42 size_t KB = 1024; // start from 1KB
43 int num_trial_ = 1;
44 for (int j=10; j<25; j++){
45 size_t num_bytes_ = KB<<j;
46 for (int i=0; i<num_trial_; i++){
47 // measuring cudaMallocManaged
48 ts1 = getTimeDiff(baseTime);
>> 49 cudaMallocManaged((void**)&dptr, num_bytes_);
50 ts2 = getTimeDiff(baseTime);
51 myfile<<"cudaMallocManaged, "<<num_bytes_/(1024*1024)<<","<<ts1<<","<<ts2<<","<<ts2-ts1<<"\n";
52 //printf("cudaMallocManaged, memory_size:%zuMB, start:%d, end:%d, duration:%d\n", num_bytes_/(1024*1024), ts1, ts2, ts2-ts1);
53 printf("cudaMallocManaged, memory_size:%zuMB, duration:%d\n", num_bytes_/(1024*1024), ts2-ts1);
54
55 // measuring initialization
56 ts1 = getTimeDiff(baseTime);
>> 57 init<<<1,1>>>(dptr, num_bytes_);
58 ts2 = getTimeDiff(baseTime);
59 myfile<<"initialization, "<<num_bytes_/(1024*1024)<<","<<ts1<<","<<ts2<<","<<ts2-ts1<<"\n";
60 //printf("init, memory_size:%zuMB, start:%d, end:%d, duration:%d\n", num_bytes_/(1024*1024), ts1, ts2, ts2-ts1);
61 printf("init, memory_size:%zuMB, duration:%d\n", num_bytes_/(1024*1024), ts2-ts1);
62
63 // measuring cudaFree
64 ts1 = getTimeDiff(baseTime);
>> 65 cudaFree(dptr);
66 ts2 = getTimeDiff(baseTime);
67 myfile<<"cudaFree, "<<num_bytes_/(1024*1024)<<","<<ts1<<","<<ts2<<","<<ts2-ts1<<"\n";
68 //printf("cudaFree, memory_size:%zuMB, start:%d, end:%d, duration:%d\n", num_bytes_/(1024*1024), ts1, ts2, ts2-ts1);
69 printf("cudaFree, memory_size:%zuMB, duration:%d\n", num_bytes_/(1024*1024), ts2-ts1);
70 sleep(1);
71 printf("\n");
72 }
73 }
74 myfile.close();
75 return 1;
76 }
结果
cudaMallocManaged, memory_size:1MB, duration:360 init, memory_size:1MB, duration:0 cudaFree, memory_size:1MB, **duration:2** cudaMallocManaged, memory_size:2MB, duration:1 init, memory_size:2MB, duration:0 cudaFree, memory_size:2MB, **duration:4** cudaMallocManaged, memory_size:4MB, duration:0 init, memory_size:4MB, duration:0 cudaFree, memory_size:4MB, **duration:9** cudaMallocManaged, memory_size:8MB, duration:0 init, memory_size:8MB, duration:0 cudaFree, memory_size:8MB, **duration:18** cudaMallocManaged, memory_size:16MB, duration:0 init, memory_size:16MB, duration:0 cudaFree, memory_size:16MB, **duration:34** cudaMallocManaged, memory_size:32MB, duration:0 init, memory_size:32MB, duration:0 cudaFree, memory_size:32MB, **duration:69** cudaMallocManaged, memory_size:64MB, duration:0 init, memory_size:64MB, duration:0 cudaFree, memory_size:64MB, **duration:132** cudaMallocManaged, memory_size:128MB, duration:0 init, memory_size:128MB, duration:0 cudaFree, memory_size:128MB, **duration:241** cudaMallocManaged, memory_size:256MB, duration:0 init, memory_size:256MB, duration:0 cudaFree, memory_size:256MB, **duration:476** cudaMallocManaged, memory_size:512MB, duration:0 init, memory_size:512MB, duration:0 cudaFree, memory_size:512MB, **duration:984** cudaMallocManaged, memory_size:1024MB, duration:0 init, memory_size:1024MB, duration:0 cudaFree, memory_size:1024MB, **duration:1910** cudaMallocManaged, memory_size:2048MB, duration:0 init, memory_size:2048MB, duration:1 cudaFree, memory_size:2048MB, **duration:3830** cudaMallocManaged, memory_size:4096MB, duration:0 init, memory_size:4096MB, duration:0 cudaFree, memory_size:4096MB, **duration:7715** cudaMallocManaged, memory_size:8192MB, duration:0 init, memory_size:8192MB, duration:0 cudaFree, memory_size:8192MB, **duration:0** cudaMallocManaged, memory_size:16384MB, duration:0 init, memory_size:16384MB, duration:0 cudaFree, memory_size:16384MB, **duration:0**
我观察到cudaFree操作需要花费长时间才能接受。为了验证这一点,我编写了一个简单的微基准测试代码来测试cudaFree开销。它显示了类似的趋势。第一步:...
为什么cudaFree会有这么大的开销?