我编写了以下程序来检查我的网格块数和每块线程数是否在设备的限制范围内。
它给了我以下输出:
Device 0: Tesla K80
Maximum threads per block: 1024
Maximum blocks per grid : total: 2147614719
Maximum blocks per grid : x dimenson: 2147483647
Maximum blocks per grid : y dimenson: 65535
Maximum blocks per grid : z dimenson: 65535
threads per block : 4096
blocks per grid dimensions : (62500, 62500, 62500)
为什么每个网格的最大块数在 x 维度上大得不成比例?
源代码
#include <stdio.h>
#include <iostream>
#include <cuda_runtime_api.h>
/*
 * Lists every CUDA device with its maximum threads per block, prints the
 * grid-size limits of device 0, then builds a sample launch configuration
 * for `length` elements per axis and reports it.
 *
 * Returns 0 on success, 1 if a CUDA runtime query fails or no device exists.
 * A configuration that exceeds the limits of device 0 is reported on stderr
 * but does not change the exit code.
 */
int main()
{
    // Zero-initialised so a failed query can never leave an indeterminate count.
    int deviceCount = 0;
    cudaError_t status = cudaGetDeviceCount(&deviceCount);
    if (status != cudaSuccess)
    {
        fprintf(stderr, "cudaGetDeviceCount failed: %s\n", cudaGetErrorString(status));
        return 1;
    }
    if (deviceCount == 0)
    {
        // Device 0 is queried below, so there is nothing meaningful to print.
        fprintf(stderr, "No CUDA devices found\n");
        return 1;
    }

    for (int i = 0; i < deviceCount; i++)
    {
        cudaDeviceProp deviceProp;
        status = cudaGetDeviceProperties(&deviceProp, i);
        if (status != cudaSuccess)
        {
            fprintf(stderr, "cudaGetDeviceProperties(%d) failed: %s\n", i, cudaGetErrorString(status));
            return 1;
        }
        printf("Device %d: %s\n", i, deviceProp.name);
        printf(" Maximum threads per block: %d\n", deviceProp.maxThreadsPerBlock);
    }

    dim3 maxGridSize;
    int deviceID = 0; // use device 0
    cudaDeviceProp deviceProp;
    status = cudaGetDeviceProperties(&deviceProp, deviceID);
    if (status != cudaSuccess)
    {
        fprintf(stderr, "cudaGetDeviceProperties(%d) failed: %s\n", deviceID, cudaGetErrorString(status));
        return 1;
    }
    maxGridSize.x = deviceProp.maxGridSize[0];
    maxGridSize.y = deviceProp.maxGridSize[1];
    maxGridSize.z = deviceProp.maxGridSize[2];

    // dim3 members are 32-bit unsigned; multiplying them directly wraps
    // modulo 2^32, so widen the first operand to do the product in 64 bits.
    const unsigned long long maxTotalBlocks =
        static_cast<unsigned long long>(maxGridSize.x) * maxGridSize.y * maxGridSize.z;
    std::cout << "Maximum blocks per grid : total: " << maxTotalBlocks << std::endl;
    std::cout << " Maximum blocks per grid : x dimenson: " << maxGridSize.x << std::endl;
    std::cout << " Maximum blocks per grid : y dimenson: " << maxGridSize.y << std::endl;
    std::cout << " Maximum blocks per grid : z dimenson: " << maxGridSize.z << std::endl;

    ////////////////////////////////////////////
    int length = 1000000;
    dim3 threads_per_block(16, 16, 16);
    // Ceiling division: enough blocks along each axis to cover `length` elements.
    dim3 blocks_per_grid((length + threads_per_block.x - 1) / threads_per_block.x,
                         (length + threads_per_block.y - 1) / threads_per_block.y,
                         (length + threads_per_block.z - 1) / threads_per_block.z);

    // Derived from the actual configuration instead of a repeated literal.
    const unsigned long long threadCount =
        static_cast<unsigned long long>(threads_per_block.x) * threads_per_block.y * threads_per_block.z;
    printf("\nthreads per block : %llu\n", threadCount);
    // %u because dim3 members are unsigned int.
    printf("\nblocks per grid dimensions : (%u, %u, %u)\n", blocks_per_grid.x, blocks_per_grid.y, blocks_per_grid.z);

    // Compare the requested configuration with the limits of device 0.
    // Warnings go to stderr so the stdout report keeps its original layout.
    if (threadCount > static_cast<unsigned long long>(deviceProp.maxThreadsPerBlock))
    {
        fprintf(stderr, "WARNING: %llu threads per block exceeds the device maximum of %d\n",
                threadCount, deviceProp.maxThreadsPerBlock);
    }
    if (threads_per_block.x > static_cast<unsigned int>(deviceProp.maxThreadsDim[0]) ||
        threads_per_block.y > static_cast<unsigned int>(deviceProp.maxThreadsDim[1]) ||
        threads_per_block.z > static_cast<unsigned int>(deviceProp.maxThreadsDim[2]))
    {
        fprintf(stderr, "WARNING: block dimensions (%u, %u, %u) exceed the device maximum of (%d, %d, %d)\n",
                threads_per_block.x, threads_per_block.y, threads_per_block.z,
                deviceProp.maxThreadsDim[0], deviceProp.maxThreadsDim[1], deviceProp.maxThreadsDim[2]);
    }
    if (blocks_per_grid.x > maxGridSize.x ||
        blocks_per_grid.y > maxGridSize.y ||
        blocks_per_grid.z > maxGridSize.z)
    {
        fprintf(stderr, "WARNING: grid dimensions (%u, %u, %u) exceed the device maximum of (%u, %u, %u)\n",
                blocks_per_grid.x, blocks_per_grid.y, blocks_per_grid.z,
                maxGridSize.x, maxGridSize.y, maxGridSize.z);
    }
    ////////////////////////////////////////////
    return 0;
}