I wrote a simple kernel to test memory access patterns:
__global__ void kernel_A(float *g_data, int dimx, int dimy, int niterations) {
    for (int iy = blockIdx.y * blockDim.y + threadIdx.y; iy < dimy; iy += blockDim.y * gridDim.y) {
        for (int ix = blockIdx.x * blockDim.x + threadIdx.x; ix < dimx; ix += blockDim.x * gridDim.x) {
            int idx = iy * dimx + ix;
            float value = g_data[idx];
            for (int i = 0; i < niterations; i++) {
                value += __fsqrt_rn(__logf(value) + 1.f);
            }
            g_data[idx] = value;
        }
    }
}
First I launched it with a 1D block and a 1D grid:
void launchKernel(float *d_data, int dimx, int dimy, int niterations) {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    int num_sms = prop.multiProcessorCount;
    int num_threads = 128;
    int num_blocks;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks, kernel_A, num_threads, 0);
    num_blocks *= num_sms;
    dim3 block(num_threads);
    dim3 grid(num_blocks);
    kernel_A<<<grid, block>>>(d_data, dimx, dimy, niterations);
}
I also did another launch, this time with a 2D grid:
void launchKernel(float *d_data, int dimx, int dimy, int niterations) {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    int num_sms = prop.multiProcessorCount;
    int num_threads = 128;
    int num_blocks;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks, kernel_A, num_threads, 0);
    num_blocks *= num_sms;
    dim3 block(num_threads);
    dim3 grid(1, num_blocks);
    kernel_A<<<grid, block>>>(d_data, dimx, dimy, niterations);
}
Then I ran this benchmark on an RTX 4090, with dimx and dimy both equal to 8*1024 and the same niterations=5, but I got completely different results:
I don't understand this, because from a memory-access-pattern standpoint all the threads look fine: each element is read/written exactly once, so I don't think the GPU L2 cache should help much. If I understand correctly, the L2 cache reads a whole cache line, and each thread fetches its data from global memory using multiple transactions; I don't see a spatial locality benefit.
If you print out all the relevant data and think about it carefully, you can probably get a better understanding of what your code is doing.
In the first launch (the "1D" grid), all threads have iy starting at zero, and the grid stride in y is 1. Therefore the outer loop iterates dimy times. On an sm_89 device with CUDA 12.2, the max-active-blocks occupancy API returns 12 for me, which makes sense: 12x128 = 1536, the maximum possible number of threads on an sm_89 SM. Your RTX 4090 has 128 SMs, so the total number of blocks you launch is 12x128, i.e. 1536 blocks. The total number of threads in this case is 1536x128 = 196,608. That is effectively the grid width in x for the first launch. And as we have already established, the grid width in y is 1.
So how will the 2D grid-stride loops in the kernel behave?
On the first iteration of the iy loop, there will be 196,608 threads in x, but only 8x1024 = 8192 threads are needed to cover the dataset width. So of the 196,608 threads you launched, only 8192 will do any useful work for that iteration of iy; the rest will simply perform the ix < dimx test and do nothing. That does not sound like a sensible use of resources.
Of course, the same behavior plays out on every iteration of iy: 8192 threads do something useful, and the rest do not. So if we naively apply this to the GPU (since you have sized the grid to match the GPU), we can say we are using 8192/196,608 of the GPU in a useful way, i.e. about 4.2% of the GPU's available capability.
Of course, you might say: doesn't the grid-stride loop help? Won't those useless blocks iterate quickly through the iy range and then retire? Yes, they will, but they leave empty space behind on the GPU. The work that could be done on the actual dataset does not get scheduled there, because it has already been assigned to other threads/blocks. So you really only have about 4.2% of the GPU working on the problem.
In the second launch case (the "2D" grid), things are different. The "patch" that a single grid stride covers is not purely linear in x. It is a two-dimensional patch, composed of the threads in x (128) and the blocks in y (1536). I won't work through all the same arithmetic, but this differently shaped "patch" means that on many/most iterations of the grid stride, the entire grid (all threads) is doing useful work. So your GPU utilization is much higher in this case.
If this treatment is correct, then we should observe a significant difference in average SM utilization. I don't have an RTX 4090 to work with, but I do have an sm_89 device: an L4 GPU with 58 SMs. So it still has the "mismatched" shape problem, just not as severely as an RTX 4090 with 128 SMs.
The SOL section of Nsight Compute includes a measurement of SM utilization, referred to as "Compute (SM) Throughput" in the CLI output. Here is what running the code looks like on the L4 GPU with 58 SMs:
# cat t139.cu
#include <iostream>
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL

unsigned long long dtime_usec(unsigned long long start=0){
    timeval tv;
    gettimeofday(&tv, 0);
    return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}

__global__ void kernel_A(float *g_data, int dimx, int dimy, int niterations) {
    for (int iy = blockIdx.y * blockDim.y + threadIdx.y; iy < dimy; iy += blockDim.y * gridDim.y) {
        for (int ix = blockIdx.x * blockDim.x + threadIdx.x; ix < dimx; ix += blockDim.x * gridDim.x) {
            int idx = iy * dimx + ix;
            float value = g_data[idx];
            for (int i = 0; i < niterations; i++) {
                value += __fsqrt_rn(__logf(value) + 1.f);
            }
            g_data[idx] = value;
        }
    }
}

void launchKernel(float * d_data, int dimx, int dimy, int niterations, int type=0) {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    int num_sms = prop.multiProcessorCount;
    int num_threads = 128;
    int num_blocks;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks, kernel_A, num_threads, 0);
    num_blocks *= num_sms;
    std::cout << "num blocks: " << num_blocks << std::endl;
    dim3 block(num_threads);
    dim3 grid;
    if (type)
        grid = dim3(1, num_blocks);
    else
        grid = dim3(num_blocks);
    kernel_A<<<grid, block>>>(d_data, dimx, dimy, niterations);
}

int main(){
    int niterations = 5;
    int dimx = 8*1024;
    int dimy = 8*1024;
    float *d_data;
    cudaMalloc(&d_data, sizeof(*d_data)*dimx*dimy);
    launchKernel(d_data, dimx, dimy, niterations, 0);
    cudaDeviceSynchronize();
    unsigned long long dt = dtime_usec(0);
    launchKernel(d_data, dimx, dimy, niterations, 0);
    cudaDeviceSynchronize();
    dt = dtime_usec(dt);
    std::cout << "elapsed 0: " << dt/(float)USECPSEC << "s" << std::endl;
    dt = dtime_usec(0);
    launchKernel(d_data, dimx, dimy, niterations, 1);
    cudaDeviceSynchronize();
    dt = dtime_usec(dt);
    std::cout << "elapsed 1: " << dt/(float)USECPSEC << "s" << std::endl;
}
# nvcc -o t139 t139.cu -arch=sm_89
# ncu ./t139
==PROF== Connected to process 137887 (/root/bobc/t139)
num blocks: 696
==PROF== Profiling "kernel_A" - 0: 0%....50%....100% - 9 passes
num blocks: 696
==PROF== Profiling "kernel_A" - 1: 0%....50%....100% - 9 passes
elapsed 0: 0.478202s
num blocks: 696
==PROF== Profiling "kernel_A" - 2: 0%....50%....100% - 9 passes
elapsed 1: 0.342003s
==PROF== Disconnected from process 137887
[137887] [email protected]
kernel_A(float *, int, int, int) (696, 1, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 8.9
Section: GPU Speed Of Light Throughput
----------------------- ------------- ------------
Metric Name Metric Unit Metric Value
----------------------- ------------- ------------
DRAM Frequency cycle/nsecond 6.25
SM Frequency cycle/usecond 827.17
Elapsed Cycles cycle 14,037,832
Memory Throughput % 12.69
DRAM Throughput % 12.69
Duration msecond 16.77
L1/TEX Cache Throughput % 3.07
L2 Cache Throughput % 3.12
SM Active Cycles cycle 5,878,996.60
Compute (SM) Throughput % 17.66
----------------------- ------------- ------------
OPT This kernel exhibits low compute throughput and memory bandwidth utilization relative to the peak performance
of this device. Achieved compute throughput and/or memory bandwidth below 60.0% of peak typically indicate
latency issues. Look at Scheduler Statistics and Warp State Statistics for potential reasons.
Section: Launch Statistics
-------------------------------- --------------- ---------------
Metric Name Metric Unit Metric Value
-------------------------------- --------------- ---------------
Block Size 128
Function Cache Configuration CachePreferNone
Grid Size 696
Registers Per Thread register/thread 22
Shared Memory Configuration Size Kbyte 32.77
Driver Shared Memory Per Block Kbyte/block 1.02
Dynamic Shared Memory Per Block byte/block 0
Static Shared Memory Per Block byte/block 0
Threads thread 89,088
Waves Per SM 1
-------------------------------- --------------- ---------------
Section: Occupancy
------------------------------- ----------- ------------
Metric Name Metric Unit Metric Value
------------------------------- ----------- ------------
Block Limit SM block 24
Block Limit Registers block 21
Block Limit Shared Mem block 32
Block Limit Warps block 12
Theoretical Active Warps per SM warp 48
Theoretical Occupancy % 100
Achieved Occupancy % 35.36
Achieved Active Warps Per SM warp 16.97
------------------------------- ----------- ------------
OPT Estimated Speedup: 64.64%
This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated
theoretical (100.0%) and measured achieved occupancy (35.4%) can be the result of warp scheduling overheads
or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block
as well as across blocks of the same kernel. See the CUDA Best Practices Guide
(https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on
optimizing occupancy.
kernel_A(float *, int, int, int) (696, 1, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 8.9
Section: GPU Speed Of Light Throughput
----------------------- ------------- ------------
Metric Name Metric Unit Metric Value
----------------------- ------------- ------------
DRAM Frequency cycle/nsecond 6.15
SM Frequency cycle/usecond 798.00
Elapsed Cycles cycle 14,171,756
Memory Throughput % 12.32
DRAM Throughput % 12.32
Duration msecond 17.54
L1/TEX Cache Throughput % 3.05
L2 Cache Throughput % 3.08
SM Active Cycles cycle 5,926,135.90
Compute (SM) Throughput % 17.69
----------------------- ------------- ------------
OPT This kernel exhibits low compute throughput and memory bandwidth utilization relative to the peak performance
of this device. Achieved compute throughput and/or memory bandwidth below 60.0% of peak typically indicate
latency issues. Look at Scheduler Statistics and Warp State Statistics for potential reasons.
Section: Launch Statistics
-------------------------------- --------------- ---------------
Metric Name Metric Unit Metric Value
-------------------------------- --------------- ---------------
Block Size 128
Function Cache Configuration CachePreferNone
Grid Size 696
Registers Per Thread register/thread 22
Shared Memory Configuration Size Kbyte 32.77
Driver Shared Memory Per Block Kbyte/block 1.02
Dynamic Shared Memory Per Block byte/block 0
Static Shared Memory Per Block byte/block 0
Threads thread 89,088
Waves Per SM 1
-------------------------------- --------------- ---------------
Section: Occupancy
------------------------------- ----------- ------------
Metric Name Metric Unit Metric Value
------------------------------- ----------- ------------
Block Limit SM block 24
Block Limit Registers block 21
Block Limit Shared Mem block 32
Block Limit Warps block 12
Theoretical Active Warps per SM warp 48
Theoretical Occupancy % 100
Achieved Occupancy % 35.28
Achieved Active Warps Per SM warp 16.93
------------------------------- ----------- ------------
OPT Estimated Speedup: 64.72%
This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated
theoretical (100.0%) and measured achieved occupancy (35.3%) can be the result of warp scheduling overheads
or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block
as well as across blocks of the same kernel. See the CUDA Best Practices Guide
(https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on
optimizing occupancy.
kernel_A(float *, int, int, int) (1, 696, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 8.9
Section: GPU Speed Of Light Throughput
----------------------- ------------- ------------
Metric Name Metric Unit Metric Value
----------------------- ------------- ------------
DRAM Frequency cycle/nsecond 6.24
SM Frequency cycle/usecond 806.90
Elapsed Cycles cycle 2,160,446
Memory Throughput % 79.10
DRAM Throughput % 79.10
Duration msecond 2.64
L1/TEX Cache Throughput % 16.89
L2 Cache Throughput % 20.03
SM Active Cycles cycle 2,083,805.60
Compute (SM) Throughput % 74.22
----------------------- ------------- ------------
INF Compute and Memory are well-balanced: To reduce runtime, both computation and memory traffic must be reduced.
Check both the Compute Workload Analysis and Memory Workload Analysis sections.
Section: Launch Statistics
-------------------------------- --------------- ---------------
Metric Name Metric Unit Metric Value
-------------------------------- --------------- ---------------
Block Size 128
Function Cache Configuration CachePreferNone
Grid Size 696
Registers Per Thread register/thread 22
Shared Memory Configuration Size Kbyte 32.77
Driver Shared Memory Per Block Kbyte/block 1.02
Dynamic Shared Memory Per Block byte/block 0
Static Shared Memory Per Block byte/block 0
Threads thread 89,088
Waves Per SM 1
-------------------------------- --------------- ---------------
Section: Occupancy
------------------------------- ----------- ------------
Metric Name Metric Unit Metric Value
------------------------------- ----------- ------------
Block Limit SM block 24
Block Limit Registers block 21
Block Limit Shared Mem block 32
Block Limit Warps block 12
Theoretical Active Warps per SM warp 48
Theoretical Occupancy % 100
Achieved Occupancy % 90.75
Achieved Active Warps Per SM warp 43.56
------------------------------- ----------- ------------
INF This kernel's theoretical occupancy is not impacted by any block limit.
#
We see that in the first and second launches, both of which I issued as "type 0", i.e. the "1D" case, SM throughput is about 17%. In the 3rd launch, my "type 1", corresponding to your "2D" case, SM throughput is about 75%. Furthermore, the Nsight Compute "rules" or expert-system output for the 1D case mentions the following:
相对于该设备的峰值性能,该内核的计算吞吐量和内存带宽利用率较低。
The low-compute-throughput observation corresponds directly to the "Compute (SM) Throughput" measurement. In the 2D case that notice is not given; instead it states:
计算和内存平衡良好:
My L4 GPU has only 58 SMs compared to your 128, so the shape-mismatch problem, while serious, is not as serious as on your RTX 4090. I would expect the comparison above to look even worse there (the "1D" case would probably show even lower SM throughput on the RTX 4090).