如何在 Nsight Systems 中显示嵌套的 cuda 内核调用

Question

想象我有两个这样的内核：

// Child kernel (placeholder body): this is the kernel that kernel_1 launches
// from device code.
__global__ void kernel_2(...) {
  // do something
}
// Parent kernel: launches kernel_2 from device code (a nested launch) and then
// executes a block-level barrier.
__global__ void kernel_1(...) {
  // Device-side launch of the child kernel.
  kernel_2<<<n, m>>>(...);
  // Barrier among the threads of this block.
  __syncthreads();
}
// Host entry point: launches kernel_1 and then synchronizes with the device.
int main() {
  kernel_1<<<x, y>>>(...);
  // Block the host until the previously issued device work has completed.
  cudaDeviceSynchronize();
}

一切正常，没问题。但我想在 Nsight Systems 中查看这两个内核的运行时间，却只能看到 kernel_1 的运行时间，随后是一段空白间隙。我认为这段间隙就是 kernel_2 的运行时间，但时间线上并没有把它显示出来。

有什么办法可以让它显示在那里吗？

Answer 1

可以使用 NVTX 手动标记从设备端启动的内核的开始和结束。这不如分析独立内核那样准确，但可以得到近似的运行时间。

以下代码使用托管内存中的 cuda::latch 向主机通知内核的开始和结束，然后由主机把自定义范围插入 nsys 的性能分析结果中。我使用的是 CUDA 12.3。

//nvcc -O3 -arch=sm_86 -std=c++17 -lineinfo -g -rdc=true -lnvToolsExt main.cu -o main
#include <iostream>
#include <cstring>
#include <string>

#include <nvToolsExt.h>
#include <cuda/latch>

using MyLatch = cuda::latch<cuda::thread_scope_system>;

/**
 * Starts an NVTX range whose label is the given ASCII string.
 *
 * @param name  Text attached to the range; its c_str() pointer is handed to
 *              nvtxRangePushEx, so it only has to stay valid for that call
 *              as far as this function is concerned.
 */
void push_range(const std::string& name){
    nvtxEventAttributes_t attrs;
    // Zero the whole descriptor first so every field not set below is 0.
    std::memset(&attrs, 0, sizeof(attrs));

    // Label of the range.
    attrs.messageType = NVTX_MESSAGE_TYPE_ASCII;
    attrs.message.ascii = name.c_str();

    // Header fields identifying the structure layout to NVTX.
    attrs.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
    attrs.version = NVTX_VERSION;

    nvtxRangePushEx(&attrs);
}

/**
 * Ends an NVTX range by calling nvtxRangePop(); counterpart of push_range().
 */
void pop_range(){
    // The value returned by nvtxRangePop() is intentionally not used.
    static_cast<void>(nvtxRangePop());
}


/**
 * Child kernel whose start and end are signalled through two latches.
 *
 * The thread with global linear index 0 counts down `latch1` before the
 * sleep loop and `latch2` after it. All threads execute the sleep loop.
 *
 * @param latch1  Counted down once when the kernel begins.
 * @param latch2  Counted down once when the kernel is about to finish.
 */
__global__ 
void kernel_2(MyLatch* latch1, MyLatch* latch2) {
    // Only the thread whose global linear index is 0 touches the latches.
    const bool isSignallingThread = (blockIdx.x * blockDim.x + threadIdx.x) == 0;

    if(isSignallingThread){
        latch1->count_down();
    }

    // Stand-in workload: 20 calls to __nanosleep(1'000'000).
    int remaining = 20;
    while(remaining-- > 0){
        __nanosleep(1'000'000);
    }

    if(isSignallingThread){
        latch2->count_down();
    }
}

/**
 * Parent kernel: sleeps, launches kernel_2 from device code, then sleeps again.
 *
 * Every thread that runs this kernel performs the child launch; the host code
 * in this file launches it with a single thread.
 *
 * @param latch1  Forwarded unchanged to kernel_2.
 * @param latch2  Forwarded unchanged to kernel_2.
 */
__global__ 
void kernel_1(MyLatch* latch1, MyLatch* latch2) {
    // Stand-in workload before the child launch: 50 sleep calls.
    int sleepsBefore = 50;
    while(sleepsBefore-- > 0){
        __nanosleep(1'000'000);
    }

    // Device-side launch of the child kernel: one block of one thread, no
    // dynamic shared memory, on the cudaStreamFireAndForget stream.
    kernel_2<<<1, 1, 0, cudaStreamFireAndForget>>>(latch1, latch2);

    // Stand-in workload after the child launch: another 50 sleep calls.
    int sleepsAfter = 50;
    while(sleepsAfter-- > 0){
        __nanosleep(1'000'000);
    }
}

/**
 * Host driver: runs kernel_1 five times and, for each run, wraps the lifetime
 * of the device-launched kernel_2 in an NVTX range named "kernel_2".
 *
 * The host learns about the start and end of kernel_2 through two latches in
 * managed memory: it waits on latch1 (counted down when kernel_2 starts),
 * opens the range, waits on latch2 (counted down when kernel_2 ends) and
 * closes the range.
 *
 * @return 0 on success, 1 if any CUDA call fails or the device cannot share
 *         managed memory with the host while a kernel is running.
 */
int main() {
    // Reports a failed CUDA call on stderr; returns true when the call succeeded.
    auto succeeded = [](cudaError_t status, const char* what) {
        if (status == cudaSuccess) {
            return true;
        }
        std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
        return false;
    };

    // The host waits on the latches while kernel_1 is still running, so the
    // device must allow concurrent host/device access to managed memory.
    int device = 0;
    int concurrentManagedAccess = 0;
    if (!succeeded(cudaGetDevice(&device), "cudaGetDevice") ||
        !succeeded(cudaDeviceGetAttribute(&concurrentManagedAccess,
                                          cudaDevAttrConcurrentManagedAccess,
                                          device),
                   "cudaDeviceGetAttribute")) {
        return 1;
    }
    if (concurrentManagedAccess == 0) {
        std::cerr << "device does not support concurrent managed memory access; "
                     "the host cannot wait on the latches while the kernel runs"
                  << std::endl;
        return 1;
    }

    MyLatch* latch1 = nullptr;
    MyLatch* latch2 = nullptr;
    if (!succeeded(cudaMallocManaged(&latch1, sizeof(MyLatch)), "cudaMallocManaged(latch1)")) {
        return 1;
    }
    if (!succeeded(cudaMallocManaged(&latch2, sizeof(MyLatch)), "cudaMallocManaged(latch2)")) {
        static_cast<void>(cudaFree(latch1));
        return 1;
    }

    bool ok = true;
    for(int i = 0; i < 5 && ok; i++){
        // Re-create both latches with an expected count of 1 for this iteration.
        new (latch1) MyLatch(1);
        new (latch2) MyLatch(1);

        kernel_1<<<1, 1>>>(latch1, latch2);
        // If the launch itself failed nothing will ever count the latches
        // down, and waiting on them would hang the host forever.
        if (!succeeded(cudaGetLastError(), "kernel_1 launch")) {
            ok = false;
            break;
        }

        latch1->wait();          // kernel_2 has started
        push_range("kernel_2");
        latch2->wait();          // kernel_2 is about to finish
        pop_range();

        ok = succeeded(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
    }

    // Release the managed allocations on every exit path past this point.
    static_cast<void>(cudaFree(latch2));
    static_cast<void>(cudaFree(latch1));

    return ok ? 0 : 1;
}

使用 Nsight Systems 2023.3.1 对其进行分析，可以看到 kernel_2 的自定义标记。

如何在 Nsight Systems 中显示嵌套的 cuda 内核调用

问题描述投票：0回答：1

1个回答

最新问题

如何在 Nsight Systems 中显示嵌套的 cuda 内核调用

问题描述 投票：0回答：1

1个回答

最新问题

问题描述投票：0回答：1