Is there a way to arbitrarily block and unblock a CUDA stream?


I need to pause execution of all calls enqueued in a stream from a certain point in one part of the program, until another part of the program decides to unpause that stream at some arbitrary later time. This is a requirement of the application I am working on, and I cannot work around it. Ideally I would like to use the graph API (e.g. cudaGraphAddMemcpyNode), but if graphs cannot do this for some reason, regular asynchronous calls (e.g. cudaMemcpyAsync) are acceptable too. From reading the CUDA documentation I assumed there would be an obvious way to do this, but it turned out to be much trickier.

Here is my first attempt, distilled into a simple example:

    cudaGraphCreate(&cuda_graph_cpy, 0);
    cudaGraphAddMemcpyNode1D(&memcpy_h2d_node, cuda_graph_cpy, NULL, 0, device_buf, host_buf, BUF_SIZE * sizeof(char), cudaMemcpyDefault);
    cudaGraphAddEventWaitNode(&wait_node, cuda_graph_cpy, &memcpy_h2d_node, 1, cuda_event);
    cudaGraphAddMemcpyNode1D(&memcpy_d2h_node, cuda_graph_cpy, &wait_node, 1, host_buf, device_buf, BUF_SIZE * sizeof(char), cudaMemcpyDefault);
    cudaGraphInstantiate(&cuda_graph_exec_cpy, cuda_graph_cpy, NULL, NULL, 0);

    cudaGraphCreate(&cuda_graph_set, 0);
    cudaGraphAddMemsetNode(&memset_node, cuda_graph_set, NULL, 0, &memset_params);
    cudaGraphAddEventRecordNode(&record_set_node, cuda_graph_set, &memset_node, 1, cuda_event);
    cudaGraphInstantiate(&cuda_graph_exec_set, cuda_graph_set, NULL, NULL, 0);

    cudaGraphLaunch(cuda_graph_exec_cpy, cuda_stream_cpy);
    cudaGraphLaunch(cuda_graph_exec_set, cuda_stream_set);
    cudaStreamSynchronize(cuda_stream_cpy);

So I create and instantiate one linear graph: do a host-to-device copy, wait on cuda_event, do a device-to-host copy. Then I create and instantiate another linear graph: do a memset on the device memory, record cuda_event.

After that, I launch the first graph on cuda_stream_cpy, then launch the second graph on cuda_stream_set, and then synchronize on cuda_stream_cpy.

At the end I expect host_buf to have been modified, but it is not, because the first graph/stream does not actually wait for anything and proceeds to the second copy immediately.

After rewriting the code with regular async calls instead of graphs and getting the same behavior, reading everything on the subject I could find through Google, and experimenting with flags and extra cudaEventRecord / cudaGraphAddEventRecordNode calls in various places, I realized that CUDA's event semantics apparently cannot express the behavior I need. The problem seems to be that the record and wait calls have to happen at roughly the same time and cannot be decoupled: if no event record has been enqueued yet, a wait (whether an async call or a graph node) does not block the stream, and the stream simply proceeds.
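
For illustration, here is a minimal sketch (not from the original post) of the stream-API version of the same structure, assuming the buffers, streams and event from the example above; on this reading of the event semantics, the cudaStreamWaitEvent below is effectively a no-op when no cudaEventRecord has been enqueued yet, so the D2H copy runs right away:

    cudaEvent_t cuda_event;
    cudaEventCreate(&cuda_event);

    /* "cpy" stream: H2D copy, wait on the event, D2H copy. */
    cudaMemcpyAsync(device_buf, host_buf, BUF_SIZE * sizeof(char), cudaMemcpyDefault, cuda_stream_cpy);
    /* If no record has been enqueued yet, this wait completes immediately
       instead of blocking until some future cudaEventRecord. */
    cudaStreamWaitEvent(cuda_stream_cpy, cuda_event, 0);
    cudaMemcpyAsync(host_buf, device_buf, BUF_SIZE * sizeof(char), cudaMemcpyDefault, cuda_stream_cpy);

    /* "set" stream: memset the device buffer, then record the event. */
    cudaMemsetAsync(device_buf, 1, BUF_SIZE * sizeof(char), cuda_stream_set);
    cudaEventRecord(cuda_event, cuda_stream_set);

    cudaStreamSynchronize(cuda_stream_cpy);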

So what I would like to do is replace the cudaGraphAddEventWaitNode / cudaGraphAddEventRecordNode in the code sample above, or add something to the sample, so that the code works the way I described: the wait node actually blocks the stream until the record node (or whatever replaces it?) unblocks it.

I also found something in CUDA called "external semaphores", which could do what I want (with cudaGraphAddExternalSemaphoresWaitNode / cudaGraphAddExternalSemaphoresSignalNode instead), but it appears they cannot be created without using Vulkan or DirectX, which I cannot bring into my application. I tried passing the file descriptor of a shared-memory object to cudaImportExternalSemaphore as cudaExternalSemaphoreHandleTypeOpaqueFd, but that did not work.
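
Roughly, that attempt looked like the following sketch (reconstructed for illustration; the shm name is hypothetical). As far as I can tell, the import fails because an opaque-fd handle is expected to come from an exporting API such as Vulkan, not from an ordinary shared-memory fd:

    /* Sketch of the failed attempt: importing a plain shared-memory fd as an
       opaque-fd external semaphore. Requires <sys/mman.h>, <fcntl.h>, <unistd.h>. */
    int fd = shm_open("/my_semaphore", O_CREAT | O_RDWR, 0666);
    ftruncate(fd, 4);

    cudaExternalSemaphoreHandleDesc sem_desc = {};
    sem_desc.type = cudaExternalSemaphoreHandleTypeOpaqueFd;
    sem_desc.handle.fd = fd;

    cudaExternalSemaphore_t ext_sem;
    cudaError_t err = cudaImportExternalSemaphore(&ext_sem, &sem_desc);
    /* err comes back as an error rather than cudaSuccess. */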

EDIT: I tried to integrate the wait kernel suggested by @RobertCrovella into my prototype, but it gets stuck in the launch of the first graph. Here is the reproducer:

#include "cuda_runtime_api.h"

#include <stdio.h>
#include <stdlib.h>

#define BUF_SIZE 1024
#define TEST_POS_OLD 512
#define TEST_POS_NEW 10
#define OLD_VAL 5
#define NEW_VAL 23

#define CUDA_CHKERR(x) res = x; if (res != cudaSuccess) goto fail;


__global__ void wait_kernel(volatile unsigned char *event, unsigned char val)
{
    while (*event == val);
}

int main()
{
    cudaError_t res = cudaSuccess;
    const char *err_str = NULL;
    const char *err_name = NULL;

    cudaStream_t cuda_stream_cpy;
    cudaStream_t cuda_stream_set;

    cudaGraph_t cuda_graph_cpy;
    cudaGraphExec_t cuda_graph_exec_cpy;
    cudaGraph_t cuda_graph_set;
    cudaGraphExec_t cuda_graph_exec_set;

    cudaGraphNode_t memcpy_h2d_node;
    cudaGraphNode_t memcpy_d2h_node;
    cudaGraphNode_t memset_node;
    cudaGraphNode_t signal_node;
    cudaGraphNode_t wait_node;

    unsigned char *event;
    unsigned char test = 0;

    dim3 grid(1,1,1);
    dim3 block(1,1,1);
    struct cudaKernelNodeParams kernel_node_params = {};
    struct cudaMemsetParams memset_params = {};
    void *wait_kernel_args[2] = {(void *) &event, (void *) &test};

    char *host_buf = NULL;
    void *device_buf = NULL;

    printf("Creating the event...\n");
    CUDA_CHKERR(cudaMalloc(&event, sizeof(event[0])));
    printf("cudaMalloc\n");
    CUDA_CHKERR(cudaMemset(event, 0, sizeof(event[0])));
    printf("cudaMemset\n");

    printf("Allocating the host buffer and setting the test value...\n");
    host_buf = (char *) malloc(BUF_SIZE * sizeof(char));
    for (int i = 0; i < BUF_SIZE; i++) {
        host_buf[i] = OLD_VAL;
    }

    CUDA_CHKERR(cudaMalloc(&device_buf, BUF_SIZE * sizeof(char)));
    printf("cudaMalloc\n");

    CUDA_CHKERR(cudaStreamCreate(&cuda_stream_cpy));
    printf("cudaStreamCreate cpy\n");
    CUDA_CHKERR(cudaStreamCreate(&cuda_stream_set));
    printf("cudaStreamCreate set\n");

    CUDA_CHKERR(cudaGraphCreate(&cuda_graph_cpy, 0));
    printf("cudaGraphCreate cpy\n");

    CUDA_CHKERR(cudaGraphAddMemcpyNode1D(&memcpy_h2d_node, cuda_graph_cpy, NULL, 0, device_buf, host_buf, BUF_SIZE * sizeof(char), cudaMemcpyDefault));
    printf("cudaGraphAddMemcpyNode1D H2D\n");
    memset(&kernel_node_params, 0, sizeof(cudaKernelNodeParams));
    kernel_node_params.func = (void *)wait_kernel;
    kernel_node_params.gridDim = grid;
    kernel_node_params.blockDim = block;
    kernel_node_params.sharedMemBytes = 0;
    kernel_node_params.kernelParams = wait_kernel_args;
    kernel_node_params.extra = NULL;
    CUDA_CHKERR(cudaGraphAddKernelNode(&wait_node, cuda_graph_cpy, &memcpy_h2d_node, 1, &kernel_node_params));
    printf("cudaGraphAddKernelNode (wait)\n");
    CUDA_CHKERR(cudaGraphAddMemcpyNode1D(&memcpy_d2h_node, cuda_graph_cpy, &wait_node, 1, host_buf, device_buf, BUF_SIZE * sizeof(char), cudaMemcpyDefault));
    printf("cudaGraphAddMemcpyNode1D D2H\n");

    CUDA_CHKERR(cudaGraphInstantiate(&cuda_graph_exec_cpy, cuda_graph_cpy, NULL, NULL, 0));
    printf("cudaGraphInstantiate cpy\n");

    CUDA_CHKERR(cudaGraphCreate(&cuda_graph_set, 0));
    printf("cudaGraphCreate set\n");

    memset(&memset_params, 0, sizeof(cudaMemsetParams));
    memset_params.dst = device_buf;
    memset_params.value = NEW_VAL;
    memset_params.pitch = 0;
    memset_params.elementSize = sizeof(char);
    memset_params.width = 512;
    memset_params.height = 1;
    CUDA_CHKERR(cudaGraphAddMemsetNode(&memset_node, cuda_graph_set, NULL, 0, &memset_params));
    printf("cudaGraphAddMemsetNode\n");
    memset(&memset_params, 0, sizeof(cudaMemsetParams));
    memset_params.dst = event;
    memset_params.value = 1;
    memset_params.pitch = 0;
    memset_params.elementSize = 1;
    memset_params.width = 1;
    memset_params.height = 1;
    CUDA_CHKERR(cudaGraphAddMemsetNode(&signal_node, cuda_graph_set, &memset_node, 1, &memset_params));
    printf("cudaGraphAddMemsetNode (signal)\n");

    CUDA_CHKERR(cudaGraphInstantiate(&cuda_graph_exec_set, cuda_graph_set, NULL, NULL, 0));
    printf("cudaGraphInstantiate set\n");

    CUDA_CHKERR(cudaGraphLaunch(cuda_graph_exec_cpy, cuda_stream_cpy));
    printf("cudaGraphLaunch cpy\n");
    CUDA_CHKERR(cudaGraphLaunch(cuda_graph_exec_set, cuda_stream_set));
    printf("cudaGraphLaunch set\n");
    CUDA_CHKERR(cudaStreamSynchronize(cuda_stream_cpy));
    printf("cudaStreamSynchronize cpy\n");

    CUDA_CHKERR(cudaGraphExecDestroy(cuda_graph_exec_cpy));
    printf("cudaGraphExecDestroy\n");
    CUDA_CHKERR(cudaGraphExecDestroy(cuda_graph_exec_set));
    printf("cudaGraphExecDestroy\n");
    CUDA_CHKERR(cudaGraphDestroy(cuda_graph_cpy));
    printf("cudaGraphDestroy\n");
    CUDA_CHKERR(cudaGraphDestroy(cuda_graph_set));
    printf("cudaGraphDestroy\n");

    CUDA_CHKERR(cudaStreamDestroy(cuda_stream_cpy));
    printf("cudaStreamDestroy cpy\n");
    CUDA_CHKERR(cudaStreamDestroy(cuda_stream_set));
    printf("cudaStreamDestroy set\n");

    if (host_buf[TEST_POS_OLD] == OLD_VAL) {
        printf("host_buf[TEST_POS_OLD] is correct.\n");
    } else {
        printf("host_buf[TEST_POS_OLD] is not correct!\n");
    }
    if (host_buf[TEST_POS_NEW] == NEW_VAL) {
        printf("host_buf[TEST_POS_NEW] is correct.\n");
    } else {
        printf("host_buf[TEST_POS_NEW] is not correct!\n");
        if (host_buf[TEST_POS_OLD] == host_buf[TEST_POS_NEW]) printf("They are equal!\n");
    }

    return 0;
 fail:
    err_name = cudaGetErrorName(res);
    err_str = cudaGetErrorString(res);
    printf("%s: %s\n", err_name, err_str);
    return 1;
}
cuda synchronization gpgpu cuda-streams cuda-events
1 Answer

Despite statements to the contrary in the comments, you are in effect establishing a dependency between the two graphs. With some refactoring, my suggestion would be to combine these activities into a single graph and express the dependency using the mechanisms available in graph capture.

However, given the stated goals:

  1. two graphs
  2. (apparently) using the graph API (not stream capture)

we can accomplish this in a fashion similar to the stream memops mentioned in the comments (available only in the driver API). Basically, we create a kernel that waits on a device memory location, in order to synchronize one graph to another. The graph that is being made to wait launches this kernel to do the synchronization. The other graph signals it via a memset node.

Here is an example:

$ cat t2217.cu
#include <iostream>
#include <vector>
#include <cstdio>
#include <cstdlib>
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

__global__ void calc1kernel(float *data, float val, size_t n){

  size_t idx = threadIdx.x+blockDim.x*blockIdx.x;
  while (idx < n){
    data[idx] += val;
    idx += gridDim.x*blockDim.x;}
}
__global__ void calc2kernel(float *data, float val, size_t n){

  size_t idx = threadIdx.x+blockDim.x*blockIdx.x;
  while (idx < n){
    data[idx] *= val;
    idx += gridDim.x*blockDim.x;}
}
__global__ void waitkernel(volatile unsigned char *signal, unsigned char val){

  while (*signal == val);
}

// CUDA Graph 1:
//                calc1kernelnode
//                     |
//                memsetnode
// CUDA Graph 2:
//                waitkernel
//                     |
//                calc2kernelnode


int main(int argc, char *argv[]){

  size_t data_size = 32;
  cudaStream_t s1, s2;
  cudaGraph_t g1, g2;
  float *data, val;
  unsigned char *sig;
  // allocate for data on the device
  cudaMalloc(&data, data_size*sizeof(data[0]));
  cudaCheckErrors("CUDAMalloc failure");
  cudaMalloc(&sig, sizeof(sig[0]));
  cudaCheckErrors("CUDAMalloc failure");
  cudaMemset(sig, 0, sizeof(sig[0]));
  cudaCheckErrors("CUDAMemset failure");
  cudaMemset(data, 0, data_size*sizeof(data[0]));
  cudaCheckErrors("CUDAMemset failure");
  // create the graph
  cudaGraphCreate(&g1, 0);
  cudaCheckErrors("CUDAGraphCreate failure");
  cudaGraphCreate(&g2, 0);
  cudaCheckErrors("CUDAGraphCreate failure");
  cudaStreamCreate(&s1);
  cudaCheckErrors("CUDAStreamCreate failure");
  cudaStreamCreate(&s2);
  cudaCheckErrors("CUDAStreamCreate failure");
  dim3 grid(1,1,1);
  dim3 block(1,1,1);
  cudaGraphNode_t calc1kernelnode, calc2kernelnode, waitkernelnode, memsetnode;
  // add nodes and their dependencies to the first graph
  cudaKernelNodeParams kernelNodeParams = {0};
  // first add calc1kernelnode, which has no dependencies
  val = 3.0f;
  memset(&kernelNodeParams, 0, sizeof(cudaKernelNodeParams));
  void *kernelargs[3] = {(void *)&data, (void *)&val, (void *)&data_size};
  kernelNodeParams.func = (void *)calc1kernel;
  kernelNodeParams.gridDim = grid;
  kernelNodeParams.blockDim = block;
  kernelNodeParams.sharedMemBytes = 0;
  kernelNodeParams.kernelParams = kernelargs;
  kernelNodeParams.extra = NULL;
  cudaGraphAddKernelNode(&calc1kernelnode, g1, NULL, 0, &kernelNodeParams);
  cudaCheckErrors("CUDAGraphAddKernelNode failure");
  // now add the memsetnode, which has 1 dependency on calc1kernelnode
  cudaMemsetParams memsetParams = {0};
  memset(&memsetParams, 0, sizeof(cudaMemsetParams));
  memsetParams.dst = sig;
  memsetParams.elementSize = 1;
  memsetParams.height = 1;
  memsetParams.pitch = 1;
  memsetParams.value = 1;
  memsetParams.width = 1;
  cudaGraphAddMemsetNode(&memsetnode, g1, &calc1kernelnode, 1, &memsetParams);
  cudaCheckErrors("CUDAGraphAddMemsetNode failure");
  // graph 1 is now defined, next step is to instantiate an executable version of it
  size_t num_nodes = 0;
  cudaGraphNode_t *nodes1 = NULL;
  cudaGraphGetNodes(g1, nodes1, &num_nodes);
  cudaCheckErrors("CUDAGraphGetNodes failure");
  printf("graph 1 num nodes: %lu\n", num_nodes);
  cudaGraphExec_t graphExec1, graphExec2;
  cudaGraphInstantiate(&graphExec1, g1, NULL, NULL, 0);
  cudaCheckErrors("CUDAGraphInstantiate failure");
  // add nodes and their dependencies to the second graph
  // first add waitkernelnode, which has no dependencies
  unsigned char test = 0;
  memset(&kernelNodeParams, 0, sizeof(cudaKernelNodeParams));
  void *waitkernelargs[2] = {(void *) &sig, (void *) &test };
  kernelNodeParams.func = (void *)waitkernel;
  kernelNodeParams.gridDim = grid;
  kernelNodeParams.blockDim = block;
  kernelNodeParams.sharedMemBytes = 0;
  kernelNodeParams.kernelParams = waitkernelargs;
  kernelNodeParams.extra = NULL;
  cudaGraphAddKernelNode(&waitkernelnode, g2, NULL, 0, &kernelNodeParams);
  cudaCheckErrors("CUDAGraphAddKernelNode failure");
  // now add the calc2kernelnode, which has 1 dependency on waitkernelnode
  memset(&kernelNodeParams, 0, sizeof(cudaKernelNodeParams));
  kernelNodeParams.func = (void *)calc2kernel;
  kernelNodeParams.gridDim = grid;
  kernelNodeParams.blockDim = block;
  kernelNodeParams.sharedMemBytes = 0;
  kernelNodeParams.kernelParams = kernelargs;
  kernelNodeParams.extra = NULL;
  cudaGraphAddKernelNode(&calc2kernelnode, g2, &waitkernelnode, 1, &kernelNodeParams);
  cudaCheckErrors("CUDAGraphAddKernelNode failure");
  // graph 2 is now defined, next step is to instantiate an executable version of it
  cudaGraphNode_t *nodes2 = NULL;
  cudaGraphGetNodes(g2, nodes2, &num_nodes);
  cudaCheckErrors("CUDAGraphGetNodes failure");
  printf("graph 2 num nodes: %lu\n", num_nodes);
  cudaGraphInstantiate(&graphExec2, g2, NULL, NULL, 0);
  cudaCheckErrors("CUDAGraphInstantiate failure");
  // now launch the graphs
  cudaGraphLaunch(graphExec2, s2);
  cudaCheckErrors("CUDAGraphLaunch failure");
  cudaGraphLaunch(graphExec1, s1);
  cudaCheckErrors("CUDAGraphLaunch failure");
  cudaStreamSynchronize(s1);
  cudaCheckErrors("graph execution failure");
  cudaStreamSynchronize(s2);
  cudaCheckErrors("graph execution failure");
  float *result = new float[data_size];
  cudaMemcpy(result, data, data_size*sizeof(float), cudaMemcpyDeviceToHost);
  std::cout << "result[0] = " << result[0] << std::endl;
  // clean up
  cudaFree(data);
  cudaStreamDestroy(s1);
  cudaGraphDestroy(g1);
  cudaGraphExecDestroy(graphExec1);
  cudaStreamDestroy(s2);
  cudaGraphDestroy(g2);
  cudaGraphExecDestroy(graphExec2);
}
$ nvcc -o t2217 t2217.cu
$ ./t2217
graph 1 num nodes: 2
graph 2 num nodes: 2
result[0] = 9
$

The result of 9 indicates that even though graph 2 was launched first, it successfully waited at the synchronization point for graph 1 before allowing its calculation kernel to run.

The example given (in the question) shows usage of the runtime API.

If you want to use the driver API, as already indicated in the comments, it should be possible to do this directly via batched memops, using cuGraphAddBatchMemOpNode.
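
As a rough, untested sketch of that approach (the names ctx, graph_waiting, graph_signaling, h2d_node and memset_node are placeholders; it assumes a CUDA toolkit recent enough to provide cuGraphAddBatchMemOpNode and a device reporting CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS):

// Placeholder handles assumed to exist: ctx, graph_waiting, graph_signaling,
// h2d_node, memset_node.
CUdeviceptr sig;                                  // 4-byte device flag
cuMemAlloc(&sig, sizeof(cuuint32_t));
cuMemsetD32(sig, 0, 1);

// Wait node in the waiting graph: blocks until *sig == 1.
CUstreamBatchMemOpParams wait_op = {};
wait_op.operation = CU_STREAM_MEM_OP_WAIT_VALUE_32;
wait_op.waitValue.address = sig;
wait_op.waitValue.value = 1;
wait_op.waitValue.flags = CU_STREAM_WAIT_VALUE_EQ;
CUDA_BATCH_MEM_OP_NODE_PARAMS wait_params = {};
wait_params.ctx = ctx;
wait_params.count = 1;
wait_params.paramArray = &wait_op;
CUgraphNode wait_node;
cuGraphAddBatchMemOpNode(&wait_node, graph_waiting, &h2d_node, 1, &wait_params);

// Write node in the signaling graph: sets *sig to 1.
CUstreamBatchMemOpParams write_op = {};
write_op.operation = CU_STREAM_MEM_OP_WRITE_VALUE_32;
write_op.writeValue.address = sig;
write_op.writeValue.value = 1;
CUDA_BATCH_MEM_OP_NODE_PARAMS write_params = {};
write_params.ctx = ctx;
write_params.count = 1;
write_params.paramArray = &write_op;
CUgraphNode signal_node;
cuGraphAddBatchMemOpNode(&signal_node, graph_signaling, &memset_node, 1, &write_params);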

This kind of interlocking can cause hangs and deadlocks if used incorrectly. Pay attention to the various warnings given, e.g.:

Warning: Improper use of this API may deadlock the application. Synchronization ordering established through this API is not visible to CUDA. CUDA tasks that are (even indirectly) ordered by this API should also have that order expressed with CUDA-visible dependencies such as events. ...
