I need to pause the execution of all calls in a stream from a certain point in one part of the program, until another part of the program decides to unpause the stream at an arbitrary time. This is a requirement of the application I am working on and I cannot work around it. Ideally I would like to use the graph API (e.g. cudaGraphAddMemcpyNode), but regular async calls (e.g. cudaMemcpyAsync) are also acceptable if graphs cannot do this for some reason. From reading CUDA's documentation I thought there was an obvious way to do this, but it turned out to be much more complicated.

Here is my first attempt, distilled into a simple example:
// graph 1: H2D copy -> wait on cuda_event -> D2H copy
cudaGraphCreate(&cuda_graph_cpy, 0);
cudaGraphAddMemcpyNode1D(&memcpy_h2d_node, cuda_graph_cpy, NULL, 0, device_buf, host_buf, BUF_SIZE * sizeof(char), cudaMemcpyDefault);
cudaGraphAddEventWaitNode(&wait_node, cuda_graph_cpy, &memcpy_h2d_node, 1, cuda_event);
cudaGraphAddMemcpyNode1D(&memcpy_d2h_node, cuda_graph_cpy, &wait_node, 1, host_buf, device_buf, BUF_SIZE * sizeof(char), cudaMemcpyDefault);
cudaGraphInstantiate(&cuda_graph_exec_cpy, cuda_graph_cpy, NULL, NULL, 0);

// graph 2: memset the device buffer -> record cuda_event
cudaGraphCreate(&cuda_graph_set, 0);
cudaGraphAddMemsetNode(&memset_node, cuda_graph_set, NULL, 0, &memset_params);
cudaGraphAddEventRecordNode(&record_set_node, cuda_graph_set, &memset_node, 1, cuda_event);
cudaGraphInstantiate(&cuda_graph_exec_set, cuda_graph_set, NULL, NULL, 0);

// launch both graphs, then wait for the copy graph to finish
cudaGraphLaunch(cuda_graph_exec_cpy, cuda_stream_cpy);
cudaGraphLaunch(cuda_graph_exec_set, cuda_stream_set);
cudaStreamSynchronize(cuda_stream_cpy);
So I create and instantiate one linear graph: do a host-to-device copy, wait on cuda_event, do a device-to-host copy. Then I create and instantiate another linear graph: do a memset on the device memory, record cuda_event. After that I launch the first graph on cuda_stream_cpy, then launch the second graph on cuda_stream_set, and then synchronize on cuda_stream_cpy.

At the end I expect host_buf to be modified, but it is not, because the first graph/stream does not actually wait for anything and proceeds to the second copy immediately.
After rewriting the code with regular async calls instead of graphs and getting the same behavior, reading everything I could find on the subject in Google, and experimenting with flags and with adding extra cudaEventRecord/cudaGraphAddEventRecordNode calls in various places, I realized that the behavior I need does not seem achievable with CUDA's event semantics. The problem appears to be that the record and wait calls have to be made at roughly the same time and cannot be decoupled: if no event record has been enqueued yet, a wait async call or a wait graph node does not block the stream, and the stream simply proceeds.
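A minimal sketch of the pitfall with plain async calls (stream, dst, src and n are placeholders, not my actual code): an event that has never been recorded counts as already complete, so the wait simply falls through.

cudaEvent_t ev;
cudaEventCreateWithFlags(&ev, cudaEventDisableTiming);
// no cudaEventRecord has been enqueued anywhere yet,
// so the event is considered complete...
cudaStreamWaitEvent(stream, ev, 0);  // ...and this does not block the stream
cudaMemcpyAsync(dst, src, n, cudaMemcpyDefault, stream);  // proceeds immediately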
So what I would like to do is replace the cudaGraphAddEventWaitNode/cudaGraphAddEventRecordNode calls in the code example above, or add something to the example, so that the code works the way I described: the wait node actually blocks the stream until the record node (or whatever replaces it?) unblocks it.
I also found something in CUDA called "external semaphores", which could do what I want (with cudaGraphAddExternalSemaphoresWaitNode/cudaGraphAddExternalSemaphoresSignalNode instead), but it seems impossible to create them without using Vulkan or DirectX, neither of which I can bring into my application. I tried passing the file descriptor of a shared memory object to cudaImportExternalSemaphore as cudaExternalSemaphoreHandleTypeOpaqueFd, but that did not work.
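For reference, this is roughly what that failed attempt looked like (a sketch; the shm name is made up and the POSIX headers sys/mman.h, fcntl.h and unistd.h are assumed):

int fd = shm_open("/fake_sem", O_CREAT | O_RDWR, 0666);  // hypothetical shm object
ftruncate(fd, 4096);

cudaExternalSemaphoreHandleDesc desc = {};
desc.type = cudaExternalSemaphoreHandleTypeOpaqueFd;
desc.handle.fd = fd;

cudaExternalSemaphore_t ext_sem;
// fails: the fd has to come from an actual semaphore export
// (Vulkan etc.), not from an arbitrary shared memory object
cudaError_t err = cudaImportExternalSemaphore(&ext_sem, &desc);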
EDIT: I tried to integrate the wait kernel suggested by @RobertCrovella into my prototype, but it gets stuck in the launch of the first graph. Here is the reproducer:
#include "cuda_runtime_api.h"
#include <stdio.h>
#include <stdlib.h>
#define BUF_SIZE 1024
#define TEST_POS_OLD 512
#define TEST_POS_NEW 10
#define OLD_VAL 5
#define NEW_VAL 23
#define CUDA_CHKERR(x) res = x; if (res != cudaSuccess) goto fail;
__global__ void wait_kernel(volatile unsigned char *event, unsigned char val)
{
    while (*event == val);
}

int main()
{
    cudaError_t res = cudaSuccess;
    const char *err_str = NULL;
    const char *err_name = NULL;

    cudaStream_t cuda_stream_cpy;
    cudaStream_t cuda_stream_set;

    cudaGraph_t cuda_graph_cpy;
    cudaGraphExec_t cuda_graph_exec_cpy;
    cudaGraph_t cuda_graph_set;
    cudaGraphExec_t cuda_graph_exec_set;

    cudaGraphNode_t memcpy_h2d_node;
    cudaGraphNode_t memcpy_d2h_node;
    cudaGraphNode_t memset_node;
    cudaGraphNode_t signal_node;
    cudaGraphNode_t wait_node;

    unsigned char *event;
    unsigned char test = 0;

    dim3 grid(1,1,1);
    dim3 block(1,1,1);
    struct cudaKernelNodeParams kernel_node_params = {};
    struct cudaMemsetParams memset_params = {};
    void *wait_kernel_args[2] = {(void *) &event, (void *) &test};

    char *host_buf = NULL;
    void *device_buf = NULL;

    printf("Creating the event...\n");
    CUDA_CHKERR(cudaMalloc(&event, sizeof(event[0])));
    printf("cudaMalloc\n");
    CUDA_CHKERR(cudaMemset(event, 0, sizeof(event[0])));
    printf("cudaMemset\n");

    printf("Allocating the host buffer and setting the test value...\n");
    host_buf = (char *) malloc(BUF_SIZE * sizeof(char));
    for (int i = 0; i < BUF_SIZE; i++) {
        host_buf[i] = OLD_VAL;
    }
    CUDA_CHKERR(cudaMalloc(&device_buf, BUF_SIZE * sizeof(char)));
    printf("cudaMalloc\n");

    CUDA_CHKERR(cudaStreamCreate(&cuda_stream_cpy));
    printf("cudaStreamCreate cpy\n");
    CUDA_CHKERR(cudaStreamCreate(&cuda_stream_set));
    printf("cudaStreamCreate set\n");

    /* graph 1: H2D copy -> wait kernel -> D2H copy */
    CUDA_CHKERR(cudaGraphCreate(&cuda_graph_cpy, 0));
    printf("cudaGraphCreate cpy\n");
    CUDA_CHKERR(cudaGraphAddMemcpyNode1D(&memcpy_h2d_node, cuda_graph_cpy, NULL, 0, device_buf, host_buf, BUF_SIZE * sizeof(char), cudaMemcpyDefault));
    printf("cudaGraphAddMemcpyNode1D H2D\n");

    memset(&kernel_node_params, 0, sizeof(cudaKernelNodeParams));
    kernel_node_params.func = (void *)wait_kernel;
    kernel_node_params.gridDim = grid;
    kernel_node_params.blockDim = block;
    kernel_node_params.sharedMemBytes = 0;
    kernel_node_params.kernelParams = wait_kernel_args;
    kernel_node_params.extra = NULL;
    CUDA_CHKERR(cudaGraphAddKernelNode(&wait_node, cuda_graph_cpy, &memcpy_h2d_node, 1, &kernel_node_params));
    printf("cudaGraphAddKernelNode (wait)\n");

    CUDA_CHKERR(cudaGraphAddMemcpyNode1D(&memcpy_d2h_node, cuda_graph_cpy, &wait_node, 1, host_buf, device_buf, BUF_SIZE * sizeof(char), cudaMemcpyDefault));
    printf("cudaGraphAddMemcpyNode1D D2H\n");
    CUDA_CHKERR(cudaGraphInstantiate(&cuda_graph_exec_cpy, cuda_graph_cpy, NULL, NULL, 0));
    printf("cudaGraphInstantiate cpy\n");

    /* graph 2: memset of the device buffer -> memset of the flag (signal) */
    CUDA_CHKERR(cudaGraphCreate(&cuda_graph_set, 0));
    printf("cudaGraphCreate set\n");

    memset(&memset_params, 0, sizeof(cudaMemsetParams));
    memset_params.dst = device_buf;
    memset_params.value = NEW_VAL;
    memset_params.pitch = 0;
    memset_params.elementSize = sizeof(char);
    memset_params.width = 512;
    memset_params.height = 1;
    CUDA_CHKERR(cudaGraphAddMemsetNode(&memset_node, cuda_graph_set, NULL, 0, &memset_params));
    printf("cudaGraphAddMemsetNode\n");

    memset(&memset_params, 0, sizeof(cudaMemsetParams));
    memset_params.dst = event;
    memset_params.value = 1;
    memset_params.pitch = 0;
    memset_params.elementSize = 1;
    memset_params.width = 1;
    memset_params.height = 1;
    CUDA_CHKERR(cudaGraphAddMemsetNode(&signal_node, cuda_graph_set, &memset_node, 1, &memset_params));
    printf("cudaGraphAddMemsetNode (signal)\n");

    CUDA_CHKERR(cudaGraphInstantiate(&cuda_graph_exec_set, cuda_graph_set, NULL, NULL, 0));
    printf("cudaGraphInstantiate set\n");

    CUDA_CHKERR(cudaGraphLaunch(cuda_graph_exec_cpy, cuda_stream_cpy));
    printf("cudaGraphLaunch cpy\n");
    CUDA_CHKERR(cudaGraphLaunch(cuda_graph_exec_set, cuda_stream_set));
    printf("cudaGraphLaunch set\n");
    CUDA_CHKERR(cudaStreamSynchronize(cuda_stream_cpy));
    printf("cudaStreamSynchronize cpy\n");

    CUDA_CHKERR(cudaGraphExecDestroy(cuda_graph_exec_cpy));
    printf("cudaGraphExecDestroy\n");
    CUDA_CHKERR(cudaGraphExecDestroy(cuda_graph_exec_set));
    printf("cudaGraphExecDestroy\n");
    CUDA_CHKERR(cudaGraphDestroy(cuda_graph_cpy));
    printf("cudaGraphDestroy\n");
    CUDA_CHKERR(cudaGraphDestroy(cuda_graph_set));
    printf("cudaGraphDestroy\n");
    CUDA_CHKERR(cudaStreamDestroy(cuda_stream_cpy));
    printf("cudaStreamDestroy cpy\n");
    CUDA_CHKERR(cudaStreamDestroy(cuda_stream_set));
    printf("cudaStreamDestroy set\n");

    if (host_buf[TEST_POS_OLD] == OLD_VAL) {
        printf("host_buf[TEST_POS_OLD] is correct.\n");
    } else {
        printf("host_buf[TEST_POS_OLD] is not correct!\n");
    }
    if (host_buf[TEST_POS_NEW] == NEW_VAL) {
        printf("host_buf[TEST_POS_NEW] is correct.\n");
    } else {
        printf("host_buf[TEST_POS_NEW] is not correct!\n");
        if (host_buf[TEST_POS_OLD] == host_buf[TEST_POS_NEW]) printf("They are equal!\n");
    }

    return 0;

fail:
    err_name = cudaGetErrorName(res);
    err_str = cudaGetErrorString(res);
    printf("%s: %s\n", err_name, err_str);
    return 1;
}
Notwithstanding what was said in the comments, you are in fact establishing a dependency between the two graphs. With some refactoring, my suggestion would be to combine these activities into a single graph, and use the mechanisms available in graph creation or capture to express the dependency.
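For instance, a minimal sketch of that refactoring (reusing host_buf, device_buf and memset_params from the question) could express the ordering with plain intra-graph dependencies instead of events:

cudaGraph_t g;
cudaGraphNode_t h2d_node, set_node, d2h_node;
cudaGraphCreate(&g, 0);
cudaGraphAddMemcpyNode1D(&h2d_node, g, NULL, 0, device_buf, host_buf, BUF_SIZE * sizeof(char), cudaMemcpyDefault);
cudaGraphAddMemsetNode(&set_node, g, NULL, 0, &memset_params);
// the D2H copy depends on both the H2D copy and the memset:
// the "wait" becomes an ordinary node dependency inside one graph
cudaGraphNode_t deps[2] = {h2d_node, set_node};
cudaGraphAddMemcpyNode1D(&d2h_node, g, deps, 2, host_buf, device_buf, BUF_SIZE * sizeof(char), cudaMemcpyDefault);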
Towards the stated goal, however: we can do this in a fashion similar to the stream memory operations mentioned in the comments (which are only available in the driver API). Basically, we create a kernel that waits on a device memory location, in order to synchronize one graph to another. The graph that is waiting launches this kernel to do the synchronization; the other graph signals via a memset node.

Here is an example:
$ cat t2217.cu
#include <iostream>
#include <vector>
#include <cstdio>
#include <cstdlib>
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

__global__ void calc1kernel(float *data, float val, size_t n){
    size_t idx = threadIdx.x+blockDim.x*blockIdx.x;
    while (idx < n){
        data[idx] += val;
        idx += gridDim.x*blockDim.x;}
}

__global__ void calc2kernel(float *data, float val, size_t n){
    size_t idx = threadIdx.x+blockDim.x*blockIdx.x;
    while (idx < n){
        data[idx] *= val;
        idx += gridDim.x*blockDim.x;}
}

__global__ void waitkernel(volatile unsigned char *signal, unsigned char val){
    while (*signal == val);
}
// CUDA Graph 1:
// calc1kernelnode
// |
// memsetnode
// CUDA Graph 2:
// waitkernel
// |
// calc2kernelnode
int main(int argc, char *argv[]){
    size_t data_size = 32;
    cudaStream_t s1, s2;
    cudaGraph_t g1, g2;
    float *data, val;
    unsigned char *sig;
    // allocate for data on the device
    cudaMalloc(&data, data_size*sizeof(data[0]));
    cudaCheckErrors("CUDAMalloc failure");
    cudaMalloc(&sig, sizeof(sig[0]));
    cudaCheckErrors("CUDAMalloc failure");
    cudaMemset(sig, 0, sizeof(sig[0]));
    cudaCheckErrors("CUDAMemset failure");
    cudaMemset(data, 0, data_size*sizeof(data[0]));
    cudaCheckErrors("CUDAMemset failure");
    // create the graph
    cudaGraphCreate(&g1, 0);
    cudaCheckErrors("CUDAGraphCreate failure");
    cudaGraphCreate(&g2, 0);
    cudaCheckErrors("CUDAGraphCreate failure");
    cudaStreamCreate(&s1);
    cudaCheckErrors("CUDAStreamCreate failure");
    cudaStreamCreate(&s2);
    cudaCheckErrors("CUDAStreamCreate failure");
    dim3 grid(1,1,1);
    dim3 block(1,1,1);
    cudaGraphNode_t calc1kernelnode, calc2kernelnode, waitkernelnode, memsetnode;
    // add nodes and their dependencies to the first graph
    cudaKernelNodeParams kernelNodeParams = {0};
    // first add calc1kernelnode, which has no dependencies
    val = 3.0f;
    memset(&kernelNodeParams, 0, sizeof(cudaKernelNodeParams));
    void *kernelargs[3] = {(void *)&data, (void *)&val, (void *)&data_size};
    kernelNodeParams.func = (void *)calc1kernel;
    kernelNodeParams.gridDim = grid;
    kernelNodeParams.blockDim = block;
    kernelNodeParams.sharedMemBytes = 0;
    kernelNodeParams.kernelParams = kernelargs;
    kernelNodeParams.extra = NULL;
    cudaGraphAddKernelNode(&calc1kernelnode, g1, NULL, 0, &kernelNodeParams);
    cudaCheckErrors("CUDAGraphAddKernelNode failure");
    // now add the memsetnode, which has 1 dependency on calc1kernelnode
    cudaMemsetParams memsetParams = {0};
    memset(&memsetParams, 0, sizeof(cudaMemsetParams));
    memsetParams.dst = sig;
    memsetParams.elementSize = 1;
    memsetParams.height = 1;
    memsetParams.pitch = 1;
    memsetParams.value = 1;
    memsetParams.width = 1;
    cudaGraphAddMemsetNode(&memsetnode, g1, &calc1kernelnode, 1, &memsetParams);
    cudaCheckErrors("CUDAGraphAddMemsetNode failure");
    // graph 1 is now defined, next step is to instantiate an executable version of it
    size_t num_nodes = 0;
    cudaGraphNode_t *nodes1 = NULL;
    cudaGraphGetNodes(g1, nodes1, &num_nodes);
    cudaCheckErrors("CUDAGraphGetNodes failure");
    printf("graph 1 num nodes: %lu\n", num_nodes);
    cudaGraphExec_t graphExec1, graphExec2;
    cudaGraphInstantiate(&graphExec1, g1, NULL, NULL, 0);
    cudaCheckErrors("CUDAGraphInstantiate failure");
    // add nodes and their dependencies to the second graph
    // first add waitkernelnode, which has no dependencies
    unsigned char test = 0;
    memset(&kernelNodeParams, 0, sizeof(cudaKernelNodeParams));
    void *waitkernelargs[2] = {(void *) &sig, (void *) &test };
    kernelNodeParams.func = (void *)waitkernel;
    kernelNodeParams.gridDim = grid;
    kernelNodeParams.blockDim = block;
    kernelNodeParams.sharedMemBytes = 0;
    kernelNodeParams.kernelParams = waitkernelargs;
    kernelNodeParams.extra = NULL;
    cudaGraphAddKernelNode(&waitkernelnode, g2, NULL, 0, &kernelNodeParams);
    cudaCheckErrors("CUDAGraphAddKernelNode failure");
    // now add the calc2kernelnode, which has 1 dependency on waitkernelnode
    memset(&kernelNodeParams, 0, sizeof(cudaKernelNodeParams));
    kernelNodeParams.func = (void *)calc2kernel;
    kernelNodeParams.gridDim = grid;
    kernelNodeParams.blockDim = block;
    kernelNodeParams.sharedMemBytes = 0;
    kernelNodeParams.kernelParams = kernelargs;
    kernelNodeParams.extra = NULL;
    cudaGraphAddKernelNode(&calc2kernelnode, g2, &waitkernelnode, 1, &kernelNodeParams);
    cudaCheckErrors("CUDAGraphAddKernelNode failure");
    // graph 2 is now defined, next step is to instantiate an executable version of it
    cudaGraphNode_t *nodes2 = NULL;
    cudaGraphGetNodes(g2, nodes2, &num_nodes);
    cudaCheckErrors("CUDAGraphGetNodes failure");
    printf("graph 2 num nodes: %lu\n", num_nodes);
    cudaGraphInstantiate(&graphExec2, g2, NULL, NULL, 0);
    cudaCheckErrors("CUDAGraphInstantiate failure");
    // now launch the graphs
    cudaGraphLaunch(graphExec2, s2);
    cudaCheckErrors("CUDAGraphLaunch failure");
    cudaGraphLaunch(graphExec1, s1);
    cudaCheckErrors("CUDAGraphLaunch failure");
    cudaStreamSynchronize(s1);
    cudaCheckErrors("graph execution failure");
    cudaStreamSynchronize(s2);
    cudaCheckErrors("graph execution failure");
    float *result = new float[data_size];
    cudaMemcpy(result, data, data_size*sizeof(float), cudaMemcpyDeviceToHost);
    std::cout << "result[0] = " << result[0] << std::endl;
    // clean up
    cudaFree(data);
    cudaStreamDestroy(s1);
    cudaGraphDestroy(g1);
    cudaGraphExecDestroy(graphExec1);
    cudaStreamDestroy(s2);
    cudaGraphDestroy(g2);
    cudaGraphExecDestroy(graphExec2);
}
$ nvcc -o t2217 t2217.cu
$ ./t2217
graph 1 num nodes: 2
graph 2 num nodes: 2
result[0] = 9
$
The result of 9 shows that even though graph 2 is launched first, it successfully waits for the synchronization point in graph 1 before allowing its calculation kernel to run (the data starts at 0, calc1kernel adds val = 3.0 giving 3, and calc2kernel then multiplies by the same val, giving 9).
The example given here (like the one in the question) shows usage of the runtime API. If you want to use the driver API, as already indicated in the comments, it should be possible to do this directly with batched memory operations, using cuGraphAddBatchMemOpNode.
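A rough sketch of what those nodes could look like (untested; ctx, sig_dptr, the CUgraph handles hg1/hg2 and the upstream node calc_node are driver-API placeholders, and CUDA 11.7 or newer is assumed for cuGraphAddBatchMemOpNode):

// replacement for waitkernel in the waiting graph:
// block until the 32-bit flag at sig_dptr becomes 1
CUstreamBatchMemOpParams wait_op = {};
wait_op.waitValue.operation = CU_STREAM_MEM_OP_WAIT_VALUE_32;
wait_op.waitValue.address = sig_dptr;
wait_op.waitValue.value = 1;
wait_op.waitValue.flags = CU_STREAM_WAIT_VALUE_EQ;

CUDA_BATCH_MEM_OP_NODE_PARAMS wait_params = {};
wait_params.ctx = ctx;
wait_params.count = 1;
wait_params.paramArray = &wait_op;

CUgraphNode wait_node;
cuGraphAddBatchMemOpNode(&wait_node, hg2, NULL, 0, &wait_params);

// replacement for the signaling memset node in the other graph:
// write 1 to the flag once the preceding work is done
CUstreamBatchMemOpParams write_op = {};
write_op.writeValue.operation = CU_STREAM_MEM_OP_WRITE_VALUE_32;
write_op.writeValue.address = sig_dptr;
write_op.writeValue.value = 1;
write_op.writeValue.flags = CU_STREAM_WRITE_VALUE_DEFAULT;

CUDA_BATCH_MEM_OP_NODE_PARAMS sig_params = {};
sig_params.ctx = ctx;
sig_params.count = 1;
sig_params.paramArray = &write_op;

CUgraphNode sig_node;
cuGraphAddBatchMemOpNode(&sig_node, hg1, &calc_node, 1, &sig_params);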
This kind of interlock can lead to hangs and deadlocks if used improperly. Note the various warnings given in the documentation:

"Warning: Improper use of this API may deadlock the application. Synchronization ordering established through this API is not visible to CUDA. CUDA tasks that are (even indirectly) ordered by this API should also have that order expressed with CUDA-visible dependencies such as events. ..."