还有其他与此类似的问题,但它们的答案并不能解决我的问题:
terminate called after throwing an instance of 'sycl::_V1::runtime_error' what(): No kernel named _ZTSZ7gpu_CUBEUlvE_ was found -46 (PI_ERROR_INVALID_KERNEL_NAME)
我有一段 Fortran 代码,它通过包装器调用 CUDA-C 函数。现在我正在把 CUDA 代码转换为 SYCL 代码(以便在 NVIDIA GPU 上运行)。我也相应地修改了 Makefile,改用 clang++ 来编译 SYCL 代码。该错误似乎与链接阶段有关:我使用 ifort 编译器进行链接(ifort 能处理一些 MPI 依赖项,而 clang++ 不能)。
我单独测试了SYCL代码,效果很好。问题在于将其链接到 Fortran 项目时。您可以通过以下方式重现错误:
runfom.f:
! Driver program: calls the two C-bound GPU routines whose interfaces are
! declared in the included file CUB.f. The MPI module is imported, but no
! MPI routine is called from this program unit.
      PROGRAM runfom
      USE MPI
! CUB.f supplies USE ISO_C_BINDING, IMPLICIT NONE and the INTERFACE block,
! so it has to be included before the first executable statement.
      INCLUDE 'CUB.f'
      CALL gpu_init()
      CALL gpu_CUB()
      END PROGRAM runfom
CUB.f:
! Include fragment: explicit interfaces for the routines exported with C
! linkage under the binding names "gpu_init" and "gpu_CUB". Both take no
! arguments and return nothing.
      USE ISO_C_BINDING
      IMPLICIT NONE
      INTERFACE
         SUBROUTINE gpu_init() BIND(C, NAME="gpu_init")
         END SUBROUTINE gpu_init
         SUBROUTINE gpu_CUB() BIND(C, NAME="gpu_CUB")
         END SUBROUTINE gpu_CUB
      END INTERFACE
CUB_kernel.cpp:
#include <sycl/sycl.hpp>
#include <dpct/dpct.hpp>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sched.h>
#include <cmath>
extern "C"
{
/**
 * Prints the name and vendor of every device of every SYCL platform that
 * the runtime reports. Takes no arguments and returns nothing; exported
 * with C linkage by the enclosing extern "C" block.
 */
void gpu_init()
{
  // Query the platforms first so that the banner below is only printed
  // once the platform list has been obtained.
  const auto all_platforms = sycl::platform::get_platforms();
  printf("\n \n Checking all devices in the Node: \n");

  for (const auto &plat : all_platforms) {
    for (const auto &dev : plat.get_devices()) {
      // Fetch both properties before printing anything for this device.
      const std::string dev_name = dev.get_info<sycl::info::device::name>();
      const std::string dev_vendor = dev.get_info<sycl::info::device::vendor>();

      std::cout << "Name: " << dev_name << std::endl;
      std::cout << "Vendor: " << dev_vendor << std::endl;
    }
  }
}
void gpu_CUB(void) {
const int N=16;
sycl::device dev_ct1;
sycl::queue q_ct1(
dev_ct1, sycl::property_list{sycl::property::queue::in_order()});
//# Initialize vectors on host
float A[N] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
float B[N] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
float C[N] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
//# Allocate memory on device
float *d_A, *d_B, *d_C;
d_A = sycl::malloc_device<float>(N, q_ct1);
d_B = sycl::malloc_device<float>(N, q_ct1);
d_C = sycl::malloc_device<float>(N, q_ct1);
//# copy vector data from host to device
q_ct1.memcpy(d_A, A, N * sizeof(float));
q_ct1.memcpy(d_B, B, N * sizeof(float));
q_ct1.single_task<>([=]() {d_C[0]=1.9;} );
//# copy result of vector data from device to host
q_ct1.memcpy(C, d_C, N * sizeof(float)).wait();
//# print result on host
for (int i = 0; i < N; i++) std::cout<< C[i] << " ";
std::cout << "\n";
//# free allocation on device
sycl::free(d_A, q_ct1);
sycl::free(d_B, q_ct1);
sycl::free(d_C, q_ct1);
}
}
Makefile:
# Builds runfom: a Fortran driver (compiled and linked with ftn) that calls
# SYCL/C++ routines compiled with clang++ for the NVIDIA (nvptx64) target.
#
# Objects produced by "clang++ -fsycl -c" carry device code that still has
# to be device-linked and wrapped into a host-linkable object. A plain ftn
# link does not do that, and the runtime then fails with
# "No kernel named ... was found (PI_ERROR_INVALID_KERNEL_NAME)".
# The device-link step below (-fsycl-link) produces that extra object so
# that ftn can remain the final linker.
# NOTE(review): -fsycl-link behaviour with -fsycl-targets=nvptx64-nvidia-cuda
# should be confirmed against the installed compiler version.

FCOMPILE = ftn
SYCLCOMPILE = /mypath/intel/oneapi/compiler/2024.0/bin/compiler/clang++
CCOMPILE = /mypath/intel/oneapi/compiler/2024.0/bin/compiler/clang
DEFS = -DNVIDIA_GPU
FLAGS = $(DEFS) -fast
CFLAGS = $(DEFS)
NVFLAGS = $(DEFS) -fsycl -fsycl-targets=nvptx64-nvidia-cuda -I./ISO_Fortran_binding/include
FFLAGS= $(GENCMPLFLAGS) $(TARGET_ARCH) -cpp
F90FLAGS= $(GENCMPLFLAGS) $(TARGET_ARCH) -cpp
DEPS=CUB.f
LDLIBS = -L/mypath/intel/oneapi/compiler/2024.0/lib -lsycl -lstdc++

# Objects that contain SYCL kernels, and the object holding their linked
# device image.
SYCLOBJS = CUB_kernel.o
SYCLDEVLINK = sycl_device_link.o

COMPILE.c = $(CCOMPILE) $(CFLAGS) -c
COMPILE.f = $(FCOMPILE) $(FFLAGS) -c
COMPILE.cpp = $(SYCLCOMPILE) $(NVFLAGS) -c
COMPILE.f90 = $(FCOMPILE) $(F90FLAGS) -c
LINK.f = $(FCOMPILE) $(FFLAGS)
# SYCL device link: same target flags as the compile step.
LINK.sycl = $(SYCLCOMPILE) $(NVFLAGS) -fsycl-link

# command to compile (not link):
%.o: %.c $(DEPS)
	$(COMPILE.c) -o $@ $<

%.o: %.cpp $(DEPS)
	$(COMPILE.cpp) -o $@ $<

# command to compile (not link):
%.o: %.f90 $(DEPS)
	$(COMPILE.f90) -o $@ $<

# command to compile (not link):
%.o: %.f $(DEPS)
	$(COMPILE.f) -o $@ $<

# command to link object files:
# only the objects are handed to the linker; $(DEPS) is a prerequisite for
# rebuild tracking, not a link input.
%: %.o $(DEPS)
	$(LINK.f) $(filter %.o,$^) $(LDLIBS) -o $@

################################################################
#
# build rule
#
################################################################
# runfom stays the first explicit target, i.e. the default goal.
runfom: runfom.o $(SYCLOBJS) $(SYCLDEVLINK)
	$(LINK.f) $^ $(LDLIBS) -o $@

# Device-link all SYCL objects into one host-linkable object.
$(SYCLDEVLINK): $(SYCLOBJS)
	$(LINK.sycl) $^ -o $@

.PHONY: clean
clean:
	rm -f runfom *.o
其中 ftn 是 ifort+MPI 依赖项的包装器。
我用
SYCL_DEVICE_FILTER=cuda SYCL_PI_TRACE=1 ./runfom
执行,得到以下输出:
SYCL_PI_TRACE[basic]: Plugin found and successfully loaded: libpi_cuda.so [ PluginVersion: 14.38.1 ]
SYCL_PI_TRACE[basic]: Plugin found and successfully loaded: libpi_unified_runtime.so [ PluginVersion: 14.37.1 ]
Checking all devices in the Node:
Name: NVIDIA A100-PCIE-40GB
Vendor: NVIDIA Corporation
SYCL_PI_TRACE[all]: Requested device_type: info::device_type::automatic
SYCL_PI_TRACE[all]: Selected device: -> final score = 500
SYCL_PI_TRACE[all]: platform: NVIDIA CUDA BACKEND
SYCL_PI_TRACE[all]: device: NVIDIA A100-PCIE-40GB
terminate called after throwing an instance of 'sycl::_V1::runtime_error'
what(): No kernel named _ZTSZ7gpu_CUBEUlvE_ was found -46 (PI_ERROR_INVALID_KERNEL_NAME)
我尝试过使用 gfortran,也调整过 Makefile 中链接命令里各个标志的顺序,但错误依旧。
有什么想法吗?
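您对链接的看法是正确的 - 您也需要在链接阶段使用 icpx (或 clang++),并带上与编译时相同的 -fsycl 和 -fsycl-targets 选项。原因是 SYCL 的设备代码要在链接阶段由支持 SYCL 的编译器驱动程序完成设备端链接并嵌入可执行文件;如果直接用 ifort 链接,设备映像就不会进入最终的可执行文件,运行时自然找不到内核。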