单个上下文中的多 GPU clCreateBuffer 失败

问题描述 投票:0回答:0

我有两个 Intel ARC A770 GPU,我看到了这种行为(附有测试代码)。

  1. 我正在尝试使用 clCreateBuffer 分配内存(每个 1 MB)。
  2. 我有两个 GPU 设备。
  3. 测试代码创建上下文,加载内核程序,构建它。
  4. 测试程序只创建一个上下文(并且可以为 1 个设备或 2 个设备创建上下文)。
  5. 当我在 1 个 GPU 模式下运行代码时,我能够分配内存(超过 10000 个分配)。但是当我在 2 GPU 模式下运行代码时,内存不足(1000 次分配后主机内存不足——大约 1 GB 内存)。 OpenCL 错误代码 -6.
  6. 如果我创建两个上下文,我就可以在两个设备上分配内存。

GPU 有 16 GB RAM,我的主机有 192 GB RAM。知道我做错了什么吗?构建代码的命令是:

gcc -D CL_TARGET_OPENCL_VERSION=220 -g -Wall -o OpenCLMulti OpenCLMulti.cpp -lOpenCL -lm

测试代码:OpenCLMulti.cpp

#include <sys/time.h>
#include <sys/sysinfo.h>
#include <sys/stat.h>
#include <assert.h>
#include <errno.h>
#include <math.h>
#include <getopt.h>
#include <sys/time.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>

#include <CL/cl.h>
#include <clBLAS.h>

#define MAX_PLATFORMS 1
#define MAX_GPUS 2

cl_context context;
cl_program program;
cl_device_id devices[MAX_GPUS] = {0};
cl_platform_id platformId[MAX_PLATFORMS];

const char* kernelFileName = "OpenCLKernels.cl";

unsigned char* getKernelCode(const char* filename, size_t* psize) {
  FILE* fp;
  size_t ret, size;
  unsigned char* bCode;

  fp = fopen(filename, "rb");
  if (fp == NULL) {
    fprintf(stderr, "Could not open kernels source file: %s", filename);
    return NULL;
  }

  fseek(fp, 0, SEEK_END);
  size = ftell(fp);
  rewind(fp);

  bCode = (unsigned char*) malloc(size);

  if ((ret = fread(bCode, 1, size, fp)) != size) {
    fprintf(stderr, "Could not read %ld bytes, got %ld", ret, size);
    return NULL;
  }
  fclose(fp);
  *psize = size;
  return bCode;
}

const char* buildOptions = "-DOPENCL -D CL_TARGET_OPENCL_VERSION=220";

void contextCallback(const char* errInfo, const void* privateInfo, size_t cb, void* userData) {
  printf("contextCallback %s\n", errInfo);
}

cl_int setupGPU(int gpus, int* ngpus) {
  size_t size;
  cl_int err = CL_SUCCESS;
  char buffer[16384];
  cl_uint numPlatforms, ngpusOCL;
  unsigned char* bCode;

  err = clGetPlatformIDs(MAX_PLATFORMS, &platformId[0], &numPlatforms);
  if (err != CL_SUCCESS) {
    return err;
  }

  if (numPlatforms == 0) {
    fprintf(stderr, "No OpenCL platforms found\n");
    return -100;
  }

  if (numPlatforms > 1) {
    fprintf(stderr, "Found more than one OpenCL platform. Choosing first.\n");
  }

  err = clGetPlatformInfo(platformId[0], CL_PLATFORM_PROFILE, sizeof(buffer), buffer, &size);
  fprintf(stderr, "Platform profile: %s\n", buffer);

  err = clGetPlatformInfo(platformId[0], CL_PLATFORM_VERSION, sizeof(buffer), buffer, &size);
  fprintf(stderr, "Platform version: %s\n", buffer);

  err = clGetPlatformInfo(platformId[0], CL_PLATFORM_NAME, sizeof(buffer), buffer, &size);
  fprintf(stderr, "Name: %s\n", buffer);

  cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, (cl_context_properties) platformId[0], 0};
  err = clGetDeviceIDs(platformId[0], CL_DEVICE_TYPE_GPU, MAX_GPUS, &devices[0], &ngpusOCL);
  if (err != CL_SUCCESS) {
    return err;
  }
  fprintf(stderr, "Found %d devices\n", ngpusOCL);


  bCode = getKernelCode(kernelFileName, &size);
  if (bCode == NULL) {
    fprintf(stderr, "Could not load kernel code\n");
    return -1;
  }

  context = clCreateContext(properties, gpus, devices, contextCallback, NULL, &err);
  if (err != CL_SUCCESS) {
    fprintf(stderr, "Could not create GPU context: %d\n", err);
    return err;
  }

  fprintf(stderr, "%d: OpenCL context created successfully\n", 0);

  program = clCreateProgramWithSource(context, 1, (const char**) &bCode, &size, &err);
  if (program == NULL) {
    fprintf(stderr, "Could not create program from file %s, error %d\n", kernelFileName, err);
    return err;
  }
  fprintf(stderr, "%d: Program loaded successfully\n", 0);

  err = clBuildProgram(program, gpus, devices, buildOptions, NULL, NULL);
  if (err != CL_SUCCESS) {
    fprintf(stderr, "%d: Program build failed %d\n", 0, err);
    size = 1024 * 1024 * 4;
    char* output = (char*) malloc(size);
    err = clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, sizeof(1024 * 1024 * 4), output, &size);
    if (err != CL_SUCCESS) {
      fprintf(stderr, "%d: Program build failed. Unable to get log. Error %d\n", 0, err);
      return err;
    }
    fprintf(stderr, "%d: %s\n", 0, output);
    free(output);
    return err;
  }
  free(bCode);

  fprintf(stderr, "OpenCL kernels built successfully\n");
  *ngpus = ngpusOCL;
  return err;
}

int main(int argc, char* argv[]) {
  int ngpus, dgpus;
  cl_int err;
  size_t asize;
  cl_mem mem;

  ngpus = 1;
  for (int i = 1; i < argc; i++) {
    if (strcmp(argv[i], "--gpus") == 0) {
      ngpus = atoi(argv[i + 1]);
      i++;
    }
  }
  err = setupGPU(ngpus, &dgpus);

  if (err != CL_SUCCESS) {
    exit(1);
  }

  if (dgpus < ngpus) {
    fprintf(stderr, "Detected %d GPUs, requested %d. Exiting\n", dgpus, ngpus);
    exit(1);
  }

  asize = 1 * 1024 * 1024L;
  for (int i = 0; i < 100000; i++) {
    mem = clCreateBuffer(context, CL_MEM_READ_WRITE, asize, NULL, &err);
    if (err != CL_SUCCESS) {
      fprintf(stderr, "Could not create memory, err %d", err);
      return err;
    }

    fprintf(stderr, "%d: Allocated %ld MB successfully\n", i, asize / (1024 * 1024));
  }

  exit(0);

  exit(0);
}

简单内核:OpenCLKernels.cl

__kernel void dAxpy(int elements, float alpha, __global float *xData, int xOffset, int incX, __global float *yData, int yOffset, int incY) {
  long idx;
  int threadId, blockIdx, blockIdy, blockDim, gridDim;

  threadId = get_local_id(0); blockIdx = get_group_id(0); blockIdy = get_group_id(1); blockDim = get_local_size(0); gridDim = get_num_groups(0);
  idx = (blockIdx + blockIdy * gridDim) * blockDim + threadId;
  if (idx < elements) {
    yData[yOffset + idx * incY] += alpha * xData[xOffset + idx * incX];
  }
}

上面的代码是:

一个使用单个上下文分配 1 MB 缓冲区的简单测试程序。当我在上下文中注册一个设备时,我能够分配超过 1000 个缓冲区(大约 1 GB 的内存)。当我在上下文中注册两个设备时,在第 1000 次分配时出现主机内存不足错误。我希望我应该能够分配更多的缓冲区。

opencl intel multi-gpu opencl-c
© www.soinside.com 2019 - 2024. All rights reserved.