#define _GNU_SOURCE
#include "matmul.h"
#include "util.h"
#define CL_TARGET_OPENCL_VERSION 120
#define CL_USE_DEPRECATED_OPENCL_2_2_APIS
#include <CL/cl.h>
#include <stdio.h>
#include <stdlib.h>
/*
 * CHECK_ERROR(err): abort the program when an OpenCL status code is not
 * CL_SUCCESS.
 *
 * The argument is evaluated exactly once and cached, so a call expression such
 * as CHECK_ERROR(clGetPlatformInfo(...)) is not executed a second time when
 * the failure is reported. On failure the source file, line number and the
 * numeric status are printed to stdout and the process exits with
 * EXIT_FAILURE.
 *
 * The body is wrapped in do { } while (0) so that `CHECK_ERROR(x);` is a
 * single statement and stays correct inside an unbraced if/else.
 */
#define CHECK_ERROR(err) \
  do { \
    const cl_int check_error_status_ = (err); \
    if (check_error_status_ != CL_SUCCESS) { \
      printf("[%s:%d] OpenCL error %d\n", __FILE__, __LINE__, \
             check_error_status_); \
      exit(EXIT_FAILURE); \
    } \
  } while (0)
// File-scope OpenCL state. It is set up by matmul_initialize() and released by
// matmul_finalize(); the host-side matmul() in this file does not touch it.

// Status code of the most recent OpenCL call that reports through it.
static cl_int err;
// First platform returned by clGetPlatformIDs.
static cl_platform_id platform;
// First GPU device of that platform.
static cl_device_id device;
// Context created for `device`.
static cl_context context;
// Command queue created on `context` for `device`.
static cl_command_queue queue;
// Program built from "kernel.cl".
static cl_program program;
// The "sgemm" kernel extracted from `program`.
static cl_kernel kernel;
// Device buffers sized for A (M*K floats), B (K*N floats) and C (M*N floats).
static cl_mem a_d, b_d, c_d;
/*
 * Compute C = A * B on the host (this implementation does not use the OpenCL
 * objects created by matmul_initialize()).
 *
 * All matrices are dense and stored row-major, which is the layout that
 * check_mat_mul() and print_mat() use for the same buffers:
 *   A is M x K, element (m, k) at A[m * K + k]
 *   B is K x N, element (k, n) at B[k * N + n]
 *   C is M x N, element (m, n) at C[m * N + n]
 *
 * Every element of C is overwritten; C does not need to be zeroed first.
 * Offsets are computed in size_t so large matrices cannot overflow `int`.
 * Non-positive M or N leaves C untouched; non-positive K writes zeros.
 */
void matmul(const float *A, const float *B, float *C, int M, int N, int K) {
  for (int m = 0; m < M; m++) {
    for (int n = 0; n < N; n++) {
      float acc = 0.0f;
      for (int k = 0; k < K; k++) {
        acc += A[(size_t)m * (size_t)K + (size_t)k] *
               B[(size_t)k * (size_t)N + (size_t)n];
      }
      C[(size_t)m * (size_t)N + (size_t)n] = acc;
    }
  }
}
/*
 * Print the name of an OpenCL platform to stdout.
 *
 * The name is fetched with the usual two-step query: first ask for the size of
 * the string, then fetch it into a buffer of that size. Any OpenCL error or
 * allocation failure terminates the program.
 */
static void print_platform_info(cl_platform_id platform) {
  size_t name_size = 0;
  // Size query only: no output buffer is supplied.
  CHECK_ERROR(
      clGetPlatformInfo(platform, CL_PLATFORM_NAME, 0, NULL, &name_size));
  // One extra byte so the buffer is always NUL-terminated (and never size 0).
  char *name = (char *)malloc(name_size + 1);
  if (name == NULL) {
    printf("Failed to allocate %zu bytes for the platform name\n",
           name_size + 1);
    exit(EXIT_FAILURE);
  }
  CHECK_ERROR(
      clGetPlatformInfo(platform, CL_PLATFORM_NAME, name_size, name, NULL));
  name[name_size] = '\0';
  printf("Detected OpenCL platform: %s\n", name);
  free(name);
}
/*
 * Print the name of an OpenCL device to stdout.
 *
 * Uses the two-step query pattern (size first, then contents). Any OpenCL
 * error or allocation failure terminates the program.
 */
static void print_device_info(cl_device_id device) {
  size_t name_size = 0;
  // Size query only: no output buffer is supplied.
  CHECK_ERROR(clGetDeviceInfo(device, CL_DEVICE_NAME, 0, NULL, &name_size));
  // One extra byte so the buffer is always NUL-terminated (and never size 0).
  char *name = (char *)malloc(name_size + 1);
  if (name == NULL) {
    printf("Failed to allocate %zu bytes for the device name\n",
           name_size + 1);
    exit(EXIT_FAILURE);
  }
  CHECK_ERROR(clGetDeviceInfo(device, CL_DEVICE_NAME, name_size, name, NULL));
  name[name_size] = '\0';
  printf("Detected OpenCL device: %s\n", name);
  free(name);
}
/*
 * Read the whole of `file_name` into a newly allocated, NUL-terminated buffer.
 *
 * On success the number of bytes read (excluding the terminator) is stored in
 * *size_out and the buffer is returned; the caller owns it and must free() it.
 * Any failure (open, seek, allocation, short read) prints a message and
 * terminates the program.
 */
static char *read_whole_file(const char *file_name, size_t *size_out) {
  FILE *file = fopen(file_name, "rb");
  if (file == NULL) {
    printf("Failed to open %s\n", file_name);
    exit(EXIT_FAILURE);
  }

  // Determine the file size by seeking to the end; ftell() reports -1 on error.
  long end_pos = -1;
  if (fseek(file, 0, SEEK_END) == 0) {
    end_pos = ftell(file);
  }
  if (end_pos < 0 || fseek(file, 0, SEEK_SET) != 0) {
    printf("Failed to determine the size of %s\n", file_name);
    exit(EXIT_FAILURE);
  }

  const size_t size = (size_t)end_pos;
  char *data = (char *)malloc(size + 1);
  if (data == NULL) {
    printf("Failed to allocate %zu bytes for %s\n", size + 1, file_name);
    exit(EXIT_FAILURE);
  }

  // fread() may return fewer bytes than requested: continue after the bytes
  // already read, and stop on a zero-length read instead of looping forever.
  size_t ntotal = 0;
  while (ntotal < size) {
    const size_t nread =
        fread(data + ntotal, sizeof(char), size - ntotal, file);
    if (nread == 0) {
      printf("Failed to read %s\n", file_name);
      exit(EXIT_FAILURE);
    }
    ntotal += nread;
  }
  data[size] = '\0';
  fclose(file);

  *size_out = size;
  return data;
}

/*
 * Load OpenCL C source from `file_name`, create a program object in `context`
 * and build it for `device`.
 *
 * Returns the built program. If the build fails, the compiler's build log is
 * printed when it can be retrieved. Every failure (file I/O, allocation,
 * OpenCL error) terminates the program, so the function only returns on
 * success.
 */
static cl_program create_and_build_program_with_source(cl_context context,
                                                       cl_device_id device,
                                                       const char *file_name) {
  size_t source_size = 0;
  char *source_code = read_whole_file(file_name, &source_size);

  const char *sources[1] = {source_code};
  cl_program built =
      clCreateProgramWithSource(context, 1, sources, &source_size, &err);
  CHECK_ERROR(err);
  // The OpenCL runtime copies the source strings into the program object, so
  // the host buffer is no longer needed.
  free(source_code);

  err = clBuildProgram(built, 1, &device, "", NULL, NULL);
  if (err != CL_SUCCESS) {
    // Best effort: show the compiler output before reporting the error code.
    size_t log_size = 0;
    if (clGetProgramBuildInfo(built, device, CL_PROGRAM_BUILD_LOG, 0, NULL,
                              &log_size) == CL_SUCCESS) {
      char *log = (char *)malloc(log_size + 1);
      if (log != NULL &&
          clGetProgramBuildInfo(built, device, CL_PROGRAM_BUILD_LOG, log_size,
                                log, NULL) == CL_SUCCESS) {
        log[log_size] = '\0';
        printf("Compile error:\n%s\n", log);
      }
      free(log);
    }
  }
  CHECK_ERROR(err);
  return built;
}
/*
 * Set up the file-scope OpenCL state for an M x K by K x N multiplication.
 *
 * Picks the first platform and its first GPU device, creates a context and a
 * command queue, builds "kernel.cl", extracts the "sgemm" kernel and allocates
 * device buffers for A (M*K floats), B (K*N floats) and C (M*N floats).
 * Any OpenCL failure terminates the program via CHECK_ERROR.
 */
void matmul_initialize(int M, int N, int K) {
  // Get OpenCL platform
  err = clGetPlatformIDs(1, &platform, NULL);
  CHECK_ERROR(err);
  print_platform_info(platform);

  // Get OpenCL device (only 1)
  err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
  CHECK_ERROR(err);
  print_device_info(device);

  // Create OpenCL context
  context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
  CHECK_ERROR(err);

  // Create OpenCL command queue
  queue = clCreateCommandQueue(context, device, 0, &err);
  CHECK_ERROR(err);

  // Compile program from "kernel.cl"
  program = create_and_build_program_with_source(context, device, "kernel.cl");

  // Extract kernel from compiled program
  kernel = clCreateKernel(program, "sgemm", &err);
  CHECK_ERROR(err);

  // Create GPU buffers. The byte counts are computed in size_t from the start
  // so that the element count cannot overflow `int` for large matrices.
  const size_t a_bytes = (size_t)M * (size_t)K * sizeof(float);
  const size_t b_bytes = (size_t)K * (size_t)N * sizeof(float);
  const size_t c_bytes = (size_t)M * (size_t)N * sizeof(float);

  a_d = clCreateBuffer(context, CL_MEM_READ_WRITE, a_bytes, NULL, &err);
  CHECK_ERROR(err);
  b_d = clCreateBuffer(context, CL_MEM_READ_WRITE, b_bytes, NULL, &err);
  CHECK_ERROR(err);
  c_d = clCreateBuffer(context, CL_MEM_READ_WRITE, c_bytes, NULL, &err);
  CHECK_ERROR(err);
}
/*
 * Release every OpenCL object created by matmul_initialize().
 *
 * Objects are released in the reverse order of their creation, so that the
 * context is released last, after everything that was created from it.
 * Return codes are ignored: this is best-effort teardown.
 */
void matmul_finalize() {
  // Device buffers.
  clReleaseMemObject(c_d);
  clReleaseMemObject(b_d);
  clReleaseMemObject(a_d);
  // Kernel before the program it was extracted from.
  clReleaseKernel(kernel);
  clReleaseProgram(program);
  // Queue, then the context that owns everything above.
  clReleaseCommandQueue(queue);
  clReleaseContext(context);
}
以上是 matmul.c。
#include "util.h"
#include <math.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
// Start timestamps, in seconds as returned by get_time(), for up to 8 timers.
// The slot is selected by the index passed to timer_start()/timer_stop().
static double start_time[8];
// Seed the C library pseudo-random generator (used by rand()) from the
// current calendar time. It does not modify the timer slots.
void timer_init() {
  const time_t now = time(NULL);
  srand(now);
}
// Return the current CLOCK_MONOTONIC reading as seconds in a double
// (whole seconds plus the nanosecond part scaled by 1e-9).
static double get_time() {
  struct timespec now;
  clock_gettime(CLOCK_MONOTONIC, &now);
  const double whole_seconds = now.tv_sec;
  const double fraction = now.tv_nsec * 1e-9;
  return whole_seconds + fraction;
}
// Record the current time as the start of timer slot `i`.
// `i` is used directly as an index into start_time; it is not range-checked.
void timer_start(int i) {
  const double now = get_time();
  start_time[i] = now;
}
// Return the seconds elapsed since timer_start(i) last recorded slot `i`.
// The slot is left unchanged, and `i` is not range-checked.
double timer_stop(int i) {
  const double now = get_time();
  const double elapsed = now - start_time[i];
  return elapsed;
}
/*
 * Allocate an R x S matrix of floats aligned to 32 bytes and store the
 * pointer in *m. The contents are uninitialized; release with free().
 *
 * The request is rounded up to a multiple of the alignment, because C11
 * aligned_alloc() requires the size to be a multiple of the alignment. An
 * empty matrix (R == 0 or S == 0) still gets a valid, freeable pointer.
 * Negative dimensions, a size that does not fit in size_t, or an allocation
 * failure print a message and terminate the program with EXIT_FAILURE.
 */
void alloc_mat(float **m, int R, int S) {
  enum { MAT_ALIGNMENT = 32 };

  *m = NULL;
  if (R >= 0 && S >= 0) {
    const size_t rows = (size_t)R;
    const size_t cols = (size_t)S;
    // Largest element count whose byte size can still be rounded up safely.
    const size_t max_elems =
        ((size_t)-1 - (MAT_ALIGNMENT - 1)) / sizeof(float);
    if (cols == 0 || rows <= max_elems / cols) {
      size_t bytes = rows * cols * sizeof(float);
      bytes = (bytes + MAT_ALIGNMENT - 1) / MAT_ALIGNMENT * MAT_ALIGNMENT;
      if (bytes == 0) {
        bytes = MAT_ALIGNMENT;
      }
      *m = (float *)aligned_alloc(MAT_ALIGNMENT, bytes);
    }
  }

  if (*m == NULL) {
    printf("Failed to allocate memory for mat.\n");
    exit(EXIT_FAILURE);
  }
}
/*
 * Fill the R x S matrix `m` with pseudo-random values in [-0.5, 0.5],
 * drawn from rand().
 *
 * The element count is computed in size_t so that R * S cannot overflow
 * `int`; non-positive dimensions leave `m` untouched.
 */
void rand_mat(float *m, int R, int S) {
  if (R <= 0 || S <= 0) {
    return;
  }
  const size_t count = (size_t)R * (size_t)S;
  for (size_t j = 0; j < count; j++) {
    m[j] = (float)rand() / RAND_MAX - 0.5;
  }
}
/*
 * Set every element of the R x S matrix `m` to 0.0f.
 *
 * The byte count is computed in size_t so that R * S cannot overflow `int`;
 * non-positive dimensions leave `m` untouched.
 */
void zero_mat(float *m, int R, int S) {
  if (R <= 0 || S <= 0) {
    return;
  }
  const size_t count = (size_t)R * (size_t)S;
  memset(m, 0, sizeof(float) * count);
}
// Print the row-major M x N matrix `m` to stdout: one text line per row,
// each element formatted as "%+.3f" and followed by a space.
void print_mat(float *m, int M, int N) {
  for (int row = 0; row < M; ++row) {
    int col = 0;
    while (col < N) {
      const float value = m[row * N + col];
      printf("%+.3f ", value);
      ++col;
    }
    putchar('\n');
  }
}
/*
 * Validate C against a reference product of A and B computed on the host.
 *
 * A is M x K, B is K x N and C is M x N, all row-major. An element of C is
 * accepted when it is within `eps` of the reference either in absolute terms
 * or, for a non-zero reference, in relative terms. NaN values never satisfy
 * either test, so a NaN in C is reported as a mismatch.
 *
 * The first `thr` mismatches are printed, then a single "too many" notice,
 * and finally "Result: VALID" or "Result: INVALID". The temporary reference
 * matrix is freed before returning.
 */
void check_mat_mul(float *A, float *B, float *C, int M, int N, int K) {
  printf("Validating...\n");

  float *C_ans;
  alloc_mat(&C_ans, M, N);
  zero_mat(C_ans, M, N);

  // Reference product. The i-k-j loop order walks B and C_ans row by row;
  // offsets are computed in size_t so large matrices cannot overflow `int`.
#pragma omp parallel for
  for (int i = 0; i < M; ++i) {
    for (int k = 0; k < K; ++k) {
      for (int j = 0; j < N; ++j) {
        C_ans[(size_t)i * N + j] +=
            A[(size_t)i * K + k] * B[(size_t)k * N + j];
      }
    }
  }

  bool is_valid = true;
  int cnt = 0, thr = 10;
  float eps = 1e-3;
  for (int i = 0; i < M; ++i) {
    for (int j = 0; j < N; ++j) {
      float c = C[(size_t)i * N + j];
      float c_ans = C_ans[(size_t)i * N + j];
      // Written as "is it OK?" tests so that NaN, which fails every
      // comparison, is classified as an error instead of slipping through.
      const bool abs_ok = fabsf(c - c_ans) <= eps;
      const bool rel_ok = c_ans != 0 && fabsf((c - c_ans) / c_ans) <= eps;
      if (!abs_ok && !rel_ok) {
        ++cnt;
        if (cnt <= thr)
          printf("C[%d][%d] : correct_value = %f, your_value = %f\n", i, j,
                 c_ans, c);
        if (cnt == thr + 1)
          printf("Too many error, only first %d values are printed.\n", thr);
        is_valid = false;
      }
    }
  }

  if (is_valid) {
    printf("Result: VALID\n");
  } else {
    printf("Result: INVALID\n");
  }

  free(C_ans);
}
以上是 utils.c。
#include <errno.h>
#include <getopt.h>
#include <limits.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "matmul.h"
#include "util.h"
// Print the command-line usage text to stdout. `prog_name` is inserted into
// the first line as the program name.
static void print_help(const char *prog_name) {
  static const char *const option_lines[] = {
      "Options:\n",
      " -p : print matrix. (default: off)\n",
      " -v : validate matrix multiplication. (default: off)\n",
      " -h : print this page.\n",
      " -n : number of iterations (default: 1)\n",
      " M : number of rows of matrix A and C. (default: 8)\n",
      " N : number of columns of matrix B and C. (default: 8)\n",
      " K : number of columns of matrix A and rows of B. (default: 8)\n",
  };
  const size_t line_count = sizeof option_lines / sizeof option_lines[0];

  printf("Usage: %s [-pvh] [-n num_iterations] M N K\n", prog_name);
  for (size_t i = 0; i < line_count; ++i) {
    fputs(option_lines[i], stdout);
  }
}
// Run-time options. The initializers are the defaults; parse_opt() overwrites
// them from the command line.

// -p: print matrices A, B and C after the timed runs.
static bool print_data = false;
// -v: validate C with check_mat_mul().
static bool validation = false;
// Problem size from the positional arguments M, N, K:
// A is M x K, B is K x N, C is M x N.
static int M = 8;
static int N = 8;
static int K = 8;
// -n: number of timed matmul() iterations.
static int num_iterations = 1;
/*
 * Convert `text` to a strictly positive int.
 *
 * `what` names the value in the error message. Empty input, trailing garbage,
 * out-of-range values and values <= 0 are rejected: an error message and the
 * usage text are printed and the program exits with EXIT_FAILURE.
 */
static int parse_positive_int(const char *text, const char *what,
                              const char *prog_name) {
  char *end = NULL;
  errno = 0;
  const long value = strtol(text, &end, 10);
  if (end == text || *end != '\0' || errno == ERANGE || value <= 0 ||
      value > INT_MAX) {
    printf("Invalid value for %s: '%s' (expected a positive integer)\n", what,
           text);
    print_help(prog_name);
    exit(EXIT_FAILURE);
  }
  return (int)value;
}

/*
 * Parse the command line into the file-scope option variables and print a
 * summary of the resulting configuration.
 *
 * Options: -p (print matrices), -v (validate), -n <iterations>, -h (help).
 * Up to three positional arguments set M, N and K in that order; additional
 * positional arguments are ignored. -h prints the usage text and exits with
 * status 0; an unknown option or an invalid number prints the usage text and
 * exits with EXIT_FAILURE.
 */
static void parse_opt(int argc, char **argv) {
  int c;
  // Only the options described by print_help() are accepted.
  while ((c = getopt(argc, argv, "pvhn:")) != -1) {
    switch (c) {
    case 'p':
      print_data = true;
      break;
    case 'v':
      validation = true;
      break;
    case 'n':
      num_iterations =
          parse_positive_int(optarg, "number of iterations", argv[0]);
      break;
    case 'h':
      print_help(argv[0]);
      exit(0);
    default:
      // getopt() has already reported the unknown or incomplete option.
      print_help(argv[0]);
      exit(EXIT_FAILURE);
    }
  }

  // Positional arguments map to M, N, K in order.
  int *const dims[] = {&M, &N, &K};
  static const char *const dim_names[] = {"M", "N", "K"};
  for (int i = optind, j = 0; i < argc && j < 3; ++i, ++j) {
    *dims[j] = parse_positive_int(argv[i], dim_names[j], argv[0]);
  }

  printf("Options:\n");
  printf(" Problem size: M = %d, N = %d, K = %d\n", M, N, K);
  printf(" Number of iterations: %d\n", num_iterations);
  printf(" Print matrix: %s\n", print_data ? "on" : "off");
  printf(" Validation: %s\n", validation ? "on" : "off");
  printf("\n");
}
/*
 * Benchmark driver.
 *
 * Parses the options, allocates and randomizes A (M x K) and B (K x N),
 * initializes OpenCL, runs three untimed warm-up multiplications followed by
 * `num_iterations` timed ones, optionally prints and validates the result,
 * and reports the average time and throughput. The host matrices are freed
 * before returning.
 */
int main(int argc, char **argv) {
  parse_opt(argc, argv);

  printf("Initializing... ");
  fflush(stdout);
  float *A, *B, *C;

  // Initialize random seed
  timer_init();

  // Allocate matrices
  alloc_mat(&A, M, K);
  alloc_mat(&B, K, N);
  alloc_mat(&C, M, N);

  // Set each element to a random value
  rand_mat(A, M, K);
  rand_mat(B, K, N);
  printf("done!\n");

  // Initialize OpenCL
  printf("Initializing OpenCL...\n");
  fflush(stdout);
  matmul_initialize(M, N, K);

  // Few warmup iterations
  zero_mat(C, M, N);
  for (int i = 0; i < 3; i++) {
    matmul(A, B, C, M, N, K);
  }

  double elapsed_time_sum = 0;
  for (int i = 0; i < num_iterations; ++i) {
    printf("Calculating...(iter=%d) ", i);
    fflush(stdout);
    zero_mat(C, M, N);
    timer_start(0);
    matmul(A, B, C, M, N, K);
    double elapsed_time = timer_stop(0);
    printf("%f sec\n", elapsed_time);
    elapsed_time_sum += elapsed_time;
  }

  if (print_data) {
    printf("MATRIX A:\n");
    print_mat(A, M, K);
    printf("MATRIX B:\n");
    print_mat(B, K, N);
    printf("MATRIX C:\n");
    print_mat(C, M, N);
  }

  // Finalize OpenCL
  matmul_finalize();

  if (validation) {
    check_mat_mul(A, B, C, M, N, K);
  }

  // Averages are only meaningful when at least one timed iteration ran;
  // otherwise the division below would produce NaN.
  if (num_iterations > 0) {
    double elapsed_time_avg = elapsed_time_sum / num_iterations;
    printf("Avg. time: %f sec\n", elapsed_time_avg);
    printf("Avg. throughput: %f GFLOPS\n",
           2.0 * M * N * K / elapsed_time_avg / 1e9);
  } else {
    printf("No timed iterations were run.\n");
  }

  // Release the host matrices obtained from alloc_mat().
  free(A);
  free(B);
  free(C);
  return 0;
}
以上是 main.c。
我在 GPU 集群服务器上通过以下脚本(run_performance.sh)运行这段代码:
#!/bin/bash
srun --nodes=1 --exclusive --partition=class1 --gres=gpu:4 ./main $@
但它报了下面这个错误:
bash: ./run_performance.sh: 权限被拒绝
我认为GPU的访问权限没有问题。
我可以解决这个问题吗?
我不确定,但问题似乎出在下面这两行的版本设置上:
#define CL_TARGET_OPENCL_VERSION 120
#define CL_USE_DEPRECATED_OPENCL_2_2_APIS
这是服务器的GPU信息
Number of platforms 1
Platform Name NVIDIA CUDA
Platform Vendor NVIDIA Corporation
Platform Version OpenCL 3.0 CUDA 11.8.88
Platform Profile FULL_PROFILE
Platform Extensions cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_fp64 cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_icd cl_khr_gl_sharing cl_nv_compiler_options cl_nv_device_attribute_query cl_nv_pragma_unroll cl_nv_copy_opts cl_nv_create_buffer cl_khr_int64_base_atomics cl_khr_int64_extended_atomics cl_khr_device_uuid cl_khr_pci_bus_info cl_khr_external_semaphore cl_khr_external_memory cl_khr_external_semaphore_opaque_fd cl_khr_external_memory_opaque_fd
Platform Host timer resolution 0ns
Platform Extensions function suffix NV
Platform Name NVIDIA CUDA
Number of devices 1
Device Name NVIDIA TITAN RTX
Device Vendor NVIDIA Corporation
Device Vendor ID 0x10de
Device Version OpenCL 3.0 CUDA
Driver Version 520.61.05
Device OpenCL C Version OpenCL C 1.2
Device Type GPU
更改 matmul.c 的 #define 部分的版本和 API 版本
看起来 run_performance.sh 没有可执行权限。这个错误是 bash 在启动脚本之前报出的,与 OpenCL 版本的 #define 无关。
在 Linux 中,每个文件都有各自的读/写/执行权限,并且默认情况下执行权限是关闭的,以避免文件被意外执行、对系统造成潜在危害。
就您而言,您希望该文件可执行。要解决这个问题,请运行:
chmod +x run_performance.sh
之后你可以执行它:
./run_performance.sh