一段时间以来,我一直在尝试并行执行高斯消除过程。似乎内核正在忽略设置的障碍,执行所有可能的操作,然后让下一个内核执行其工作。但是我需要他们反复合作。我的输入A是修改后的矩阵,最后一列是图形化的“输出”。换句话说:每个内核在单独的第j行上执行行缩减,以使第i列中的元素为零。屏障。冲洗并重复。最后将有一个单位矩阵。执行第j行的相同内核还将x [j]分配给最后一列中的值。
主程序创建一个以2D形式组织的指定矩阵,然后使其成为内核的1D矩阵。它自行解决串行问题,并将串行结果与传递回的opencl结果进行比较。
主要:
// Entry point
int main(){
// Initialize OpenCL.
if(!init_opencl()){return -1;}
// Request user for size of parallel operation
kr=k_input();
// Initialize the problem data. Requires number of devices to be known
init_problem();
// Run the kernel.
run();
// Free the resources allocated
cleanup();
return 0;
}
// Initializes the OpenCL objects
bool init_opencl(){
cl_int status;
printf("Initializing OpenCL\n");
if(!setCwdToExeDir()) {
return false; }
// Get the OpenCL platform.
platform = findPlatform("Intel"); // CHANGE TO Altera FOR FINAL
if(platform == NULL) {
printf("ERROR: Unable to find Intel OpenCL platform.\n");
return false; }
// Query the available OpenCL device.
device.reset(getDevices(platform, CL_DEVICE_TYPE_ALL, &num_devices));
printf("Platform: %s\n", getPlatformName(platform).c_str());
printf("Using %d device(s)\n", num_devices);
for(unsigned i = 0; i < num_devices; ++i) {
printf(" %s\n\n", getDeviceName(device[i]).c_str()); // ANOTHER LINE FOR PRINTF DEBUGGING
}
// Create the context.
context = clCreateContext(NULL, num_devices, device, NULL, NULL, &status);
checkError(status, "Failed to create context");
// Create the program for all device. Use the first device as the
// representative device (assuming all device are of the same type).
std::string binary_file = getBoardBinaryFile("GaussianEl", device[0]);
printf("Using AOCX: %s\n", binary_file.c_str());
program = createProgramFromBinary(context, binary_file.c_str(), device, num_devices);
// Build the program that was just created.
status = clBuildProgram(program, 0, NULL, "", NULL, NULL);
checkError(status, "Failed to build program");
// Create per-device objects.
queue.reset(num_devices);
kernel.reset(num_devices);
n_per_device.reset(num_devices);
input_a_buf.reset(num_devices);
input_b_buf.reset(num_devices);
output_buf.reset(num_devices);
n = Make_Matrix(A);N=n;
for (int j = 0; j < n ; j++)
{
for (int k = 0 ; k < n+1 ; k++)
{
parallelA[k+j*(1+n)]=A[j][k];
}
}
for(unsigned i = 0; i < num_devices; ++i) {
// Command queue.
queue[i] = clCreateCommandQueue(context, device[i], CL_QUEUE_PROFILING_ENABLE, &status);
checkError(status, "Failed to create command queue");
// Kernel.
const char *kernel_name = "GaussianEl";
kernel[i] = clCreateKernel(program, kernel_name, &status);
checkError(status, "Failed to create kernel");
// Determine the number of elements processed by this device.
n_per_device[i] = N / num_devices; // Maybe change to n ?
// Spread out the remainder of the elements over the first
// N % num_devices.
if(i < (N % num_devices)) {
n_per_device[i]++; }
// Input buffers. // A, x, n
input_a_buf[i] = clCreateBuffer(context, CL_MEM_READ_WRITE,
n_per_device[i]*(n_per_device[i]+1)*sizeof(float), NULL, &status);
checkError(status, "Failed to create buffer for input A");
input_b_buf[i] = clCreateBuffer(context, CL_MEM_READ_ONLY,
sizeof(int), NULL, &status);
checkError(status, "Failed to create buffer for input B");
// Output buffer.
output_buf[i] = clCreateBuffer(context, CL_MEM_READ_WRITE,
n_per_device[i] * sizeof(float), NULL, &status);
checkError(status, "Failed to create buffer for output");
}
return true;
}
// Request value for k
long int k_input(){
int ch;
char *p;
char buffer[16];
while ( ( ch=getchar() ) != '\n' && (ch != EOF) ){} // clear input stream
printf ( "In base 2, ranging from 1 to 1024, enter the number of parallel threads to execute: " );
INPUT:
if( fgets( buffer,sizeof(buffer),stdin ) != NULL ) // waits for input
{
long int kr = strtol(buffer,&p,10);
if( buffer[0] != '\n' && (*p == '\n' || *p == '\0'))
{
float x = log2(kr);
if( fmodf(x,1) == 0 )
{
if( kr<=1024 )
{
printf("%ld kernals will be launched, good job!\n\n",kr);
return kr;
}
else
{
printf("Too many kernals, must be from 1 to 1024.\n");
goto INPUT;
}
}
else
printf("%ld is not a power of 2, try again..\n",kr);
goto INPUT;
}
else //
{
for( int i = 0;i<strlen(buffer);i++ )
{
if( buffer[i]=='\n' )
buffer[i]='\0';
}
printf("Received %ld. Invalid input: \"%s\". Fatal Error.\n", kr, p);
return 0;
}
}
else // fgets is NULL due to error or EOF condition
printf ( "Error reading input.\n" );
return 0;
}
// Initialize the data for the problem. Requires num_devices to be known
void init_problem(){
if(num_devices == 0) {
checkError(-1, "No devices"); }
input_a.reset(num_devices);
input_b.reset(num_devices);
output.reset(num_devices);
ref_output.reset(num_devices);
// Generate input vectors A and B and the reference output consisting
// of a total of N elements.
// We create separate arrays for each device so that each device has an
// aligned buffer.
for(unsigned i = 0; i < num_devices; ++i) {
input_a[i].reset(n_per_device[i]*(n_per_device[i]+1));
input_b[i].reset(1);
output[i].reset(n_per_device[i]);
ref_output[i].reset(n_per_device[i]);
input_a[i]=parallelA;
input_b[i][0]=n;
// Serial Code
start_time = getCurrentTimestamp();
Solve_Matrix(n,A,x);
ref_output[i]=x; // ref_output[1]
end_time = getCurrentTimestamp();
}
Print_Solution(n,x);
}
// Run kernel ///////////////////////////////////
void run(){
cl_int status;
// Launch the problem for each device.
scoped_array<cl_event> kernel_event(num_devices);
scoped_array<cl_event> finish_event(num_devices);
for(unsigned i = 0; i < num_devices; ++i) {
// Transfer inputs to each device. Each of the host buffers supplied to
// clEnqueueWriteBuffer here is already aligned to ensure that DMA is used
// for the host-to-device transfer.
//status = clEnqueueMapBuffer(queue[i],input_a_buf[i], CL_TRUE, ,
// 0, n_per_device[i]*(n_per_device[i]+1)*sizeof(float), 0, NULL, &write_event[0]);
cl_event write_event[2];
status = clEnqueueWriteBuffer(queue[i], input_a_buf[i], CL_TRUE,
0, n_per_device[i]*(n_per_device[i]+1)*sizeof(float), input_a[i], 0, NULL, &write_event[0]);
checkError(status, "Failed to transfer input A");
// WRITTEN FROM input_a // CL_FALSE
status = clEnqueueWriteBuffer(queue[i], input_b_buf[i], CL_FALSE,
0, sizeof(int), input_b[i], 0, NULL, &write_event[1]);
checkError(status, "Failed to transfer input B");
// WRITTEN FROM input_b
// Set kernel arguments.
unsigned argi = 0;
status = clSetKernelArg(kernel[i], argi++, sizeof(cl_mem), &input_a_buf[i]);
checkError(status, "Failed to set argument %d", argi - 1);
status = clSetKernelArg(kernel[i], argi++, sizeof(cl_mem), &input_b_buf[i]);
checkError(status, "Failed to set argument %d", argi - 1);
status = clSetKernelArg(kernel[i], argi++, sizeof(cl_mem), &output_buf[i]);
checkError(status, "Failed to set argument %d", argi - 1);
// Enqueue kernel.
// Use a global work size corresponding to the number of elements to add
// for this device.
//
// We don't specify a local work size and let the runtime choose
// (it'll choose to use one work-group with the same size as the global
// work-size).
//
// Events are used to ensure that the kernel is not launched until
// the writes to the input buffers have completed.
const size_t global_work_size = kr; // kr = (1024, 512, 256, etc)
printf("\nLaunching for device %d (%lu elements)\n", i, global_work_size);
status = clEnqueueNDRangeKernel(queue[i], kernel[i], 1, NULL,
&global_work_size, NULL, 2, write_event, &kernel_event[i]);
checkError(status, "Failed to launch kernel");
// Read the result. This the final operation.
status = clEnqueueReadBuffer(queue[i], output_buf[i], CL_FALSE,
0, n_per_device[i]*sizeof(float), output[i], 1, &kernel_event[i], &finish_event[i]);
// READ INTO output WHICH IS parallelx
// Release local events.
clReleaseEvent(write_event[0]);
clReleaseEvent(write_event[1]);
}
// Wait for all devices to finish.
clWaitForEvents(num_devices, finish_event);
//parallelx=*output[1]; /////////////////////////////////// not required for pass
//Print_Solution(n,parallelx);
// Wall-clock time taken.
Serial_Time = (end_time - start_time) * 1e3;
printf("\nSerial Time: %0.3f ms\n", Serial_Time);
// Get kernel times using the OpenCL event profiling API.
for(unsigned i = 0; i < num_devices; ++i) {
cl_ulong time_ns = getStartEndTime(kernel_event[i]);
printf("Kernel time (device %d): %0.3f ms\n", i, double(time_ns) * 1e-6);
printf("Speed-up is: %0.3f", Serial_Time/(double(time_ns)*1e-6));
}
// Release all events.
for(unsigned i = 0; i < num_devices; ++i) {
clReleaseEvent(kernel_event[i]);
clReleaseEvent(finish_event[i]);
}
// Verify results.
bool pass = true;
for(unsigned i = 0; i < num_devices && pass; ++i) {
for(unsigned j = 0; j < n_per_device[i] && pass; ++j) {
if(fabsf(output[i][j] - ref_output[i][j]) > 1.0e-5f) {
printf("Failed verification @ device %d, index %d\nOutput: %f\nReference: %f\n",
i, j, output[i][j], ref_output[i][j]);
pass = false;
}
}
}
printf("\nVerification: %s\n", pass ? "PASS" : "FAIL");
}
// Free the resources allocated during initialization
void cleanup(){
for(unsigned i = 0; i < num_devices; ++i) {
if(kernel && kernel[i]) {
clReleaseKernel(kernel[i]);
}
if(queue && queue[i]) {
clReleaseCommandQueue(queue[i]);
}
if(input_a_buf && input_a_buf[i]) {
clReleaseMemObject(input_a_buf[i]);
}
if(input_b_buf && input_b_buf[i]) {
clReleaseMemObject(input_b_buf[i]);
}
if(output_buf && output_buf[i]) {
clReleaseMemObject(output_buf[i]);
}
}
if(program) {
clReleaseProgram(program);
}
if(context) {
clReleaseContext(context);
}
}
内核:
__kernel void GaussianEl(__global float *A,
__global int *y,
__global float *restrict z)
{
unsigned index = get_global_id(0); // get index of the work item
unsigned kr = get_global_size(0); // number of global work items
unsigned n = *y; // dimension of matrix
float scale; // initialize scale
int a,b,i,j,k,h; // variables
if(index==0) { // Show received info is good
printf("----------------------\nThe dimension size is: %d\nThe kr is: %d\nThe original matrix is:\n",n,kr); //
print_matrix(n, index, A);
}
for(i=0; i<n; i++) //* LOOP FOR UPPER TRIANGULAR MATRIX
{
if( A[i+i*(n+1)] == 0 ){printf("Error, divide by zero encountered.\n");} // do something for if divide by zero for re-calculating matrix?
for(j=0; j<n; j++) // THIS FOR REPRESENTS THE jTH ROW OF THE iTH COLUMN PARALLELIZED
{ // BARRIER MUST BE PLACED AFTER THE iTH COLUMN ELEMENT IN A ROW IS ZEROED
h=j/kr; // h=1/2=0, h=2/2=1 E.G VALUES FOR 3X3 KR=2
if(j==i && index==(j-h*kr))
{
scale=A[i+i*(n+1)];
printf("kernel=%d, i=%d, j=%d, h=%d, scale=%.1f/%.1f, Value=%d\n", index, i, j, h, A[i+j*(n+1)], A[i+i*(n+1)],(j-h*kr) );
for(k=i; k<=n; k++)
{
A[k+j*(n+1)]=A[k+j*(n+1)]/scale;
}
print_matrix(n, index, A);
}
if( (j>i) && (index==(j-h*kr)) )
{
scale=A[i+j*(n+1)]/A[i+i*(n+1)];
printf("kernel=%d, i=%d, j=%d, h=%d, scale=%.1f/%.1f, Value=%d\n", index, i, j, h, A[i+j*(n+1)], A[i+i*(n+1)],(j-h*kr) );
for(k=0; k<=n; k++)
{
A[k+j*(n+1)]=A[k+j*(n+1)]-scale*A[k+i*(n+1)];
}
print_matrix(n, index, A);
}
}
barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
}
... // 1 more loop after this to finish upper half
结果:
The serial/parallel solution is:
x0=3.0
x1=4.0
x2=-2.0
Launching for device 0 (2 elements)
----------------------
The dimension size is: 3
The kr is: 2
The original matrix is:
1.0 1.0 1.0 5.0
2.0 3.0 5.0 8.0
4.0 0.0 5.0 2.0
kernel=0, i=0, j=0, h=0, scale=1.0/1.0, Value=0
1.0 1.0 1.0 5.0
2.0 3.0 5.0 8.0
4.0 0.0 5.0 2.0
kernel=0, i=0, j=2, h=1, scale=4.0/1.0, Value=0
1.0 1.0 1.0 5.0
2.0 3.0 5.0 8.0
0.0 -4.0 1.0 -18.0
kernel=0, i=1, j=2, h=1, scale=-4.0/3.0, Value=0
1.0 1.0 1.0 5.0
2.0 3.0 5.0 8.0
2.7 0.0 7.7 -7.3
kernel=0, i=2, j=2, h=1, scale=7.7/7.7, Value=0
1.0 1.0 1.0 5.0
2.0 3.0 5.0 8.0
2.7 0.0 1.0 -1.0
..and now the rest...
test 1
test 2: i=0,j=0, h=0
test 2: i=0,j=1, h=0
test 2: i=0,j=2, h=1
test 1
test 2: i=1,j=0, h=0
kernel=0, i=1, j=0, h=0, scale=1.0/3.0, Value=0
0.3 0.0 -0.7 2.3
2.0 3.0 5.0 8.0
2.7 0.0 1.0 -1.0
test 2: i=1,j=1, h=0
test 2: i=1,j=2, h=1
test 1
test 2: i=2,j=0, h=0
kernel=0, i=2, j=0, h=0, scale=-0.7/1.0, Value=0
2.1 0.0 0.0 1.7
2.0 3.0 5.0 8.0
2.7 0.0 1.0 -1.0
test 2: i=2,j=1, h=0
test 2: i=2,j=2, h=1
The value of x0 is: 1.7
The value of x1 is: 8.0
The value of x2 is: -1.0
kernel=1, i=0, j=1, h=0, scale=2.0/2.1, Value=1
kernel=1, i=1, j=1, h=0, scale=3.0/3.0, Value=1
test 1
test 2: i=0,j=0, h=0
test 2: i=0,j=1, h=0
test 2: i=0,j=2, h=1
test 1
test 2: i=1,j=0, h=0
test 2: i=1,j=1, h=0
test 2: i=1,j=2, h=1
test 1
test 2: i=2,j=0, h=0
test 2: i=2,j=1, h=0
test 2: i=2,j=2, h=1
The value of x0 is: 1.7
The value of x1 is: 2.1
The value of x2 is: -1.0
Serial Time: 0.570 ms
Kernel time (device 0): 1.442 ms
Speed-up is: 0.395Failed verification @ device 0, index 0
Output: 1.695652
Reference: 3.000000
Verification: FAIL
该问题似乎发生,因为我正在使用API并以与以前的实验相同的方式调度内核。问题在于,在以前的实验中,内核执行了各自的部分并返回了。当我在内核代码中调试打印语句时,命令窗口将一个接一个地显示它们。那很好,并且有效,我可以看到它是如何工作的,因为我可以看到加速(当我将代码上传到Altera板上时,而不是在模拟课程期间)。但是,现在我需要在内核完成之前以全局方式返回结果,这需要多次发生,并且所有内核都必须能够将此更改读取到其他内核执行的矩阵中。我一直在鬼混clEnqueueMapBuffer,但是我什至不确定这是否是正确的API或用替换clEnqueueWriteBuffer的API。今晚我将按照我的教授的要求删除这个问题。
首先,您应该从一开始就提到它与Altera FPGA有关,因为OpenCL在这里的工作方式与在GPU上有所不同。
[根据我的经验,特别是在使用FPGA时,您需要逐步进行。因此:
这几乎是我为Altera FPGA开发内核时所记得的。