下面是一段我尝试用 OpenMP 指令加速的 C 代码，代码之后给出了程序所使用的数据文件的结构。我尝试使用：
#pragma omp parallel for reduction (+: hn_out, y_out) private (k,g) shared (y_out_avg, y_exp_avg)
但是它不起作用：得到的结果是错误的，与串行版本的结果不同。我认为并行化中存在逻辑错误，也就是说这个算法必须换一种方式并行化。
// FEEDFORWARD AND BACKPROPAGATION ALGORITHM
// WITH IMPLEMENTATION OF BATCH TECHNIQUE
// Compute the error over a batch of BATCH_SIZE inputs and then propagate the error; useful for parallelization.
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <omp.h>
#define INPUTN 3 // number of neurons in the input layer
#define HN 3 // number of neurons in the hidden layer
#define OUTN 1 // number of neurons in the output layer
#define DATANUM 1000 // number of training samples
#define EPOCHS 1000
#define BATCH_SIZE 20
// One training sample: INPUTN input features plus the expected (teaching) output.
typedef struct DataS{
double input[INPUTN]; // feature vector read from Dataset_3.txt (columns 1..INPUTN)
double teach; // expected output (teaching value, last column of the file)
}DataS;
int main(){
double alpha = 0.0000001; //learning rate
double hn_out[HN];
double price_M;
double y_out = 0.0;
double error; //loss function
int k,g;
double delta_y;
double delta_w[HN][INPUTN];
double delta_b[HN];
DataS data[DATANUM];
double w[HN][INPUTN];
double v[HN];
double b[HN];
FILE *fp1;
double relative_err = 0;
double y_avg = 0.0;
double y_out_avg = 0.0;
double y_exp_avg = 0.0;
//weights initialization
for(int i=0; i<HN; i++){
v[i]= 1.0;
for(int j=0; j<INPUTN; j++)
w[i][j]= 1.0;
b[i]=0.0;
}
//get Dataset
fp1 = fopen("Dataset_3.txt", "r");
if(fp1 == NULL)
{
printf("cannot open file");
exit(1);
}
for(int i=0;i<DATANUM; i++){
fscanf(fp1, "%lf\t%lf\t%lf\t%lf", &data[i].input[0], &data[i].input[1], &data[i].input[2], &data[i].teach);
printf("%lf\t%lf\t%lf\t%lf\n", data[i].input[0], data[i].input[1], data[i].input[2], data[i].teach);
y_avg += data[i].teach/DATANUM;
}
fclose(fp1);
//START ALGORITHM
double ti = omp_get_wtime(); //initial time
for (int i = 0; i < EPOCHS; i ++) {
printf("\nepoch %d) ", i);
relative_err=0;
#pragma omp parallel for reduction (+: hn_out, y_out) private (k,g) shared (y_out_avg, y_exp_avg)
for(int j=0; j<DATANUM/BATCH_SIZE; j++){
//FEEDFORWARD
//compute hn_out[HN]
int base = j*BATCH_SIZE;
printf("Avg of data:");
for(int i_b=0; i_b<BATCH_SIZE; i_b++){
printf(" %d", base+i_b);
for(k=0; k<HN; k++){
hn_out[k]= 0.0;
}
for(k=0; k<HN; k++){
for(g=0; g<INPUTN; g++){
hn_out[k]+= w[k][g]*data[base+i_b].input[g];
}
hn_out[k]+= b[k];
}
//compute y_out[OUTN]
y_out= 0.0;
for(g=0; g<HN; g++){
y_out += hn_out[g]*v[g];
}
y_out = y_out/HN;
y_out_avg += y_out/BATCH_SIZE;
y_exp_avg += data[base+i_b].teach/BATCH_SIZE;
}
//LOSS FUNCTION
error = pow((y_out_avg-y_exp_avg),2);
printf("\nESTIM_AVG\tREAL_AVG\tRELATIVE_ERROR");
relative_err = fabs((y_out_avg-y_exp_avg)/y_avg); //relative_error: (prezzo calcolato - prezzo atteso)/misura attesa media
printf("\n%lf\t%lf\t%lf\n", y_out_avg, y_exp_avg, relative_err);
//BACKPROPAGATION
//update bias and weight
for(k=0;k<HN;k++){
for(g=0; g<INPUTN; g++){
w[k][g] = w[k][g]-2*alpha*data[j].input[g]*(y_out_avg-y_exp_avg);
v[g]= v[g]-2*alpha*(y_out_avg-y_exp_avg);
}
b[k]= b[k]-2*alpha*(y_out_avg-y_exp_avg);
//b[k]= 0;
}
y_out_avg = 0.0;
y_exp_avg = 0.0;
}
}
double tf = omp_get_wtime(); //final time
double time = tf - ti; //effective time for the execution
printf ("Elapsed time: %lf\n", time);
return 0;
}
程序使用文件 “Dataset_3.txt”，其中包含 1000 行数据。下面是其中 10 行的示例：您可以复制粘贴这些行来构造一个 1000 行的文件，也可以修改代码使其能正确运行。
121.3168139 6.873759459 7 322386.5042
99.60902165 4.63043755 7 284554.0498
135.7221604 6.663354979 4 284796.0999
133.7192657 3.496973506 7 343977.1519
155.0125801 2.259712681 8 390169.2343
152.0527816 3.643403786 4 309419.1429
64.71485146 5.10618215 7 235827.262
130.6841885 5.405015338 4 280079.0986
56.36704 1.557336041 5 193401.2459
96.33489022 2.840480371 4 234694.1379
需要一些帮助以使用openmp加速程序执行。
您放置 OpenMP 指令的位置不正确，因为 j 循环中包含太多不适合并行执行的内容。不过，您可以考虑并行化 i_b 循环。例如，一个不错的起点是：
#pragma omp parallel for reduction(+:y_out_avg,y_exp_avg) private(k,g,y_out,hn_out)
如果您确认了代码的正确性，并且希望进一步深入并行化，那么可以研究 “BACKPROPAGATION” 循环，看看在那里还能做些什么……