如何使用openmp并行化此c代码?

问题描述 投票:2回答:1

这是使用openmp指令进行编辑以提高执行速度并且低于程序所使用文件结构的c代码。我尝试使用:

#pragma omp parallel for reduction (+: hn_out, y_out) private (k,g) shared (y_out_avg, y_exp_avg)

但是它不起作用,预期结果是错误的,并且与串行结果不同。我认为并行化存在逻辑错误,我的意思是该算法必须以另一种方式并行化。

// FEEDFORWARD AND BACKPROPAGATION ALGORITHM
// WITH IMPLEMENTAtION OF BATCH TECHNIQUE 
// compute the error in a batch of 5 input and then propagate the error, usefull for the parallelization.
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <omp.h>

#define INPUTN 3        // number of neurons in the input layer
#define HN 3            // number of neurons in the hidden layer
#define OUTN 1          // number of neurons in the output layer
#define DATANUM 1000    // number of training samples
#define EPOCHS 1000
#define BATCH_SIZE 20


typedef struct DataS{
        double input[INPUTN];
        double teach;
}DataS;

int main(){

    double alpha = 0.0000001;       //learning rate
    double hn_out[HN];
    double price_M;
    double y_out = 0.0;
    double error;                   //loss function 
    int k,g;
    double delta_y;
    double delta_w[HN][INPUTN];
    double delta_b[HN];
    DataS data[DATANUM];
    double w[HN][INPUTN];
    double v[HN];
    double b[HN];
    FILE *fp1;
    double relative_err = 0;
    double y_avg = 0.0;
    double y_out_avg = 0.0;
    double y_exp_avg = 0.0;

    //weights initialization
    for(int i=0; i<HN; i++){
        v[i]= 1.0;
        for(int j=0; j<INPUTN; j++)
            w[i][j]= 1.0;
        b[i]=0.0;
    }

    //get Dataset
    fp1 = fopen("Dataset_3.txt", "r");
    if(fp1 == NULL)
    {
        printf("cannot open file");
        exit(1);
    }
    for(int i=0;i<DATANUM; i++){
        fscanf(fp1, "%lf\t%lf\t%lf\t%lf", &data[i].input[0], &data[i].input[1], &data[i].input[2], &data[i].teach);
        printf("%lf\t%lf\t%lf\t%lf\n", data[i].input[0], data[i].input[1], data[i].input[2], data[i].teach);
        y_avg += data[i].teach/DATANUM;
    }
    fclose(fp1);

    //START ALGORITHM
    double ti = omp_get_wtime();    //initial time
    for (int i = 0; i < EPOCHS; i ++) {
        printf("\nepoch %d) ", i);
        relative_err=0;
        #pragma omp parallel for reduction (+: hn_out, y_out) private (k,g) shared (y_out_avg, y_exp_avg)
        for(int j=0; j<DATANUM/BATCH_SIZE; j++){
            //FEEDFORWARD
            //compute hn_out[HN]
            int base = j*BATCH_SIZE;
            printf("Avg of data:");
            for(int i_b=0; i_b<BATCH_SIZE; i_b++){
                printf(" %d", base+i_b);
                for(k=0; k<HN; k++){
                    hn_out[k]= 0.0;
                }

                for(k=0; k<HN; k++){
                    for(g=0; g<INPUTN; g++){
                        hn_out[k]+= w[k][g]*data[base+i_b].input[g];
                    }
                    hn_out[k]+= b[k];
                }

                //compute y_out[OUTN]
                y_out= 0.0;             
                for(g=0; g<HN; g++){
                    y_out += hn_out[g]*v[g];
                }
                y_out = y_out/HN;
                y_out_avg += y_out/BATCH_SIZE;
                y_exp_avg += data[base+i_b].teach/BATCH_SIZE;
            } 
            //LOSS FUNCTION
            error = pow((y_out_avg-y_exp_avg),2);
            printf("\nESTIM_AVG\tREAL_AVG\tRELATIVE_ERROR");
            relative_err = fabs((y_out_avg-y_exp_avg)/y_avg);   //relative_error: (prezzo calcolato - prezzo atteso)/misura attesa media
            printf("\n%lf\t%lf\t%lf\n", y_out_avg, y_exp_avg, relative_err);

            //BACKPROPAGATION
            //update bias and weight
            for(k=0;k<HN;k++){
                for(g=0; g<INPUTN; g++){
                    w[k][g] = w[k][g]-2*alpha*data[j].input[g]*(y_out_avg-y_exp_avg);
                    v[g]= v[g]-2*alpha*(y_out_avg-y_exp_avg);
                }
                b[k]= b[k]-2*alpha*(y_out_avg-y_exp_avg);
                //b[k]= 0;
            }
            y_out_avg = 0.0;
            y_exp_avg = 0.0;
        }
    }
    double tf = omp_get_wtime();    //final time
    double time = tf - ti;          //effective time for the execution
    printf ("Elapsed time: %lf\n", time);  
    return 0;
}

使用文件“ Dataset_3.txt”,其中有1000行数据,这里是10个数据的示例:您可以复制并粘贴并创建1000行的文件,也可以编辑代码以使其正确运行。

121.3168139 6.873759459 7   322386.5042
99.60902165 4.63043755  7   284554.0498
135.7221604 6.663354979 4   284796.0999
133.7192657 3.496973506 7   343977.1519
155.0125801 2.259712681 8   390169.2343
152.0527816 3.643403786 4   309419.1429
64.71485146 5.10618215  7   235827.262
130.6841885 5.405015338 4   280079.0986
56.36704    1.557336041 5   193401.2459
96.33489022 2.840480371 4   234694.1379

需要一些帮助以使用openmp加速程序执行。

c parallel-processing neural-network openmp static-code-analysis
1个回答
0
投票

您放置OpenMP指令的位置不正确,因为j循环中有太多内容不打算并行执行。但是,您可以考虑并行化i_b循环。对于这一点,例如,一个很好的陈述点是:

#pragma omp parallel for reduction(+:y_out_avg,y_exp_avg) private(k,g,y_out,hn_out)

如果/如果对代码的正确性感到满意,并且希望进一步深入并行化,那么可以考虑使用“ BACKPROPAGATION”循环,看看在那里可以做什么...

© www.soinside.com 2019 - 2024. All rights reserved.