CUDA架构的高效实现

问题描述 投票:0回答:0

您对优化 CUDA 内核、

cuKer_det
cuKer_sum
有什么建议吗?

我在下面提供了完整的详细代码。请不要被下面代码的篇幅吓到。我很乐意解答您想到的任何问题。

我相信GPU函数

cuKer_sum
cuKer_det
都可以实现更好的优化。

在这些内核中,我们对在

g_nxyz * g_temp_nt
个线程块中独立计算出的不同行列式进行求和。内核内部的求和有点复杂,所以修改它可能具有挑战性。不过,任何建议我都将不胜感激。谢谢!

这是我目前的计时结果:

compiling...
running...

INIT
INIT 1.490000 s
NT_LOOP
-- QPROP 3.030000 s
-- INIT SUM 0.000000 s
-- CUKER 5.390000 s
nt 0     sum 1.2126701710009340E-06,2.0850628227617269E-09
nt 1     sum 1.2511169268362081E-05,2.9725532097971312E-08
-- FINAL SUM 0.000000 s
NT_LOOP 8.420000 s
CUFREE 0.000000 s
TOTAL EXEC: 9.91000 s
CU_DEV_RESET: 0.020000000 s

real    0m10.633s
user    0m5.749s
sys 0m4.706s
task finished!

代码可以使用以下脚本编译并运行:

#!/usr/bin/env bash
# Build np.cu with nvcc, then run the resulting binary under `time`.
# NOTE: there is no `set -e`, so the run step still executes (possibly on a
# stale np.out) when the compile step fails.

echo compiling...
# -arch=sm_70 selects compute capability 7.0 as the target architecture.
nvcc -arch=sm_70 np.cu -o np.out

echo running...

# `time` reports the real/user/sys durations of the whole run.
time ./np.out

echo task finished!

np.cu
(抱歉,无法提供
q_nx48_nt144
文件。它是35GB。)

#include "np.cuh"
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <complex.h>
#include <sys/mman.h>
#include <math.h>

#include <sys/time.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>

#include <inttypes.h>
#include <string.h>
#include <assert.h>

// DEFINE CONST
// Compile-time problem sizes used by both the host code and the kernels.

    // Number of (where, deg) pairs read from imp_symm.txt.
    const uint64_t g_count_imp_symm= 140126;
    // Number of rows read from deg_index.txt (12 ints + 1 float per row).
    const uint64_t g_count_deg_index= 38865;
    // Threads per block for cuKer_sum; also sizes its shared-memory arrays.
    const uint64_t g_threads= 128;

    // Number of (deg_where_d, deg_len) pairs read from imp_count.txt.
    const uint64_t g_count_imp_count= 37586;
    // Consecutive outer terms handled by one thread in cuKer_sum. This is a
    // hand-computed value and must be updated whenever g_count_imp_count or
    // g_threads changes.
    const uint64_t g_count_imp_count_per_thread= 294; // ceil(g_count_imp_count / g_threads)

    // Full time extent, and the number of time slices actually loaded and
    // processed in one run (one kernel block per slice and lattice site).
    const uint64_t g_nt = 144;
    const uint64_t g_temp_nt = 2;

    // Spatial extents; g_nxyz is their product (48 * 48 * 48).
    const uint64_t g_nx = 48;
    const uint64_t g_ny = 48;
    const uint64_t g_nz = 48;
    const uint64_t g_nxyz = 110592;
    // Per-site index extents: each site holds g_nc * g_nd * g_nc * g_nd
    // complex values (presumably colour and Dirac indices -- confirm).
    const uint64_t g_nc = 3;
    const uint64_t g_nd = 4;

    // Second set of names for the same sizes, used when indexing the
    // propagator file in main().
    const uint64_t LEN = 48;
    const uint64_t XDIM = (LEN*LEN*LEN);
    const uint64_t ADIM = 3;
    const uint64_t PDIM = 4;
    // Number of real components per complex value (real part, imaginary part).
    const uint64_t NRI = 2;
    const uint64_t T = 144;
    // Complex element counts for all T time slices and for g_temp_nt slices.
    const uint64_t LEN_PROP_T = (XDIM * T * ADIM * ADIM * PDIM * PDIM);
    const uint64_t LEN_PROP_TEMP = (XDIM * g_temp_nt * ADIM * ADIM * PDIM * PDIM);


// Checks the cudaError_t returned by a CUDA runtime call and reports failures.
// Usage: gpuErrchk(cudaMalloc(&ptr, bytes));
// The do { } while (0) wrapper makes the macro behave as a single statement,
// so it stays correct inside an unbraced if/else.
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

// Prints the CUDA error string with the originating file and line to stderr
// when `code` is not cudaSuccess. If `abort` is true (the default) the process
// then exits, using the numeric error code as its exit status.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) {
   if (code != cudaSuccess)
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

// Reverses the byte order of an 8-byte double: the value is treated as raw
// bytes, the bytes are mirrored end-for-end, and the result is reinterpreted
// as a double. Applying the function twice restores the original bit pattern.
double big_to_little(double big_endian) {
    uint8_t raw[8];
    memcpy(raw, &big_endian, 8);

    // Swap the outer pair and walk inwards until the two cursors meet.
    for (int lo = 0, hi = 7; lo < hi; ++lo, --hi) {
        const uint8_t held = raw[lo];
        raw[lo] = raw[hi];
        raw[hi] = held;
    }

    double swapped;
    memcpy(&swapped, raw, 8);
    return swapped;
}


// Computes one weighted 3x3 complex determinant for a single lattice site.
//
//   gid        index of the site block inside d_tqprop; each block holds
//              g_nc * g_nc * g_nd * g_nd complex values
//   tx         row of the index table to use
//   d_tqprop   propagator values
//   d_deg_ind  index table, 12 ints per row:
//              [0..2] b1..b3, [3..5] b1p..b3p, [6..8] bt1..bt3, [9..11] bt1p..bt3p
//   d_deg_c    one real weight per table row
//
// Matrix element (r, c) is read from offset b_r*48 + bt_r*12 + b_cp*4 + bt_cp
// inside the site block. NOTE(review): this presumably selects
// (colour, spin) -> (colour, spin) components; confirm against the layout
// written by main().
// Returns d_deg_c[tx] * det(A), with det expanded along the first row.
__device__ cuDoubleComplex cuKer_det(int gid, int tx,
                                     cuDoubleComplex *d_tqprop, 
                                     int *d_deg_ind,
                                     float *d_deg_c) {

    const int *entry = d_deg_ind + 12 * tx;

    // Offset contributed by the row indices (b, bt) and by the column
    // indices (bp, btp) of each of the three matrix rows / columns.
    int row_off[3];
    int col_off[3];
    #pragma unroll
    for (int n = 0; n < 3; n++) {
        row_off[n] = entry[n] * 4 * 3 * 4 + entry[6 + n] * 3 * 4;
        col_off[n] = entry[3 + n] * 4 + entry[9 + n];
    }

    // First element of this site's block.
    const uint64_t site = gid * g_nc * g_nc * g_nd * g_nd;

    cuDoubleComplex m[3][3];
    #pragma unroll
    for (int r = 0; r < 3; r++) {
        #pragma unroll
        for (int c = 0; c < 3; c++) {
            m[r][c] = d_tqprop[site + row_off[r] + col_off[c]];
        }
    }

    // 2x2 minors belonging to the first-row elements.
    const cuDoubleComplex minor0 = cuCsub(cuCmul(m[1][1], m[2][2]),
                                          cuCmul(m[1][2], m[2][1]));
    const cuDoubleComplex minor1 = cuCsub(cuCmul(m[1][0], m[2][2]),
                                          cuCmul(m[1][2], m[2][0]));
    const cuDoubleComplex minor2 = cuCsub(cuCmul(m[1][0], m[2][1]),
                                          cuCmul(m[1][1], m[2][0]));

    const cuDoubleComplex term0 = cuCmul(m[0][0], minor0);
    const cuDoubleComplex term1 = cuCmul(m[0][1], minor1);
    const cuDoubleComplex term2 = cuCmul(m[0][2], minor2);

    // det = term0 - term1 + term2, then scaled by the real weight.
    const cuDoubleComplex det = cuCadd(cuCsub(term0, term1), term2);
    const cuDoubleComplex weight = make_cuDoubleComplex(d_deg_c[tx], 0.0);

    return cuCmul(weight, det);

}

#define SHMEM_SIZE (g_threads)
// Sums weighted products of determinants for one lattice site per block.
//
// Launch layout: 1-D grid with one block per (time slice, site) pair, so
// blockIdx.x is the site index into d_tqprop and d_sum_nxyz. blockDim.x must
// be a power of two no larger than SHMEM_SIZE (main() launches g_threads), or
// the tree reduction below would skip elements or overrun shared memory.
// Shared memory: 2 * SHMEM_SIZE doubles, statically allocated.
//
// Thread `tid` handles outer terms
//   [tid * g_count_imp_count_per_thread, (tid + 1) * g_count_imp_count_per_thread)
// clipped to g_count_imp_count. For each outer term tx it computes
//   det(d_deg_where_d[tx]) * sum_k d_deg[k] * det(d_where[k])
// with k in [d_start_deg[tx], d_start_deg[tx] + d_deg_len[tx]).
// The per-thread totals are reduced in shared memory and thread 0 writes the
// block total to d_sum_nxyz[blockIdx.x].
__global__ void cuKer_sum(cuDoubleComplex *d_tqprop, 
                          cuDoubleComplex *d_sum_nxyz,
                          int *d_deg_ind, int *d_deg_where_d,
                          int *d_deg_len, int *d_where, int *d_start_deg,
                          float *d_deg_c, float *d_deg) {

    const int tid = threadIdx.x;
    const int gid = blockIdx.x;

    __shared__ double sh_sum_re[SHMEM_SIZE];
    __shared__ double sh_sum_im[SHMEM_SIZE];

    // Per-thread accumulator. It must start at zero: the original code
    // accumulated into a variable that was never initialised.
    cuDoubleComplex thread_sum = make_cuDoubleComplex(0.0, 0.0);

    const int min_tx_sum = (tid) * g_count_imp_count_per_thread;
    const int max_tx_sum = (tid + 1) * g_count_imp_count_per_thread;

    for (int tx = min_tx_sum; tx < max_tx_sum; tx++) {
        // The last thread's range can extend past the end of the table.
        if (tx >= g_count_imp_count) {break;}

        const cuDoubleComplex outer =
            cuKer_det(gid, d_deg_where_d[tx], d_tqprop, d_deg_ind, d_deg_c);

        cuDoubleComplex inner_sum = make_cuDoubleComplex(0.0, 0.0);

        const int start = d_start_deg[tx];
        const int stop = start + d_deg_len[tx];
        for (int k = start; k < stop; k++) {
            const cuDoubleComplex inner =
                cuKer_det(gid, d_where[k], d_tqprop, d_deg_ind, d_deg_c);

            inner_sum = cuCadd(inner_sum,
                               cuCmul(inner, make_cuDoubleComplex(d_deg[k], 0.0)));
        }

        thread_sum = cuCadd(cuCmul(inner_sum, outer), thread_sum);
    }

    sh_sum_re[tid] = cuCreal(thread_sum);
    sh_sum_im[tid] = cuCimag(thread_sum);
    __syncthreads();

    // Tree reduction in shared memory. The barrier sits outside the `if` so
    // that every thread of the block reaches it on every step.
    for (int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s) {
            sh_sum_re[tid] += sh_sum_re[tid + s];
            sh_sum_im[tid] += sh_sum_im[tid + s];
        }
        __syncthreads();
    }

    if (tid == 0) {
        d_sum_nxyz[gid] = make_cuDoubleComplex(sh_sum_re[0], sh_sum_im[0]);
    }

}



// Host-side allocation helper: returns `bytes` of heap memory, or terminates
// the program with a message naming the buffer that could not be allocated.
static void *checked_malloc(size_t bytes, const char *what) {
    void *ptr = malloc(bytes);
    if (ptr == NULL) {
        fprintf(stderr, "malloc of %zu bytes for %s failed\n", bytes, what);
        exit(EXIT_FAILURE);
    }
    return ptr;
}

// Program entry point.
//
//  1. INIT      read the three index tables and upload them to the device.
//  2. QPROP     map the propagator file "q_nx48_nt144", convert the first
//               g_temp_nt time slices from big-endian split real/imaginary
//               storage into site-major complex values, upload them.
//  3. CUKER     launch cuKer_sum with one block per (time slice, site).
//  4. FINAL SUM copy the per-site results back and add them up per slice.
//  5. CUFREE    release device and host memory.
//
// Each phase prints its duration measured with clock(), which reports
// processor time rather than wall-clock time. Every CUDA call is checked
// with gpuErrchk, so the first failure aborts with file and line.
int main(int argc, char *argv[]) {

    clock_t code_start, code_end;
    code_start = clock();

    clock_t ini_start, ini_end;
    ini_start = clock();

    //START INIT//
        printf("\nINIT\n");

        // Small throw-away device allocation; presumably here so that CUDA
        // context creation is charged to the INIT timing. It must be a
        // pointer: cudaMalloc stores a full device pointer through its first
        // argument.
        int *d_trial = NULL;
        gpuErrchk(cudaMalloc((void **)&d_trial, sizeof(int)));

        //READ INDEX FILES//

            int *where = (int*)checked_malloc(g_count_imp_symm * sizeof(int), "where");
            float *deg = (float*)checked_malloc(g_count_imp_symm * sizeof(float), "deg");

            f_read_imp_symm(where, deg);

            int *deg_ind = (int*)checked_malloc(12 * g_count_deg_index * sizeof(int), "deg_ind");
            float *deg_c = (float*)checked_malloc(1 * g_count_deg_index * sizeof(float), "deg_c");

            f_read_deg_index(deg_ind, deg_c);

            int *deg_where_d = (int*)checked_malloc(g_count_imp_count * sizeof(int), "deg_where_d");
            int *deg_len = (int*)checked_malloc(g_count_imp_count * sizeof(int), "deg_len");
            int *start_deg = (int*)checked_malloc(g_count_imp_count * sizeof(int), "start_deg");

            f_read_imp_count(deg_where_d, deg_len, start_deg);


            // device variable independent of lattice index
            int *d_deg_ind;
            int *d_deg_where_d;
            int *d_deg_len;
            int *d_where;
            int *d_start_deg;

            float *d_deg_c;
            float *d_deg;

            gpuErrchk(cudaMalloc((void **)&d_deg_ind, sizeof(int) * 12 * g_count_deg_index));
            gpuErrchk(cudaMalloc((void **)&d_deg_where_d, sizeof(int) * g_count_imp_count));
            gpuErrchk(cudaMalloc((void **)&d_deg_len, sizeof(int) * g_count_imp_count));
            gpuErrchk(cudaMalloc((void **)&d_where, sizeof(int) * g_count_imp_symm));
            gpuErrchk(cudaMalloc((void **)&d_start_deg, sizeof(int) * g_count_imp_count));

            gpuErrchk(cudaMalloc((void **)&d_deg_c, sizeof(float) * g_count_deg_index));
            gpuErrchk(cudaMalloc((void **)&d_deg, sizeof(float) * g_count_imp_symm));

            gpuErrchk(cudaMemcpy(d_deg_len, deg_len,
                                 sizeof(int) * g_count_imp_count,
                                 cudaMemcpyHostToDevice));
            gpuErrchk(cudaMemcpy(d_deg_where_d, deg_where_d,
                                 sizeof(int) * g_count_imp_count,
                                 cudaMemcpyHostToDevice));
            gpuErrchk(cudaMemcpy(d_where, where,
                                 sizeof(int) * g_count_imp_symm,
                                 cudaMemcpyHostToDevice));

            gpuErrchk(cudaMemcpy(d_deg_ind, deg_ind,
                                 sizeof(int) * 12 * g_count_deg_index,
                                 cudaMemcpyHostToDevice));
            gpuErrchk(cudaMemcpy(d_start_deg, start_deg,
                                 sizeof(int) * g_count_imp_count,
                                 cudaMemcpyHostToDevice));
            gpuErrchk(cudaMemcpy(d_deg_c, deg_c,
                                 sizeof(float) * g_count_deg_index,
                                 cudaMemcpyHostToDevice));
            gpuErrchk(cudaMemcpy(d_deg, deg,
                                 sizeof(float) * g_count_imp_symm,
                                 cudaMemcpyHostToDevice));


        ini_end = clock();
        double ini_time = (double)(ini_end - ini_start) / CLOCKS_PER_SEC;

        printf("INIT %f s \n", ini_time);
    //END INIT//


    //START NT_LOOP//
        clock_t ntloop_start, ntloop_end;
        ntloop_start = clock();
        printf("NT_LOOP\n");


        //START QPROP//
            clock_t qprop_start, qprop_end;
            qprop_start = clock();

            // 64-bit element count: the product no longer fits in an int once
            // g_temp_nt is raised towards g_nt.
            const uint64_t len_tqprop = g_temp_nt * g_nx * g_ny * g_nz *
                                        g_nc * g_nd * g_nc * g_nd;

            double complex *tqprop =
                (double complex*)checked_malloc(len_tqprop * sizeof(double complex),
                                                "tqprop");

            memset(tqprop, 0, len_tqprop * sizeof(double complex));


            //START READ PROP//
                int fd = open("q_nx48_nt144", O_RDONLY);
                if (fd < 0) {
                    perror("open");
                    exit(EXIT_FAILURE);
                }

                void *prop;

                prop = mmap(NULL, NRI * LEN_PROP_T * sizeof(double),
                            PROT_READ, MAP_PRIVATE, fd, 0);

                if (prop == MAP_FAILED) {
                    perror("mmap");
                    exit(EXIT_FAILURE);
                }

                double* prop_double = (double*) prop;  // cast void pointer to double pointer

                // Distance in the file between the real and the imaginary
                // plane that belong to the same (n, m) pair.
                const uint64_t part_stride = PDIM * ADIM * T * XDIM;
                // Complex values stored per lattice site in tqprop.
                const uint64_t site_len = ADIM * PDIM * ADIM * PDIM;

                uint64_t idx, idx_re, idx_im;
                uint64_t i, j, k, l, m, n;

                for (j = 0; j < g_temp_nt; j++) {
                    for (i = 0; i < XDIM; i++) {
                        for (m = 0; m < ADIM; m++) { //iic
                            for (n = 0; n < PDIM; n++) { //iid
                                for (k = 0; k < ADIM; k++) { //ifc
                                    for (l = 0; l < PDIM; l++) { //ifd
                                        // Destination: site-major, then m, n, k, l.
                                        idx =   j * XDIM * site_len +
                                                i * site_len +
                                                m * PDIM * ADIM * PDIM +
                                                n * ADIM * PDIM +
                                                k * PDIM +
                                                l;

                                        // Source: real part; the imaginary
                                        // part sits one part_stride further.
                                        idx_re =    n * ADIM * NRI * part_stride +
                                                    m * NRI * part_stride +
                                                    l * ADIM * T * XDIM +
                                                    k * T * XDIM +
                                                    j * XDIM +
                                                    i;
                                        idx_im = idx_re + part_stride;

                                        tqprop[idx] = big_to_little(prop_double[idx_re]) + big_to_little(prop_double[idx_im]) * I ;
                                    }
                                }
                            }
                        }
                    }
                }

                munmap(prop, NRI * LEN_PROP_T * sizeof(double));
                close(fd);
            //END READ PROP//


            cuDoubleComplex *d_tqprop;
            gpuErrchk(cudaMalloc(&d_tqprop, len_tqprop * sizeof(cuDoubleComplex)));
            gpuErrchk(cudaMemcpy(d_tqprop, tqprop,
                                 len_tqprop * sizeof(cuDoubleComplex),
                                 cudaMemcpyHostToDevice));

            // The host copy is not needed once the data is on the device.
            free(tqprop);

            qprop_end = clock();
            double qprop_time = (double)(qprop_end - qprop_start) / CLOCKS_PER_SEC;

            printf("-- QPROP %f s \n", qprop_time);
        //END QPROP//

        //START INIT_SUM//
            clock_t ini_sum_start, ini_sum_end;
            ini_sum_start = clock();

            double complex *sum_nt =
                (double complex*)checked_malloc(g_temp_nt * sizeof(double complex),
                                                "sum_nt");
            memset(sum_nt, 0, g_temp_nt * sizeof(double complex));

            double complex *sum_nxyz =
                (double complex*)checked_malloc(g_temp_nt * g_nxyz * sizeof(double complex),
                                                "sum_nxyz");
            memset(sum_nxyz, 0, g_temp_nt * g_nxyz * sizeof(double complex));

            cuDoubleComplex *d_sum_nxyz;
            gpuErrchk(cudaMalloc(&d_sum_nxyz, g_temp_nt * g_nxyz *
                                              sizeof(cuDoubleComplex)));
            // Pass the device pointer itself, not the address of the host
            // variable that holds it.
            gpuErrchk(cudaMemset(d_sum_nxyz, 0, g_temp_nt * g_nxyz *
                                                sizeof(cuDoubleComplex)));

            ini_sum_end = clock();
            double ini_sum_time = (double)(ini_sum_end - ini_sum_start) / CLOCKS_PER_SEC;
            printf("-- INIT SUM %f s \n", ini_sum_time);
        //END INIT_SUM//


        //START CUKER//
            clock_t tdet_gpu, tdet_gpu_end;
            tdet_gpu = clock();

            dim3 block(g_threads);
            dim3 grid(g_nxyz * g_temp_nt);

            //START CUKER_SUM//
                printf("Going inside cuKer");
                cuKer_sum <<< grid, block >>> (d_tqprop, d_sum_nxyz,
                                               d_deg_ind, d_deg_where_d,
                                               d_deg_len, d_where, d_start_deg,
                                               d_deg_c, d_deg);
                // Launch-configuration errors surface here; faults raised
                // while the kernel runs surface at the synchronise call.
                gpuErrchk(cudaGetLastError());
                gpuErrchk(cudaDeviceSynchronize());
            //END CUKER_SUM//

            tdet_gpu_end = clock();
            double tt_det_gpu = (double)(tdet_gpu_end - tdet_gpu) / CLOCKS_PER_SEC;

            printf("-- CUKER %f s \n", tt_det_gpu);
        //END CUKER//


        //START FINAL SUM//
            clock_t final_sum_start, final_sum_end;
            final_sum_start = clock();

            gpuErrchk(cudaMemcpy(sum_nxyz, d_sum_nxyz,
                                 g_temp_nt * g_nxyz * sizeof(double complex),
                                 cudaMemcpyDeviceToHost));

            // Add the per-site results of each time slice.
            for (uint64_t sit = 0; sit < g_temp_nt; sit++) {
                for (uint64_t si = 0; si < g_nxyz; si++) {
                    sum_nt[sit] = sum_nt[sit] + sum_nxyz[si + sit * g_nxyz];
                }
            }

            for (uint64_t sit = 0; sit < g_temp_nt; sit++) {
                printf("nt %d \t sum %.16E,%.16E \n", (int)sit,
                                                      creal(sum_nt[sit]),
                                                      cimag(sum_nt[sit]));
            }

            free(sum_nxyz);
            free(sum_nt);

            final_sum_end = clock();
            double final_sum_time = (double)(final_sum_end - final_sum_start) / CLOCKS_PER_SEC;
            printf("-- FINAL SUM %f s \n", final_sum_time);
        //END FINAL SUM//


        ntloop_end = clock();
        double ntloop_time = (double)(ntloop_end - ntloop_start) / CLOCKS_PER_SEC;
        printf("NT_LOOP %f s \n", ntloop_time);
    //END NT_LOOP//


    //START CUFREE//
        clock_t cufree_start, cufree_end;
        cufree_start = clock();

        gpuErrchk(cudaFree(d_deg_ind));
        gpuErrchk(cudaFree(d_deg_where_d));
        gpuErrchk(cudaFree(d_deg_len));
        gpuErrchk(cudaFree(d_where));
        gpuErrchk(cudaFree(d_start_deg));

        gpuErrchk(cudaFree(d_deg_c));
        gpuErrchk(cudaFree(d_deg));

        gpuErrchk(cudaFree(d_tqprop));
        gpuErrchk(cudaFree(d_sum_nxyz));
        gpuErrchk(cudaFree(d_trial));

        cufree_end = clock();

        double cufree_time = (double)(cufree_end - cufree_start) / CLOCKS_PER_SEC;
        printf("CUFREE %f s \n", cufree_time);
    //END CUFREE//

    // Host copies of the index tables.
    free(where);
    free(deg);
    free(deg_ind);
    free(deg_c);
    free(deg_where_d);
    free(deg_len);
    free(start_deg);


    code_end = clock();
    //PRINT CPU TIME//

    double code_time = (double)(code_end - code_start) / CLOCKS_PER_SEC;
    printf("TOTAL EXEC: %.5f s \n", code_time);


    //PRINT CU_DEV_RESET//
        clock_t cu_reset_start, cu_reset_end;
        cu_reset_start = clock();

        gpuErrchk(cudaDeviceReset());

        cu_reset_end = clock();

        double cu_reset_time = (double)(cu_reset_end - cu_reset_start) / CLOCKS_PER_SEC;
        printf("CU_DEV_RESET: %4.9f s \n", cu_reset_time);


    return 0;

}    

np.cuh

#ifndef __NP_CUH_
#define __NP_CUH_

//for memset
#include <cstring>

#include <cuda.h>
#include <cuComplex.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>

#include "read_files.h"



#endif

read_files.h

#ifndef __READ_FILES_H_
#define __READ_FILES_H_

#include <iostream>
#include <stdio.h>
#include <complex.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <time.h>
#include <ctime>
#include <stdint.h>

#include <assert.h>
#include <cassert>

#include <sys/mman.h>

#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>

#include <inttypes.h>


// Problem-size constants used by the reader functions in this header.
// They are only declared here; np.cu provides the definitions.
extern const uint64_t g_count_imp_symm;
extern const uint64_t g_count_deg_index;
extern const uint64_t g_count_imp_count;

extern const uint64_t g_nt;
extern const uint64_t g_nx;
extern const uint64_t g_ny;
extern const uint64_t g_nz;
extern const uint64_t g_nxyz;
extern const uint64_t g_nc;
extern const uint64_t g_nd;

// NOTE(review): no definition of msp or g_msp is visible in np.cu, and `msp`
// is also #defined as a macro later in this header -- confirm whether these
// two declarations are still needed.
extern const uint64_t msp;
extern const uint64_t g_msp;



// Global parameters and file handles.
// NOTE(review): none of these appear to be referenced by the visible np.cu
// code. Because they are defined (not just declared) in a header, including
// this header from more than one translation unit would produce duplicate
// definitions.
int U = 3, D = 3;
int itl = 144, nc = 3, ns = 4, nri = 2, mdim = 4;
int nd = 4;

int nt = 144, nx = 48, ny = 48, nz = 48;

#define mx 48
#define my 48
#define mz 48
#define mt 144
// Products are parenthesised so that they act as a single value inside larger
// expressions: without the parentheses `a / msp` would expand to
// `a / mx * my * mz`.
#define msp (mx*my*mz)
#define ncs (nc*ns)



double  ******q;
FILE *quark1;

// Readers for the three index tables; defined further down in this header.
void f_read_imp_symm(int *where, float *deg);
void f_read_deg_index(int *deg_ind, float *deg_c);
// Takes three arrays, matching the definition below and the call in main().
void f_read_imp_count(int *deg_where_d, int *deg_len, int *start_deg);

// Declared here, but no definition is visible in this header.
void f_read_prop();
void f_prop_compute(int a);
double f_read_8B_double ( FILE* fp);

// Reads g_count_imp_symm records from "imp_symm.txt" in the current
// directory. Record i is an integer followed by a float; they are stored in
// where[i] and deg[i]. Both arrays must hold at least g_count_imp_symm
// elements.
// Terminates the program with a message when the file cannot be opened or a
// record cannot be parsed, so the output arrays are never left partly filled.
void f_read_imp_symm(int *where, float *deg) {

    FILE *fptr1 = fopen("imp_symm.txt", "r");
    if (fptr1 == NULL) {
        perror("imp_symm.txt");
        exit(EXIT_FAILURE);
    }

    for (uint64_t i = 0; i < g_count_imp_symm; i++) {
        // Each fscanf call must convert exactly one value.
        if (fscanf(fptr1, "%d\n", &(where[i])) != 1 ||
            fscanf(fptr1, "%f\n", &(deg[i])) != 1) {
            fprintf(stderr, "imp_symm.txt: cannot parse record %llu\n",
                    (unsigned long long)i);
            fclose(fptr1);
            exit(EXIT_FAILURE);
        }
    }
    fclose(fptr1);


}

// Reads g_count_deg_index rows from "deg_index.txt" in the current directory.
// Row i holds twelve integers followed by one float: the integers go to
// deg_ind[12*i .. 12*i + 11] and the float to deg_c[i]. deg_ind must hold
// 12 * g_count_deg_index ints and deg_c g_count_deg_index floats.
// Terminates the program with a message when the file cannot be opened or a
// row cannot be parsed completely.
void f_read_deg_index(int *deg_ind, float *deg_c) {
    FILE *fptr2 = fopen("deg_index.txt", "r");
    if (fptr2 == NULL) {
        perror("deg_index.txt");
        exit(EXIT_FAILURE);
    }

    for (uint64_t i = 0; i < g_count_deg_index; i++) {
        int *row = deg_ind + 12 * i;
        const int converted =
            fscanf(fptr2, "%d %d %d %d %d %d %d %d %d %d %d %d %f\n",
                   &row[0], &row[1], &row[2],  &row[3],
                   &row[4], &row[5], &row[6],  &row[7],
                   &row[8], &row[9], &row[10], &row[11],
                   &(deg_c[i]));
        // All 13 fields must be converted for the row to be valid.
        if (converted != 13) {
            fprintf(stderr, "deg_index.txt: cannot parse row %llu\n",
                    (unsigned long long)i);
            fclose(fptr2);
            exit(EXIT_FAILURE);
        }
    }
    fclose(fptr2);


}

// Reads g_count_imp_count records from "imp_count.txt" in the current
// directory. Record i is two integers, stored in deg_where_d[i] and
// deg_len[i]. Afterwards start_deg[i] is set to the sum of deg_len[0 .. i-1]
// (an exclusive prefix sum), i.e. the offset of record i's first entry in a
// list that concatenates deg_len[i] entries per record.
// All three arrays must hold at least g_count_imp_count elements.
// Terminates the program with a message when the file cannot be opened or a
// record cannot be parsed.
void f_read_imp_count(int *deg_where_d, int *deg_len, int *start_deg) {
    FILE *fptr2 = fopen("imp_count.txt", "r");
    if (fptr2 == NULL) {
        perror("imp_count.txt");
        exit(EXIT_FAILURE);
    }

    for (uint64_t i = 0; i < g_count_imp_count; i++) {
        if (fscanf(fptr2, "%d\n%d\n",
                   &(deg_where_d[i]), &(deg_len[i])) != 2) {
            fprintf(stderr, "imp_count.txt: cannot parse record %llu\n",
                    (unsigned long long)i);
            fclose(fptr2);
            exit(EXIT_FAILURE);
        }
    }
    fclose(fptr2);


    // Running total of the lengths seen so far.
    int sum = 0;
    for (uint64_t i = 0; i < g_count_imp_count; i++)
    {
        start_deg[i] = sum;
        sum = sum + deg_len[i];
    }


}





#endif

我希望减少 CUDA 内核

cuKer_sum
cuKer_det
中消耗的时间,使其低于
-- CUKER 5.390000 s
(对应当前的
g_temp_nt
)。

现在

-- CUKER
的耗时随着
g_temp_nt
的增加而线性增长。也就是说,即使
g_temp_nt
的每个取值都有各自独立的线程块,总耗时仍然像串行执行一样增长。

我认为一个高效的实现,其耗时不应该随着

g_temp_nt
而增长。在这段代码中,我们最终想将当前设置为 2 的
g_temp_nt
更改为
g_nt = 144

我试过上面的架构。上面提供了完整的代码。

非常感谢任何关于改进架构或良好编码实践技巧的建议。

c++ c performance cuda
© www.soinside.com 2019 - 2024. All rights reserved.