我有一个 HPCCG 源文件,一直在尝试编译它。我想用 Intel oneAPI 编译器进行编译,编译成功完成了,但运行结果表明计时器实现不起作用。我注意到改用 g++ 编译器时它可以正常工作。计时器实现的源码如下:
//@HEADER
// ************************************************************************
//
// HPCCG: Simple Conjugate Gradient Benchmark Code
// Copyright (2006) Sandia Corporation
//
// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
// license for use of this work by or on behalf of the U.S. Government.
//
// BSD 3-Clause License
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice, this
// list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * Neither the name of the copyright holder nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Michael A. Heroux ([email protected])
//
// ************************************************************************
//@HEADER
/////////////////////////////////////////////////////////////////////////
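// Function to return time in seconds.
// If compiled with no flags, returns user CPU time (ru_utime from getrusage).
// If compiled with -DWALL, returns elapsed time since the first call.
// -DUSING_MPI, -DUseClock and -DUseTimes select the other implementations below.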
/////////////////////////////////////////////////////////////////////////
#ifdef USING_MPI
#include <mpi.h> // If this routine is compiled with -DUSING_MPI
// then include mpi.h
// Timer used when built with -DUSING_MPI.
// Returns the value reported by MPI_Wtime(), unchanged.
double mytimer(void)
{
  const double now = MPI_Wtime();
  return now;
}
#elif defined(UseClock)
#include <time.h> // clock(), clock_t, CLOCKS_PER_SEC
// Timer used when built with -DUseClock.
// Returns the processor time reported by clock(), in seconds, measured
// relative to the first call of this function. The first call records the
// baseline, so it returns a value at (or very near) zero.
double mytimer(void)
{
  // A separate flag marks whether the baseline has been taken: 0 is a
  // legitimate clock() value, so it must not double as the "unset" marker,
  // otherwise the baseline could be re-sampled on a later call.
  static int have_baseline = 0;
  static clock_t t0 = 0;
  static const double CPS = CLOCKS_PER_SEC;

  if (!have_baseline)
  {
    t0 = clock();
    have_baseline = 1;
  }

  const clock_t elapsed = clock() - t0;
  return elapsed / CPS;
}
#elif defined(WALL)
#include <cstdlib>
#include <sys/time.h>
#include <sys/resource.h>
// Timer used when built with -DWALL.
// Returns the time elapsed, in seconds, since the first call, based on
// gettimeofday(). The first call stores the reference time and returns 0.0.
double mytimer(void)
{
  // Reference time captured on the first call; base_sec == 0 means "not set".
  static long base_sec = 0, base_usec;
  struct timeval now;

  gettimeofday(&now, NULL);
  if (base_sec == 0)
  {
    base_sec = now.tv_sec;
    base_usec = now.tv_usec;
    return 0.0;
  }

  // Whole seconds and the microsecond remainder are converted separately;
  // the microsecond difference may be negative and then reduces the total.
  const double whole = (double) (now.tv_sec - base_sec);
  const double frac = (now.tv_usec - base_usec) / 1000000.0;
  return whole + frac;
}
#elif defined(UseTimes)
#include <cstdlib>
#include <sys/times.h>
#include <unistd.h>
// Timer used when built with -DUseTimes.
// Returns the tms_utime value from times(), converted from clock ticks to
// seconds using sysconf(_SC_CLK_TCK).
double mytimer(void)
{
  // Ticks-per-second is looked up on the first call and cached afterwards.
  static double ticks_per_sec = 0.0;
  struct tms cpu;

  if (ticks_per_sec == 0.0)
  {
    ticks_per_sec = (double) sysconf(_SC_CLK_TCK);
  }

  times(&cpu);
  const double user_ticks = (double) cpu.tms_utime;
  return user_ticks / ticks_per_sec;
}
#else
#include <cstdlib>
#include <sys/time.h>
#include <sys/resource.h>
// Default timer (no selector macro defined).
// Returns the ru_utime field reported by getrusage(RUSAGE_SELF), expressed
// in seconds as a double.
double mytimer(void)
{
  struct rusage usage;
  getrusage(RUSAGE_SELF, &usage);

  // Microseconds become a fractional number of seconds before being added
  // to the whole-second part.
  const double micro = usage.ru_utime.tv_usec / 1000000.0;
  return (double) (usage.ru_utime.tv_sec + micro);
}
#endif
我尝试用 icpx 编译它,这些是我的结果:
#********** Performance Summary (times in sec) ***********:
Time Summary:
Total : 0
DDOT : 0
WAXPBY : 0
SPARSEMV: 0
FLOPS Summary:
Total : 9.536e+06
DDOT : 596000
WAXPBY : 894000
SPARSEMV: 8.046e+06
MFLOPS Summary:
Total : inf
DDOT : inf
WAXPBY : inf
SPARSEMV: inf
当我将编译器更改为 g++ 时,结果如下:
#********** Performance Summary (times in sec) ***********:
Time Summary:
Total : 0.018251
DDOT : 0.001142
WAXPBY : 0.001825
SPARSEMV: 0.014805
FLOPS Summary:
Total : 9.536e+06
DDOT : 596000
WAXPBY : 894000
SPARSEMV: 8.046e+06
MFLOPS Summary:
Total : 522.492
DDOT : 521.891
WAXPBY : 489.863
SPARSEMV: 543.465
我确认我可以使用 oneAPI 编译器重现该问题,直至版本 2023.2.3。 2024.0.0 版本已修复。因此,您可以将编译器更新到该版本,或者通过在 Makefile 中使用适当的宏来选择另一个计时器。
注意,如果您使用 MPI,则 Makefile 已设置为自动选择最佳计时器(MPI_Wtime())。
但是,将 -DWALL 开关添加到编译器选项中,即可获得默认计时器的绝佳替代方案。
$ module load intel/oneapi/2023.2.3
$ make
icpx -O3 -ftree-vectorize -DWALL -c -o main.o main.cpp
icpx -O3 -ftree-vectorize -DWALL -c -o generate_matrix.o generate_matrix.cpp
icpx -O3 -ftree-vectorize -DWALL -c -o read_HPC_row.o read_HPC_row.cpp
icpx -O3 -ftree-vectorize -DWALL -c -o compute_residual.o compute_residual.cpp
icpx -O3 -ftree-vectorize -DWALL -c -o mytimer.o mytimer.cpp
icpx -O3 -ftree-vectorize -DWALL -c -o dump_matlab_matrix.o dump_matlab_matrix.cpp
icpx -O3 -ftree-vectorize -DWALL -c -o HPC_sparsemv.o HPC_sparsemv.cpp
icpx -O3 -ftree-vectorize -DWALL -c -o HPCCG.o HPCCG.cpp
icpx -O3 -ftree-vectorize -DWALL -c -o waxpby.o waxpby.cpp
icpx -O3 -ftree-vectorize -DWALL -c -o ddot.o ddot.cpp
icpx -O3 -ftree-vectorize -DWALL -c -o make_local_matrix.o make_local_matrix.cpp
icpx -O3 -ftree-vectorize -DWALL -c -o exchange_externals.o exchange_externals.cpp
icpx -O3 -ftree-vectorize -DWALL -c -o YAML_Element.o YAML_Element.cpp
icpx -O3 -ftree-vectorize -DWALL -c -o YAML_Doc.o YAML_Doc.cpp
icpx -O3 -ftree-vectorize -DWALL main.o generate_matrix.o read_HPC_row.o compute_residual.o mytimer.o dump_matlab_matrix.o HPC_sparsemv.o HPCCG.o waxpby.o ddot.o make_local_matrix.o exchange_externals.o YAML_Element.o YAML_Doc.o -lm -o test_HPCCG
$ ./test_HPCCG 64 64 64
Initial Residual = 1654.81
Iteration = 15 Residual = 19.7523
Iteration = 30 Residual = 0.107645
Iteration = 45 Residual = 0.000229062
Iteration = 60 Residual = 4.03876e-07
Iteration = 75 Residual = 4.4229e-10
Iteration = 90 Residual = 5.59501e-13
Iteration = 105 Residual = 3.9836e-16
Iteration = 120 Residual = 1.75613e-19
Iteration = 135 Residual = 5.38692e-22
Iteration = 149 Residual = 7.18448e-25
Mini-Application Name: hpccg
Mini-Application Version: 1.0
Parallelism:
MPI not enabled:
OpenMP not enabled:
Dimensions:
nx: 64
ny: 64
nz: 64
Number of iterations: 149
Final residual: 7.18448e-25
#********** Performance Summary (times in sec) ***********:
Time Summary:
Total : 0.67078
DDOT : 0.024968
WAXPBY : 0.066434
SPARSEMV: 0.579066
FLOPS Summary:
Total : 2.49981e+09
DDOT : 1.56238e+08
WAXPBY : 2.34357e+08
SPARSEMV: 2.10921e+09
MFLOPS Summary:
Total : 3726.71
DDOT : 6257.52
WAXPBY : 3527.66
SPARSEMV: 3642.44