diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml
new file mode 100644
index 0000000..a92b491
--- /dev/null
+++ b/.github/workflows/cmake.yml
@@ -0,0 +1,51 @@
+name: cmake
+
+on:
+  push:
+    paths:
+      - "**.c"
+      - "**.f"
+      - "**/CMakeLists.txt"
+      - ".github/workflows/cmake.yml"
+
+env:
+  CTEST_NO_TESTS_ACTION: error
+
+
+jobs:
+
+  unix:
+    timeout-minutes: 5
+
+    strategy:
+      matrix:
+        cc: [gcc-12, clang]
+        os: [ubuntu-latest, macos-latest]
+
+    runs-on: ${{ matrix.os }}
+
+    env:
+      CC: ${{ matrix.cc }}
+      FC: gfortran-12
+
+    steps:
+    - uses: actions/checkout@v4
+
+    - run: cmake -B build
+
+    - run: cmake --build build --parallel
+
+    - run: ctest --test-dir build -V
+
+
+  windows-msvc:
+    runs-on: windows-latest
+
+    steps:
+    - uses: actions/checkout@v4
+
+    - run: cmake -G "Visual Studio 17 2022" -B build -Dfortran=no
+
+    - run: cmake --build build --parallel --config Release
+
+    - run: ctest --test-dir build -V -C Release
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..b29ea54
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,79 @@
+cmake_minimum_required(VERSION 3.15)
+
+project(
+  STREAM
+  VERSION 1.0
+  DESCRIPTION "STREAM benchmark"
+  LANGUAGES C)
+
+enable_testing()
+
+# Fortran is optional: configure with -Dfortran=OFF to build only stream_c.
+option(fortran "build fortran version" ON)
+
+if(fortran)
+  enable_language(Fortran)
+endif()
+
+# --- C compile options
+# $<COMPILE_LANGUAGE:C> keeps these flags off the Fortran compile line.
+if(CMAKE_C_COMPILER_ID MATCHES "(GNU|Clang)")
+  add_compile_options("$<$<COMPILE_LANGUAGE:C>:-O3;-march=native;-Wall>")
+elseif(CMAKE_C_COMPILER_ID MATCHES "Intel")
+  add_compile_options("$<$<COMPILE_LANGUAGE:C>:-O3;-Wall>")
+  if(WIN32)
+    add_compile_options("$<$<COMPILE_LANGUAGE:C>:/QxHost>")
+  else()
+    add_compile_options("$<$<COMPILE_LANGUAGE:C>:-xHost>")
+  endif()
+elseif(CMAKE_C_COMPILER_ID STREQUAL "MSVC")
+  add_compile_options("$<$<COMPILE_LANGUAGE:C>:/W3>")
+endif()
+
+# --- Fortran compile options
+# Selected by the Fortran compiler ID, which can differ from the C compiler
+# (e.g. Clang with gfortran). Neither branch is taken when Fortran is disabled.
+if(CMAKE_Fortran_COMPILER_ID STREQUAL "GNU")
+  add_compile_options("$<$<COMPILE_LANGUAGE:Fortran>:-O3;-march=native;-Werror=line-truncation;-Wall>")
+elseif(CMAKE_Fortran_COMPILER_ID MATCHES "Intel")
+  add_compile_options("$<$<COMPILE_LANGUAGE:Fortran>:-O3;-warn>")
+  if(WIN32)
+    add_compile_options("$<$<COMPILE_LANGUAGE:Fortran>:/QxHost>")
+  else()
+    add_compile_options("$<$<COMPILE_LANGUAGE:Fortran>:-xHost>")
+  endif()
+endif()
+
+# --- OpenMP (optional)
+# If OpenMP support is found, link it to the executables; otherwise the
+# benchmarks are built without it.
+# Note that if you are using clang on macOS, you will need to
+# install libomp via Homebrew and then set the following
+# environment variable:
+#   export OpenMP_ROOT=$(brew --prefix)/opt/libomp
+# see https://www.scivision.dev/cmake-openmp/ for more details
+
+find_package(OpenMP COMPONENTS C Fortran)
+
+# --- C stream_c
+
+add_executable(stream_c stream.c)
+target_link_libraries(stream_c PRIVATE $<$<BOOL:${OpenMP_C_FOUND}>:OpenMP::OpenMP_C>)
+
+add_test(NAME STREAM_C COMMAND stream_c)
+
+# --- Fortran stream_f
+
+if(fortran)
+  add_executable(stream_f stream.f)
+  target_link_libraries(stream_f PRIVATE $<$<BOOL:${OpenMP_Fortran_FOUND}>:OpenMP::OpenMP_Fortran>)
+
+  add_test(NAME STREAM_Fortran COMMAND stream_f)
+endif()
+
+# --- ignore build directory
+# For out-of-source builds, write a catch-all .gitignore into the build tree
+# (a relative OUTPUT path is resolved against the binary directory).
+if(NOT PROJECT_SOURCE_DIR STREQUAL PROJECT_BINARY_DIR)
+  file(GENERATE OUTPUT .gitignore CONTENT "*")
+endif()
diff --git a/Makefile b/Makefile
index 7746d86..518b758 100644
--- a/Makefile
+++ b/Makefile
@@ -6,10 +6,9 @@ FFLAGS = -O2 -fopenmp
 
 all: stream_f.exe stream_c.exe
 
-stream_f.exe: stream.f mysecond.o
-	$(CC) $(CFLAGS) -c mysecond.c
+stream_f.exe: stream.f
 	$(FC) $(FFLAGS) -c stream.f
-	$(FC) $(FFLAGS) stream.o mysecond.o -o stream_f.exe
+	$(FC) $(FFLAGS) stream.o -o stream_f.exe
 
 stream_c.exe: stream.c
 	$(CC) $(CFLAGS) stream.c -o stream_c.exe
diff --git a/mysecond.c b/mysecond.c
deleted file mode 100644
index d206a4a..0000000
--- a/mysecond.c
+++ /dev/null
@@ -1,27 +0,0 @@
-/* A gettimeofday routine to give access to the wall
-   clock timer on most UNIX-like systems.
- - This version defines two entry points -- with - and without appended underscores, so it *should* - automagically link with FORTRAN */ - -#include <sys/time.h> - -double mysecond() -{ -/* struct timeval { long tv_sec; - long tv_usec; }; - -struct timezone { int tz_minuteswest; - int tz_dsttime; }; */ - - struct timeval tp; - struct timezone tzp; - int i; - - i = gettimeofday(&tp,&tzp); - return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 ); -} - -double mysecond_() {return mysecond();} - diff --git a/stream.c b/stream.c index 9bbd6ce..c5df71b 100644 --- a/stream.c +++ b/stream.c @@ -41,12 +41,19 @@ /* 5. Absolutely no warranty is expressed or implied. */ /*-----------------------------------------------------------------------*/ # include <stdio.h> -# include <unistd.h> # include <math.h> # include <float.h> -# include <limits.h> + +#ifdef _MSC_VER +#define WIN32_LEAN_AND_MEAN +#include <windows.h> +#define ssize_t ptrdiff_t +#define CLOCK_MONOTONIC 0 +#else # include <sys/time.h> +#endif +#include <time.h> /*----------------------------------------------------------------------- * INSTRUCTIONS: * @@ -62,13 +69,13 @@ * Example 1: One Xeon E3 with 8 MB L3 cache * STREAM_ARRAY_SIZE should be >= 4 million, giving * an array size of 30.5 MB and a total memory requirement - * of 91.5 MB. + * of 91.5 MB. * Example 2: Two Xeon E5's with 20 MB L3 cache each (using OpenMP) * STREAM_ARRAY_SIZE should be >= 20 million, giving * an array size of 153 MB and a total memory requirement - * of 458 MB. + * of 458 MB. * (b) The size should be large enough so that the 'timing calibration' - * output by the program is at least 20 clock-ticks. + * output by the program is at least 20 clock-ticks. * Example: most versions of Windows have a 10 millisecond timer * granularity. 20 "ticks" at 10 ms/tic is 200 milliseconds. * If the chip is capable of 10 GB/s, it moves 2 GB in 200 msec. @@ -77,7 +84,7 @@ * Version 5.10 increases the default array size from 2 million * elements to 10 million elements in response to the increasing * size of L3 caches. 
The new default size is large enough for caches - * up to 20 MB. + * up to 20 MB. * Version 5.10 changes the loop index variables from "register int" * to "ssize_t", which allows array indices >2^32 (4 billion) * on properly configured 64-bit systems. Additional compiler options @@ -113,8 +120,8 @@ #endif /* Users are allowed to modify the "OFFSET" variable, which *may* change the - * relative alignment of the arrays (though compilers may change the - * effective offset by making the arrays non-contiguous on some systems). + * relative alignment of the arrays (though compilers may change the + * effective offset by making the arrays non-contiguous on some systems). * Use of non-zero values for OFFSET can be especially helpful if the * STREAM_ARRAY_SIZE is set to a value close to a large power of 2. * OFFSET can also be set on the compile line without changing the source @@ -126,7 +133,7 @@ /* * 3) Compile the code with optimization. Many compilers generate - * unreasonably bad code before the optimizer tightens things up. + * unreasonably bad code before the optimizer tightens things up. * If the results are unreasonably good, on the other hand, the * optimizer might be too smart for me! * @@ -137,7 +144,7 @@ * To use multiple cores, you need to tell the compiler to obey the OpenMP * directives in the code. This varies by compiler, but a common example is * gcc -O -fopenmp stream.c -o stream_omp - * The environment variable OMP_NUM_THREADS allows runtime control of the + * The environment variable OMP_NUM_THREADS allows runtime control of the * number of threads/cores used when the resulting "stream_omp" program * is executed. * @@ -146,9 +153,9 @@ * to the compile line. * Note that this changes the minimum array sizes required --- see (1) above. * - * The preprocessor directive "TUNED" does not do much -- it simply causes the + * The preprocessor directive "TUNED" does not do much -- it simply causes the * code to call separate functions to execute each kernel. 
Trivial versions - * of these functions are provided, but they are *not* tuned -- they just + * of these functions are provided, but they are *not* tuned -- they just * provide predefined interfaces to be replaced with tuned code. * * @@ -193,7 +200,26 @@ static double bytes[4] = { 3 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE }; -extern double mysecond(); +#ifdef _MSC_VER +int clock_gettime(int dummy, struct timespec *spec) +{ + /* https://stackoverflow.com/a/31335254 */ + __int64 wintime; + GetSystemTimeAsFileTime((FILETIME*)&wintime); + wintime -=116444736000000000i64; //1jan1601 to 1jan1970 + spec->tv_sec =wintime / 10000000i64; //seconds + spec->tv_nsec =wintime % 10000000i64 *100; //nano-seconds + return 0; +} +#endif + +double mysecond() +{ + struct timespec tic; + clock_gettime(CLOCK_MONOTONIC, &tic); + return tic.tv_sec + tic.tv_nsec * 1e-9; +} + extern void checkSTREAMresults(); #ifdef TUNED extern void tuned_STREAM_Copy(); @@ -233,19 +259,19 @@ main() #endif printf("Array size = %llu (elements), Offset = %d (elements)\n" , (unsigned long long) STREAM_ARRAY_SIZE, OFFSET); - printf("Memory per array = %.1f MiB (= %.1f GiB).\n", + printf("Memory per array = %.1f MiB (= %.1f GiB).\n", BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0), BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0/1024.0)); printf("Total memory required = %.1f MiB (= %.1f GiB).\n", (3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.), (3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024./1024.)); printf("Each kernel will be executed %d times.\n", NTIMES); - printf(" The *best* time for each kernel (excluding the first iteration)\n"); + printf(" The *best* time for each kernel (excluding the first iteration)\n"); printf(" will be used to compute the reported bandwidth.\n"); #ifdef _OPENMP printf(HLINE); -#pragma omp parallel +#pragma omp parallel { #pragma omp master { @@ -258,7 +284,7 @@ main() #ifdef _OPENMP k = 0; #pragma omp parallel 
-#pragma omp atomic +#pragma omp atomic k++; printf ("Number of Threads counted = %i\n",k); #endif @@ -273,7 +299,7 @@ main() printf(HLINE); - if ( (quantum = checktick()) >= 1) + if ( (quantum = checktick()) >= 1) printf("Your clock granularity/precision appears to be " "%d microseconds.\n", quantum); else { @@ -282,10 +308,11 @@ main() quantum = 1; } - t = mysecond(); + t = mysecond(); #pragma omp parallel for for (j = 0; j < STREAM_ARRAY_SIZE; j++) a[j] = 2.0E0 * a[j]; + t = 1.0E6 * (mysecond() - t); printf("Each test below will take on the order" @@ -300,7 +327,7 @@ main() printf("For best results, please be sure you know the\n"); printf("precision of your system timer.\n"); printf(HLINE); - + /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ scalar = 3.0; @@ -315,7 +342,7 @@ main() c[j] = a[j]; #endif times[0][k] = mysecond() - times[0][k]; - + times[1][k] = mysecond(); #ifdef TUNED tuned_STREAM_Scale(scalar); @@ -325,7 +352,7 @@ main() b[j] = scalar*c[j]; #endif times[1][k] = mysecond() - times[1][k]; - + times[2][k] = mysecond(); #ifdef TUNED tuned_STREAM_Add(); @@ -335,7 +362,7 @@ main() c[j] = a[j]+b[j]; #endif times[2][k] = mysecond() - times[2][k]; - + times[3][k] = mysecond(); #ifdef TUNED tuned_STREAM_Triad(scalar); @@ -358,7 +385,7 @@ main() maxtime[j] = MAX(maxtime[j], times[j][k]); } } - + printf("Function Best Rate MB/s Avg time Min time Max time\n"); for (j=0; j<4; j++) { avgtime[j] = avgtime[j]/(double)(NTIMES-1); @@ -411,22 +438,6 @@ checktick() } - -/* A gettimeofday routine to give access to the wall - clock timer on most UNIX-like systems. */ - -#include <sys/time.h> - -double mysecond() -{ - struct timeval tp; - struct timezone tzp; - int i; - - i = gettimeofday(&tp,&tzp); - return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 ); -} - #ifndef abs #define abs(a) ((a) >= 0 ? 
(a) : -(a)) #endif @@ -476,7 +487,7 @@ void checkSTREAMresults () epsilon = 1.e-13; } else { - printf("WEIRD: sizeof(STREAM_TYPE) = %lu\n",sizeof(STREAM_TYPE)); + printf("WEIRD: sizeof(STREAM_TYPE) = %zu\n",sizeof(STREAM_TYPE)); epsilon = 1.e-6; } diff --git a/stream.f b/stream.f index e93e453..6aa0dc0 100644 --- a/stream.f +++ b/stream.f @@ -25,7 +25,7 @@ * accordance with the STREAM Run Rules must be clearly * labelled whenever they are published. Examples of * proper labelling include: -* "tuned STREAM benchmark results" +* "tuned STREAM benchmark results" * "based on a variant of the STREAM benchmark code" * Other comparable, clear and reasonable labelling is * acceptable. @@ -48,11 +48,9 @@ * Most of the content is currently hosted at: * http://www.cs.virginia.edu/stream/ * -* BRIEF INSTRUCTIONS: +* BRIEF INSTRUCTIONS: * 0) See http://www.cs.virginia.edu/stream/ref.html for details -* 1) STREAM requires a timing function called mysecond(). -* Several examples are provided in this directory. -* "CPU" timers are only allowed for uniprocessor runs. +* 1) "CPU" timers are only allowed for uniprocessor runs. * "Wall-clock" timers are required for all multiprocessor runs. * 2) The STREAM array sizes must be set to size the test. * The value "N" must be chosen so that each of the three @@ -64,7 +62,7 @@ * that makes each array 4x larger than the last-level cache. * The intent is to determine the *best* sustainable bandwidth * available with this simple coding. Of course, lower values -* are usually fairly easy to obtain on cached machines, but +* are usually fairly easy to obtain on cached machines, but * by keeping the test to the *best* results, the answers are * easier to interpret. * You may put the arrays in common or not, at your discretion. @@ -91,48 +89,51 @@ *========================================================================= * PROGRAM stream -* IMPLICIT NONE + + use, intrinsic :: iso_fortran_env, only : int64 + IMPLICIT NONE C .. Parameters .. 
- INTEGER n,offset,ndim,ntimes + INTEGER :: n,offset,ndim,ntimes PARAMETER (n=20000000,offset=0,ndim=n+offset,ntimes=10) C .. C .. Local Scalars .. - DOUBLE PRECISION scalar,t - INTEGER j,k,nbpw,quantum + DOUBLE PRECISION :: scalar + integer(int64) :: t64, tic, toc + integer(int64) :: tick_rate + INTEGER ::j,k,nbpw C .. C .. Local Arrays .. - DOUBLE PRECISION maxtime(4),mintime(4),avgtime(4), + DOUBLE PRECISION :: maxtime(4),mintime(4),avgtime(4), $ times(4,ntimes) - INTEGER bytes(4) - CHARACTER label(4)*11 + INTEGER :: bytes(4) + CHARACTER(11) ::label(4) C .. -C .. External Functions .. - DOUBLE PRECISION mysecond - INTEGER checktick,realsize - EXTERNAL mysecond,checktick,realsize -!$ INTEGER omp_get_num_threads -!$ EXTERNAL omp_get_num_threads + +!$ INTEGER, external :: omp_get_num_threads C .. C .. Intrinsic Functions .. C - INTRINSIC dble,max,min,nint,sqrt + INTRINSIC :: dble,max,min,nint,sqrt C .. C .. Arrays in Common .. - DOUBLE PRECISION a(ndim),b(ndim),c(ndim) -C .. -C .. Common blocks .. -* COMMON a,b,c + DOUBLE PRECISION, allocatable, dimension(:) :: a, b, c C .. C .. Data statements .. - DATA avgtime/4*0.0D0/,mintime/4*1.0D+36/,maxtime/4*0.0D0/ - DATA label/'Copy: ','Scale: ','Add: ', - $ 'Triad: '/ - DATA bytes/2,2,3,3/ + avgtime = 0 + mintime = huge(0) + maxtime = 0 + label = ['Copy: ','Scale: ','Add: ','Triad: '] + bytes = [2,2,3,3] C .. 
* --- SETUP --- determine precision and check timing --- - nbpw = realsize() + allocate(a(ndim), b(ndim), c(ndim)) + + call system_clock(COUNT_RATE=tick_rate) +C set timing to max precision, typically sub-microsecond + + nbpw = storage_size(a)/8 PRINT *,'----------------------------------------------' PRINT *,'STREAM Version $Revision: 5.6 $' @@ -164,62 +165,66 @@ PROGRAM stream b(j) = 0.5D0 c(j) = 0.0D0 10 CONTINUE - t = mysecond() + call system_clock(count=tic) !$OMP PARALLEL DO DO 20 j = 1,n a(j) = 0.5d0*a(j) 20 CONTINUE - t = mysecond() - t + call system_clock(count=toc) + t64 = toc - tic PRINT *,'----------------------------------------------------' - quantum = checktick() - WRITE (*,FMT=9000) - $ 'Your clock granularity/precision appears to be ',quantum, - $ ' microseconds' + + print '(a,f10.3)','Clock granularity/precision (microseconds):', + & 1/dble(tick_rate) * 1e6 PRINT *,'----------------------------------------------------' * --- MAIN LOOP --- repeat test cases NTIMES times --- scalar = 0.5d0*a(1) DO 70 k = 1,ntimes - t = mysecond() - a(1) = a(1) + t + call system_clock(count=tic) + a(1) = a(1) + tic !$OMP PARALLEL DO DO 30 j = 1,n c(j) = a(j) 30 CONTINUE - t = mysecond() - t - c(n) = c(n) + t - times(1,k) = t + call system_clock(count=toc) + t64 = toc - tic + c(n) = c(n) + t64 + times(1,k) = t64 / dble(tick_rate) - t = mysecond() - c(1) = c(1) + t + call system_clock(count=tic) + c(1) = c(1) + tic !$OMP PARALLEL DO DO 40 j = 1,n b(j) = scalar*c(j) 40 CONTINUE - t = mysecond() - t - b(n) = b(n) + t - times(2,k) = t + call system_clock(count=toc) + t64 = toc - tic + b(n) = b(n) + t64 + times(2,k) = t64 / dble(tick_rate) - t = mysecond() - a(1) = a(1) + t + call system_clock(count=tic) + a(1) = a(1) + tic !$OMP PARALLEL DO DO 50 j = 1,n c(j) = a(j) + b(j) 50 CONTINUE - t = mysecond() - t - c(n) = c(n) + t - times(3,k) = t + call system_clock(count=toc) + t64 = toc - tic + c(n) = c(n) + t64 + times(3,k) = t64 / dble(tick_rate) - t = mysecond() - b(1) = 
b(1) + t + call system_clock(count=tic) + b(1) = b(1) + tic !$OMP PARALLEL DO DO 60 j = 1,n a(j) = b(j) + scalar*c(j) 60 CONTINUE - t = mysecond() - t - a(n) = a(n) + t - times(4,k) = t + call system_clock(count=toc) + t64 = toc - tic + a(n) = a(n) + t64 + times(4,k) = t64 / dble(tick_rate) 70 CONTINUE * --- SUMMARY --- @@ -240,171 +245,26 @@ PROGRAM stream CALL checksums (a,b,c,n,ntimes) PRINT *,'----------------------------------------------------' - 9000 FORMAT (1x,a,i6,a) 9010 FORMAT (1x,a,i10) 9020 FORMAT (1x,a,i4,a) 9030 FORMAT (1x,a,i3,a,a) 9040 FORMAT ('Function',5x,'Rate (MB/s) Avg time Min time Max time' $ ) 9050 FORMAT (a,4 (f10.4,2x)) - END - -*------------------------------------- -* INTEGER FUNCTION dblesize() -* -* A semi-portable way to determine the precision of DOUBLE PRECISION -* in Fortran. -* Here used to guess how many bytes of storage a DOUBLE PRECISION -* number occupies. -* - INTEGER FUNCTION realsize() -* IMPLICIT NONE - -C .. Local Scalars .. - DOUBLE PRECISION result,test - INTEGER j,ndigits -C .. -C .. Local Arrays .. - DOUBLE PRECISION ref(30) -C .. -C .. External Subroutines .. - EXTERNAL confuse -C .. -C .. Intrinsic Functions .. - INTRINSIC abs,acos,log10,sqrt -C .. - -C Test #1 - compare single(1.0d0+delta) to 1.0d0 - - 10 DO 20 j = 1,30 - ref(j) = 1.0d0 + 10.0d0** (-j) - 20 CONTINUE - - DO 30 j = 1,30 - test = ref(j) - ndigits = j - CALL confuse(test,result) - IF (test.EQ.1.0D0) THEN - GO TO 40 - END IF - 30 CONTINUE - GO TO 50 - - 40 WRITE (*,FMT='(a)') - $ '----------------------------------------------' - WRITE (*,FMT='(1x,a,i2,a)') 'Double precision appears to have ', - $ ndigits,' digits of accuracy' - IF (ndigits.LE.8) THEN - realsize = 4 - ELSE - realsize = 8 - END IF - WRITE (*,FMT='(1x,a,i1,a)') 'Assuming ',realsize, - $ ' bytes per DOUBLE PRECISION word' - WRITE (*,FMT='(a)') - $ '----------------------------------------------' - RETURN - - 50 PRINT *,'Hmmmm. I am unable to determine the size.' 
- PRINT *,'Please enter the number of Bytes per DOUBLE PRECISION', - $ ' number : ' - READ (*,FMT=*) realsize - IF (realsize.NE.4 .AND. realsize.NE.8) THEN - PRINT *,'Your answer ',realsize,' does not make sense.' - PRINT *,'Try again.' - PRINT *,'Please enter the number of Bytes per ', - $ 'DOUBLE PRECISION number : ' - READ (*,FMT=*) realsize - END IF - PRINT *,'You have manually entered a size of ',realsize, - $ ' bytes per DOUBLE PRECISION number' - WRITE (*,FMT='(a)') - $ '----------------------------------------------' - END - - SUBROUTINE confuse(q,r) -* IMPLICIT NONE -C .. Scalar Arguments .. - DOUBLE PRECISION q,r -C .. -C .. Intrinsic Functions .. - INTRINSIC cos -C .. - r = cos(q) - RETURN - END - -* A semi-portable way to determine the clock granularity -* Adapted from a code by John Henning of Digital Equipment Corporation -* - INTEGER FUNCTION checktick() -* IMPLICIT NONE - -C .. Parameters .. - INTEGER n - PARAMETER (n=20) -C .. -C .. Local Scalars .. - DOUBLE PRECISION t1,t2 - INTEGER i,j,jmin -C .. -C .. Local Arrays .. - DOUBLE PRECISION timesfound(n) -C .. -C .. External Functions .. - DOUBLE PRECISION mysecond - EXTERNAL mysecond -C .. -C .. Intrinsic Functions .. - INTRINSIC max,min,nint -C .. - i = 0 - - 10 t2 = mysecond() - IF (t2.EQ.t1) GO TO 10 - - t1 = t2 - i = i + 1 - timesfound(i) = t1 - IF (i.LT.n) GO TO 10 - - jmin = 1000000 - DO 20 i = 2,n - j = nint((timesfound(i)-timesfound(i-1))*1d6) - jmin = min(jmin,max(j,0)) - 20 CONTINUE - - IF (jmin.GT.0) THEN - checktick = jmin - ELSE - PRINT *,'Your clock granularity appears to be less ', - $ 'than one microsecond' - checktick = 1 - END IF - RETURN - -* PRINT 14, timesfound(1)*1d6 -* DO 20 i=2,n -* PRINT 14, timesfound(i)*1d6, -* & nint((timesfound(i)-timesfound(i-1))*1d6) -* 14 FORMAT (1X, F18.4, 1X, i8) -* 20 CONTINUE - - END - + contains SUBROUTINE checksums(a,b,c,n,ntimes) -* IMPLICIT NONE + IMPLICIT NONE C .. C .. Arguments .. 
- DOUBLE PRECISION a(*),b(*),c(*) - INTEGER n,ntimes + DOUBLE PRECISION, intent(in), dimension(:) :: a, b, c + INTEGER, intent(in) :: n, ntimes C .. C .. Local Scalars .. - DOUBLE PRECISION aa,bb,cc,scalar,suma,sumb,sumc,epsilon - INTEGER k + DOUBLE PRECISION :: aa,bb,cc,scalar,suma,sumb,sumc,epsilon + INTEGER :: k C .. C Repeat the main loop, but with scalars only. @@ -430,9 +290,9 @@ SUBROUTINE checksums(a,b,c,n,ntimes) C elements, which are modified using the timing results C to confuse aggressive optimizers. - suma = 0.0d0 - sumb = 0.0d0 - sumc = 0.0d0 + suma = 0 + sumb = 0 + sumc = 0 !$OMP PARALLEL DO REDUCTION(+:suma,sumb,sumc) DO 110 j = 2,n-1 suma = suma + a(j) @@ -442,21 +302,26 @@ SUBROUTINE checksums(a,b,c,n,ntimes) epsilon = 1.D-6 - IF (ABS(suma-aa)/suma .GT. epsilon) THEN + IF (ABS(suma-aa)/suma > epsilon) THEN PRINT *,'Failed Validation on array a()' PRINT *,'Target Sum of a is = ',aa PRINT *,'Computed Sum of a is = ',suma - ELSEIF (ABS(sumb-bb)/sumb .GT. epsilon) THEN + error stop + ELSEIF (ABS(sumb-bb)/sumb > epsilon) THEN PRINT *,'Failed Validation on array b()' PRINT *,'Target Sum of b is = ',bb PRINT *,'Computed Sum of b is = ',sumb - ELSEIF (ABS(sumc-cc)/sumc .GT. epsilon) THEN + error stop + ELSEIF (ABS(sumc-cc)/sumc > epsilon) THEN PRINT *,'Failed Validation on array c()' PRINT *,'Target Sum of c is = ',cc PRINT *,'Computed Sum of c is = ',sumc + error stop ELSE PRINT *,'Solution Validates!' ENDIF - END + END subroutine checksums + + END program stream