diff --git a/stream.c b/stream.c index 9bbd6ce..4f0221a 100644 --- a/stream.c +++ b/stream.c @@ -40,8 +40,16 @@ /* program constitutes acceptance of these licensing restrictions. */ /* 5. Absolutely no warranty is expressed or implied. */ /*-----------------------------------------------------------------------*/ +#ifdef OMPGPU +#ifdef OMPGPU_UNIFIED +#pragma omp requires unified_address +#pragma omp requires unified_shared_memory +#endif +#endif + # include # include +# include # include # include # include @@ -91,7 +99,7 @@ * per array. */ #ifndef STREAM_ARRAY_SIZE -# define STREAM_ARRAY_SIZE 10000000 +# define STREAM_ARRAY_SIZE 10000000l #endif /* 2) STREAM runs each kernel "NTIMES" times and reports the *best* result @@ -176,9 +184,9 @@ #define STREAM_TYPE double #endif -static STREAM_TYPE a[STREAM_ARRAY_SIZE+OFFSET], - b[STREAM_ARRAY_SIZE+OFFSET], - c[STREAM_ARRAY_SIZE+OFFSET]; +static STREAM_TYPE* a = NULL; +static STREAM_TYPE* b = NULL; +static STREAM_TYPE* c = NULL; static double avgtime[4] = {0}, maxtime[4] = {0}, mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX}; @@ -214,6 +222,17 @@ main() STREAM_TYPE scalar; double t, times[4][NTIMES]; + a= calloc((STREAM_ARRAY_SIZE+OFFSET),sizeof(STREAM_TYPE)); + b= calloc((STREAM_ARRAY_SIZE+OFFSET),sizeof(STREAM_TYPE)); + c= calloc((STREAM_ARRAY_SIZE+OFFSET),sizeof(STREAM_TYPE)); + +#ifdef OMPGPU +#ifndef OMPGPU_UNIFIED +#pragma omp target enter data map(to:a[0:(STREAM_ARRAY_SIZE+OFFSET)]) +#pragma omp target enter data map(to:b[0:(STREAM_ARRAY_SIZE+OFFSET)]) +#pragma omp target enter data map(to:c[0:(STREAM_ARRAY_SIZE+OFFSET)]) +#endif +#endif /* --- SETUP --- determine precision and check timing --- */ printf(HLINE); @@ -245,6 +264,9 @@ main() #ifdef _OPENMP printf(HLINE); +#ifdef OMPGPU + printf ("Using accelerator\n"); +#else #pragma omp parallel { #pragma omp master @@ -254,17 +276,24 @@ main() } } #endif +#endif +#ifndef OMPGPU #ifdef _OPENMP k = 0; #pragma omp parallel #pragma omp atomic k++; printf ("Number of Threads counted = %i\n",k); +#endif #endif /* Get initial value for system clock. */ +#ifdef OMPGPU +#pragma omp target teams distribute parallel for map(tofrom:a[0:(STREAM_ARRAY_SIZE+OFFSET)],b[0:(STREAM_ARRAY_SIZE+OFFSET)],c[0:(STREAM_ARRAY_SIZE+OFFSET)]) +#else #pragma omp parallel for +#endif for (j=0; j