| /*-----------------------------------------------------------------------*/ |
| /* Program: STREAM */ |
| /* Revision: $Id: stream.c,v 5.10 2013/01/17 16:01:06 mccalpin Exp mccalpin $ */ |
| /* Original code developed by John D. McCalpin */ |
| /* Programmers: John D. McCalpin */ |
| /* Joe R. Zagar */ |
| /* */ |
| /* This program measures memory transfer rates in MB/s for simple */ |
| /* computational kernels coded in C. */ |
| /*-----------------------------------------------------------------------*/ |
| /* Copyright 1991-2013: John D. McCalpin */ |
| /*-----------------------------------------------------------------------*/ |
| /* License: */ |
| /* 1. You are free to use this program and/or to redistribute */ |
| /* this program. */ |
| /* 2. You are free to modify this program for your own use, */ |
| /* including commercial use, subject to the publication */ |
| /* restrictions in item 3. */ |
| /* 3. You are free to publish results obtained from running this */ |
| /* program, or from works that you derive from this program, */ |
| /* with the following limitations: */ |
| /* 3a. In order to be referred to as "STREAM benchmark results", */ |
| /* published results must be in conformance to the STREAM */ |
| /* Run Rules, (briefly reviewed below) published at */ |
| /* http://www.cs.virginia.edu/stream/ref.html */ |
| /* and incorporated herein by reference. */ |
| /* As the copyright holder, John McCalpin retains the */ |
| /* right to determine conformity with the Run Rules. */ |
| /* 3b. Results based on modified source code or on runs not in */ |
| /* accordance with the STREAM Run Rules must be clearly */ |
| /* labelled whenever they are published. Examples of */ |
| /* proper labelling include: */ |
| /* "tuned STREAM benchmark results" */ |
| /* "based on a variant of the STREAM benchmark code" */ |
| /* Other comparable, clear, and reasonable labelling is */ |
| /* acceptable. */ |
| /* 3c. Submission of results to the STREAM benchmark web site */ |
| /* is encouraged, but not required. */ |
| /* 4. Use of this program or creation of derived works based on this */ |
| /* program constitutes acceptance of these licensing restrictions. */ |
| /* 5. Absolutely no warranty is expressed or implied. */ |
| /*-----------------------------------------------------------------------*/ |
| # include <stdio.h> |
| # include <unistd.h> |
| # include <math.h> |
| # include <float.h> |
| # include <limits.h> |
| # include <sys/time.h> |
| |
| /*----------------------------------------------------------------------- |
| * INSTRUCTIONS: |
| * |
| * 1) STREAM requires different amounts of memory to run on different |
| * systems, depending on both the system cache size(s) and the |
| * granularity of the system timer. |
| * You should adjust the value of 'STREAM_ARRAY_SIZE' (below) |
| * to meet *both* of the following criteria: |
| * (a) Each array must be at least 4 times the size of the |
| * available cache memory. I don't worry about the difference |
| * between 10^6 and 2^20, so in practice the minimum array size |
| * is about 3.8 times the cache size. |
| * Example 1: One Xeon E3 with 8 MB L3 cache |
| * STREAM_ARRAY_SIZE should be >= 4 million, giving |
| * an array size of 30.5 MB and a total memory requirement |
| * of 91.5 MB. |
| * Example 2: Two Xeon E5's with 20 MB L3 cache each (using OpenMP) |
| * STREAM_ARRAY_SIZE should be >= 20 million, giving |
| * an array size of 153 MB and a total memory requirement |
| * of 458 MB. |
| * (b) The size should be large enough so that the 'timing calibration' |
| * output by the program is at least 20 clock-ticks. |
| * Example: most versions of Windows have a 10 millisecond timer |
| * granularity. 20 "ticks" at 10 ms/tic is 200 milliseconds. |
| * If the chip is capable of 10 GB/s, it moves 2 GB in 200 msec. |
| * This means the each array must be at least 1 GB, or 128M elements. |
| * |
| * Version 5.10 increases the default array size from 2 million |
| * elements to 10 million elements in response to the increasing |
| * size of L3 caches. The new default size is large enough for caches |
| * up to 20 MB. |
| * Version 5.10 changes the loop index variables from "register int" |
| * to "ssize_t", which allows array indices >2^32 (4 billion) |
| * on properly configured 64-bit systems. Additional compiler options |
| * (such as "-mcmodel=medium") may be required for large memory runs. |
| * |
| * Array size can be set at compile time without modifying the source |
| * code for the (many) compilers that support preprocessor definitions |
| * on the compile line. E.g., |
| * gcc -O -DSTREAM_ARRAY_SIZE=100000000 stream.c -o stream.100M |
| * will override the default size of 10M with a new size of 100M elements |
| * per array. |
| */ |
| #ifndef STREAM_ARRAY_SIZE |
| # define STREAM_ARRAY_SIZE 10000000 |
| #endif |
| |
| /* 2) STREAM runs each kernel "NTIMES" times and reports the *best* result |
| * for any iteration after the first, therefore the minimum value |
| * for NTIMES is 2. |
| * There are no rules on maximum allowable values for NTIMES, but |
| * values larger than the default are unlikely to noticeably |
| * increase the reported performance. |
| * NTIMES can also be set on the compile line without changing the source |
| * code using, for example, "-DNTIMES=7". |
| */ |
| #ifdef NTIMES |
| #if NTIMES<=1 |
| # define NTIMES 10 |
| #endif |
| #endif |
| #ifndef NTIMES |
| # define NTIMES 10 |
| #endif |
| |
| /* Users are allowed to modify the "OFFSET" variable, which *may* change the |
| * relative alignment of the arrays (though compilers may change the |
| * effective offset by making the arrays non-contiguous on some systems). |
| * Use of non-zero values for OFFSET can be especially helpful if the |
| * STREAM_ARRAY_SIZE is set to a value close to a large power of 2. |
| * OFFSET can also be set on the compile line without changing the source |
| * code using, for example, "-DOFFSET=56". |
| */ |
| #ifndef OFFSET |
| # define OFFSET 0 |
| #endif |
| |
| /* |
| * 3) Compile the code with optimization. Many compilers generate |
| * unreasonably bad code before the optimizer tightens things up. |
| * If the results are unreasonably good, on the other hand, the |
| * optimizer might be too smart for me! |
| * |
| * For a simple single-core version, try compiling with: |
| * cc -O stream.c -o stream |
| * This is known to work on many, many systems.... |
| * |
| * To use multiple cores, you need to tell the compiler to obey the OpenMP |
| * directives in the code. This varies by compiler, but a common example is |
| * gcc -O -fopenmp stream.c -o stream_omp |
| * The environment variable OMP_NUM_THREADS allows runtime control of the |
| * number of threads/cores used when the resulting "stream_omp" program |
| * is executed. |
| * |
| * To run with single-precision variables and arithmetic, simply add |
| * -DSTREAM_TYPE=float |
| * to the compile line. |
| * Note that this changes the minimum array sizes required --- see (1) above. |
| * |
| * The preprocessor directive "TUNED" does not do much -- it simply causes the |
| * code to call separate functions to execute each kernel. Trivial versions |
| * of these functions are provided, but they are *not* tuned -- they just |
| * provide predefined interfaces to be replaced with tuned code. |
| * |
| * |
| * 4) Optional: Mail the results to mccalpin@cs.virginia.edu |
| * Be sure to include info that will help me understand: |
| * a) the computer hardware configuration (e.g., processor model, memory type) |
| * b) the compiler name/version and compilation flags |
| * c) any run-time information (such as OMP_NUM_THREADS) |
| * d) all of the output from the test case. |
| * |
| * Thanks! |
| * |
| *-----------------------------------------------------------------------*/ |
| |
| # define HLINE "-------------------------------------------------------------\n" |
| |
| # ifndef MIN |
| # define MIN(x,y) ((x)<(y)?(x):(y)) |
| # endif |
| # ifndef MAX |
| # define MAX(x,y) ((x)>(y)?(x):(y)) |
| # endif |
| |
| #ifndef STREAM_TYPE |
| #define STREAM_TYPE double |
| #endif |
| |
| static STREAM_TYPE a[STREAM_ARRAY_SIZE+OFFSET], |
| b[STREAM_ARRAY_SIZE+OFFSET], |
| c[STREAM_ARRAY_SIZE+OFFSET]; |
| |
| static double avgtime[4] = {0}, maxtime[4] = {0}, |
| mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX}; |
| |
| static char *label[4] = {"Copy: ", "Scale: ", |
| "Add: ", "Triad: "}; |
| |
| static double bytes[4] = { |
| 2 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE, |
| 2 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE, |
| 3 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE, |
| 3 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE |
| }; |
| |
| int checktick(void); |
| double mysecond(void); |
| extern void checkSTREAMresults(void); |
| #ifdef TUNED |
| extern void tuned_STREAM_Copy(); |
| extern void tuned_STREAM_Scale(STREAM_TYPE scalar); |
| extern void tuned_STREAM_Add(); |
| extern void tuned_STREAM_Triad(STREAM_TYPE scalar); |
| #endif |
| #ifdef _OPENMP |
| extern int omp_get_num_threads(); |
| #endif |
| int |
| main() |
| { |
| int quantum, checktick(); |
| int BytesPerWord; |
| int k; |
| ssize_t j; |
| STREAM_TYPE scalar; |
| double t, times[4][NTIMES]; |
| |
| /* --- SETUP --- determine precision and check timing --- */ |
| |
| printf(HLINE); |
| printf("STREAM version $Revision: 5.10 $\n"); |
| printf(HLINE); |
| BytesPerWord = sizeof(STREAM_TYPE); |
| printf("This system uses %d bytes per array element.\n", |
| BytesPerWord); |
| |
| printf(HLINE); |
| #ifdef N |
| printf("***** WARNING: ******\n"); |
| printf(" It appears that you set the preprocessor variable N when compiling this code.\n"); |
| printf(" This version of the code uses the preprocesor variable STREAM_ARRAY_SIZE to control the array size\n"); |
| printf(" Reverting to default value of STREAM_ARRAY_SIZE=%llu\n",(unsigned long long) STREAM_ARRAY_SIZE); |
| printf("***** WARNING: ******\n"); |
| #endif |
| |
| printf("Array size = %llu (elements), Offset = %d (elements)\n" , (unsigned long long) STREAM_ARRAY_SIZE, OFFSET); |
| printf("Memory per array = %.1f MiB (= %.1f GiB).\n", |
| BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0), |
| BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0/1024.0)); |
| printf("Total memory required = %.1f MiB (= %.1f GiB).\n", |
| (3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.), |
| (3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024./1024.)); |
| printf("Each kernel will be executed %d times.\n", NTIMES); |
| printf(" The *best* time for each kernel (excluding the first iteration)\n"); |
| printf(" will be used to compute the reported bandwidth.\n"); |
| |
| #ifdef _OPENMP |
| printf(HLINE); |
| #pragma omp parallel |
| { |
| #pragma omp master |
| { |
| k = omp_get_num_threads(); |
| printf ("Number of Threads requested = %i\n",k); |
| } |
| } |
| #endif |
| |
| #ifdef _OPENMP |
| k = 0; |
| #pragma omp parallel |
| #pragma omp atomic |
| k++; |
| printf ("Number of Threads counted = %i\n",k); |
| #endif |
| |
| /* Get initial value for system clock. */ |
| #ifdef _OPENMP |
| #pragma omp parallel for |
| #endif |
| for (j=0; j<STREAM_ARRAY_SIZE; j++) { |
| a[j] = 1.0; |
| b[j] = 2.0; |
| c[j] = 0.0; |
| } |
| |
| printf(HLINE); |
| |
| if ( (quantum = checktick()) >= 1) |
| printf("Your clock granularity/precision appears to be " |
| "%d microseconds.\n", quantum); |
| else { |
| printf("Your clock granularity appears to be " |
| "less than one microsecond.\n"); |
| quantum = 1; |
| } |
| |
| t = mysecond(); |
| #ifdef _OPENMP |
| #pragma omp parallel for |
| #endif |
| for (j = 0; j < STREAM_ARRAY_SIZE; j++) |
| a[j] = 2.0E0 * a[j]; |
| t = 1.0E6 * (mysecond() - t); |
| |
| printf("Each test below will take on the order" |
| " of %d microseconds.\n", (int) t ); |
| printf(" (= %d clock ticks)\n", (int) (t/quantum) ); |
| printf("Increase the size of the arrays if this shows that\n"); |
| printf("you are not getting at least 20 clock ticks per test.\n"); |
| |
| printf(HLINE); |
| |
| printf("WARNING -- The above is only a rough guideline.\n"); |
| printf("For best results, please be sure you know the\n"); |
| printf("precision of your system timer.\n"); |
| printf(HLINE); |
| |
| /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ |
| |
| scalar = 3.0; |
| for (k=0; k<NTIMES; k++) |
| { |
| times[0][k] = mysecond(); |
| #ifdef TUNED |
| tuned_STREAM_Copy(); |
| #else |
| #ifdef _OPENMP |
| #pragma omp parallel for |
| #endif |
| for (j=0; j<STREAM_ARRAY_SIZE; j++) |
| c[j] = a[j]; |
| #endif |
| times[0][k] = mysecond() - times[0][k]; |
| |
| times[1][k] = mysecond(); |
| #ifdef TUNED |
| tuned_STREAM_Scale(scalar); |
| #else |
| #ifdef _OPENMP |
| #pragma omp parallel for |
| #endif |
| for (j=0; j<STREAM_ARRAY_SIZE; j++) |
| b[j] = scalar*c[j]; |
| #endif |
| times[1][k] = mysecond() - times[1][k]; |
| |
| times[2][k] = mysecond(); |
| #ifdef TUNED |
| tuned_STREAM_Add(); |
| #else |
| #ifdef _OPENMP |
| #pragma omp parallel for |
| #endif |
| for (j=0; j<STREAM_ARRAY_SIZE; j++) |
| c[j] = a[j]+b[j]; |
| #endif |
| times[2][k] = mysecond() - times[2][k]; |
| |
| times[3][k] = mysecond(); |
| #ifdef TUNED |
| tuned_STREAM_Triad(scalar); |
| #else |
| #ifdef _OPENMP |
| #pragma omp parallel for |
| #endif |
| for (j=0; j<STREAM_ARRAY_SIZE; j++) |
| a[j] = b[j]+scalar*c[j]; |
| #endif |
| times[3][k] = mysecond() - times[3][k]; |
| } |
| |
| /* --- SUMMARY --- */ |
| |
| for (k=1; k<NTIMES; k++) /* note -- skip first iteration */ |
| { |
| for (j=0; j<4; j++) |
| { |
| avgtime[j] = avgtime[j] + times[j][k]; |
| mintime[j] = MIN(mintime[j], times[j][k]); |
| maxtime[j] = MAX(maxtime[j], times[j][k]); |
| } |
| } |
| |
| printf("Function Best Rate MB/s Avg time Min time Max time\n"); |
| for (j=0; j<4; j++) { |
| avgtime[j] = avgtime[j]/(double)(NTIMES-1); |
| |
| printf("%s%12.1f %11.6f %11.6f %11.6f\n", label[j], |
| 1.0E-06 * bytes[j]/mintime[j], |
| avgtime[j], |
| mintime[j], |
| maxtime[j]); |
| } |
| printf(HLINE); |
| |
| /* --- Check Results --- */ |
| checkSTREAMresults(); |
| printf(HLINE); |
| |
| return 0; |
| } |
| |
| # define M 20 |
| |
| int |
| checktick(void) |
| { |
| int i, minDelta, Delta; |
| double t1, t2, timesfound[M]; |
| |
| /* Collect a sequence of M unique time values from the system. */ |
| |
| for (i = 0; i < M; i++) { |
| t1 = mysecond(); |
| while( ((t2=mysecond()) - t1) < 1.0E-6 ) |
| ; |
| timesfound[i] = t1 = t2; |
| } |
| |
| /* |
| * Determine the minimum difference between these M values. |
| * This result will be our estimate (in microseconds) for the |
| * clock granularity. |
| */ |
| |
| minDelta = 1000000; |
| for (i = 1; i < M; i++) { |
| Delta = (int)( 1.0E6 * (timesfound[i]-timesfound[i-1])); |
| minDelta = MIN(minDelta, MAX(Delta,0)); |
| } |
| |
| return(minDelta); |
| } |
| |
| |
| |
| /* A gettimeofday routine to give access to the wall |
| clock timer on most UNIX-like systems. */ |
| |
| #include <sys/time.h> |
| |
| double mysecond(void) |
| { |
| struct timeval tp; |
| struct timezone tzp; |
| int i; |
| |
| i = gettimeofday(&tp,&tzp); |
| return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 ); |
| } |
| |
| #ifndef abs |
| #define abs(a) ((a) >= 0 ? (a) : -(a)) |
| #endif |
| void checkSTREAMresults (void) |
| { |
| STREAM_TYPE aj,bj,cj,scalar; |
| STREAM_TYPE aSumErr,bSumErr,cSumErr; |
| STREAM_TYPE aAvgErr,bAvgErr,cAvgErr; |
| double epsilon; |
| ssize_t j; |
| int k,ierr,err; |
| |
| /* reproduce initialization */ |
| aj = 1.0; |
| bj = 2.0; |
| cj = 0.0; |
| /* a[] is modified during timing check */ |
| aj = 2.0E0 * aj; |
| /* now execute timing loop */ |
| scalar = 3.0; |
| for (k=0; k<NTIMES; k++) |
| { |
| cj = aj; |
| bj = scalar*cj; |
| cj = aj+bj; |
| aj = bj+scalar*cj; |
| } |
| |
| /* accumulate deltas between observed and expected results */ |
| aSumErr = 0.0; |
| bSumErr = 0.0; |
| cSumErr = 0.0; |
| for (j=0; j<STREAM_ARRAY_SIZE; j++) { |
| aSumErr += abs(a[j] - aj); |
| bSumErr += abs(b[j] - bj); |
| cSumErr += abs(c[j] - cj); |
| // if (j == 417) printf("Index 417: c[j]: %f, cj: %f\n",c[j],cj); // MCCALPIN |
| } |
| aAvgErr = aSumErr / (STREAM_TYPE) STREAM_ARRAY_SIZE; |
| bAvgErr = bSumErr / (STREAM_TYPE) STREAM_ARRAY_SIZE; |
| cAvgErr = cSumErr / (STREAM_TYPE) STREAM_ARRAY_SIZE; |
| |
| if (sizeof(STREAM_TYPE) == 4) { |
| epsilon = 1.e-6; |
| } |
| else if (sizeof(STREAM_TYPE) == 8) { |
| epsilon = 1.e-13; |
| } |
| else { |
| printf("WEIRD: sizeof(STREAM_TYPE) = %lu\n",sizeof(STREAM_TYPE)); |
| epsilon = 1.e-6; |
| } |
| |
| err = 0; |
| if (abs(aAvgErr/aj) > epsilon) { |
| err++; |
| printf ("Failed Validation on array a[], AvgRelAbsErr > epsilon (%e)\n",epsilon); |
| printf (" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",aj,aAvgErr,abs(aAvgErr)/aj); |
| ierr = 0; |
| for (j=0; j<STREAM_ARRAY_SIZE; j++) { |
| if (abs(a[j]/aj-1.0) > epsilon) { |
| ierr++; |
| #ifdef VERBOSE |
| if (ierr < 10) { |
| printf(" array a: index: %ld, expected: %e, observed: %e, relative error: %e\n", |
| j,aj,a[j],abs((aj-a[j])/aAvgErr)); |
| } |
| #endif |
| } |
| } |
| printf(" For array a[], %d errors were found.\n",ierr); |
| } |
| if (abs(bAvgErr/bj) > epsilon) { |
| err++; |
| printf ("Failed Validation on array b[], AvgRelAbsErr > epsilon (%e)\n",epsilon); |
| printf (" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",bj,bAvgErr,abs(bAvgErr)/bj); |
| printf (" AvgRelAbsErr > Epsilon (%e)\n",epsilon); |
| ierr = 0; |
| for (j=0; j<STREAM_ARRAY_SIZE; j++) { |
| if (abs(b[j]/bj-1.0) > epsilon) { |
| ierr++; |
| #ifdef VERBOSE |
| if (ierr < 10) { |
| printf(" array b: index: %ld, expected: %e, observed: %e, relative error: %e\n", |
| j,bj,b[j],abs((bj-b[j])/bAvgErr)); |
| } |
| #endif |
| } |
| } |
| printf(" For array b[], %d errors were found.\n",ierr); |
| } |
| if (abs(cAvgErr/cj) > epsilon) { |
| err++; |
| printf ("Failed Validation on array c[], AvgRelAbsErr > epsilon (%e)\n",epsilon); |
| printf (" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",cj,cAvgErr,abs(cAvgErr)/cj); |
| printf (" AvgRelAbsErr > Epsilon (%e)\n",epsilon); |
| ierr = 0; |
| for (j=0; j<STREAM_ARRAY_SIZE; j++) { |
| if (abs(c[j]/cj-1.0) > epsilon) { |
| ierr++; |
| #ifdef VERBOSE |
| if (ierr < 10) { |
| printf(" array c: index: %ld, expected: %e, observed: %e, relative error: %e\n", |
| j,cj,c[j],abs((cj-c[j])/cAvgErr)); |
| } |
| #endif |
| } |
| } |
| printf(" For array c[], %d errors were found.\n",ierr); |
| } |
| if (err == 0) { |
| printf ("Solution Validates: avg error less than %e on all three arrays\n",epsilon); |
| } |
| #ifdef VERBOSE |
| printf ("Results Validation Verbose Results: \n"); |
| printf (" Expected a(1), b(1), c(1): %f %f %f \n",aj,bj,cj); |
| printf (" Observed a(1), b(1), c(1): %f %f %f \n",a[1],b[1],c[1]); |
| printf (" Rel Errors on a, b, c: %e %e %e \n",abs(aAvgErr/aj),abs(bAvgErr/bj),abs(cAvgErr/cj)); |
| #endif |
| } |
| |
| #ifdef TUNED |
| /* stubs for "tuned" versions of the kernels */ |
| void tuned_STREAM_Copy() |
| { |
| ssize_t j; |
| #pragma omp parallel for |
| for (j=0; j<STREAM_ARRAY_SIZE; j++) |
| c[j] = a[j]; |
| } |
| |
| void tuned_STREAM_Scale(STREAM_TYPE scalar) |
| { |
| ssize_t j; |
| #pragma omp parallel for |
| for (j=0; j<STREAM_ARRAY_SIZE; j++) |
| b[j] = scalar*c[j]; |
| } |
| |
| void tuned_STREAM_Add() |
| { |
| ssize_t j; |
| #pragma omp parallel for |
| for (j=0; j<STREAM_ARRAY_SIZE; j++) |
| c[j] = a[j]+b[j]; |
| } |
| |
| void tuned_STREAM_Triad(STREAM_TYPE scalar) |
| { |
| ssize_t j; |
| #pragma omp parallel for |
| for (j=0; j<STREAM_ARRAY_SIZE; j++) |
| a[j] = b[j]+scalar*c[j]; |
| } |
| /* end of stubs for the "tuned" versions of the kernels */ |
| #endif |