| *======================================================================= |
| * Program: STREAM |
| * Programmer: John D. McCalpin |
| *----------------------------------------------------------------------- |
| * Copyright 1991-2003: John D. McCalpin |
| *----------------------------------------------------------------------- |
| * License: |
| * 1. You are free to use this program and/or to redistribute |
| * this program. |
| * 2. You are free to modify this program for your own use, |
| * including commercial use, subject to the publication |
| * restrictions in item 3. |
| * 3. You are free to publish results obtained from running this |
| * program, or from works that you derive from this program, |
| * with the following limitations: |
| * 3a. In order to be referred to as "STREAM benchmark results", |
| * published results must be in conformance to the STREAM |
| * Run Rules, (briefly reviewed below) published at |
| * http://www.cs.virginia.edu/stream/ref.html |
| * and incorporated herein by reference. |
| * As the copyright holder, John McCalpin retains the |
| * right to determine conformity with the Run Rules. |
| * 3b. Results based on modified source code or on runs not in |
| * accordance with the STREAM Run Rules must be clearly |
| * labelled whenever they are published. Examples of |
| * proper labelling include: |
| * "tuned STREAM benchmark results" |
| * "based on a variant of the STREAM benchmark code" |
| * Other comparable, clear and reasonable labelling is |
| * acceptable. |
| * 3c. Submission of results to the STREAM benchmark web site |
| * is encouraged, but not required. |
| * 4. Use of this program or creation of derived works based on this |
| * program constitutes acceptance of these licensing restrictions. |
| * 5. Absolutely no warranty is expressed or implied. |
| *----------------------------------------------------------------------- |
| * This program measures sustained memory transfer rates in MB/s for |
| * simple computational kernels coded in FORTRAN. |
| * |
| * The intent is to demonstrate the extent to which ordinary user |
| * code can exploit the main memory bandwidth of the system under |
| * test. |
| *======================================================================= |
| * The STREAM web page is at: |
| * http://www.streambench.org |
| * |
| * Most of the content is currently hosted at: |
| * http://www.cs.virginia.edu/stream/ |
| * |
| * BRIEF INSTRUCTIONS: |
| * 0) See http://www.cs.virginia.edu/stream/ref.html for details |
| * 1) STREAM requires a timing function called mysecond(). |
| * Several examples are provided in this directory. |
| * "CPU" timers are only allowed for uniprocessor runs. |
| * "Wall-clock" timers are required for all multiprocessor runs. |
| * 2) The STREAM array sizes must be set to size the test. |
| * The value "N" must be chosen so that each of the three |
| * arrays is at least 4x larger than the sum of all the last- |
| * level caches used in the run, or 1 million elements, which- |
| * ever is larger. |
| * ------------------------------------------------------------ |
| * Note that you are free to use any array length and offset |
| * that makes each array 4x larger than the last-level cache. |
| * The intent is to determine the *best* sustainable bandwidth |
| * available with this simple coding. Of course, lower values |
| * are usually fairly easy to obtain on cached machines, but |
| * by keeping the test to the *best* results, the answers are |
| * easier to interpret. |
| * You may put the arrays in common or not, at your discretion. |
| * There is a commented-out COMMON statement below. |
| * Fortran90 "allocatable" arrays are fine, too. |
| * ------------------------------------------------------------ |
| * 3) Compile the code with full optimization. Many compilers |
| * generate unreasonably bad code before the optimizer tightens |
| * things up. If the results are unreasonably good, on the |
| * other hand, the optimizer might be too smart for me |
| * Please let me know if this happens. |
| * 4) Mail the results to mccalpin@cs.virginia.edu |
| * Be sure to include: |
| * a) computer hardware model number and software revision |
| * b) the compiler flags |
| * c) all of the output from the test case. |
| * Please let me know if you do not want your name posted along |
| * with the submitted results. |
| * 5) See the web page for more comments about the run rules and |
| * about interpretation of the results. |
| * |
| * Thanks, |
| * Dr. Bandwidth |
| *========================================================================= |
| * |
| PROGRAM stream |
| * IMPLICIT NONE |
| C .. Parameters .. |
| INTEGER n,offset,ndim,ntimes |
| PARAMETER (n=2000000,offset=0,ndim=n+offset,ntimes=10) |
| C .. |
| C .. Local Scalars .. |
| DOUBLE PRECISION scalar,t |
| INTEGER j,k,nbpw,quantum |
| C .. |
| C .. Local Arrays .. |
| DOUBLE PRECISION maxtime(4),mintime(4),avgtime(4), |
| $ times(4,ntimes) |
| INTEGER bytes(4) |
| CHARACTER label(4)*11 |
| C .. |
| C .. External Functions .. |
| DOUBLE PRECISION mysecond |
| INTEGER checktick,realsize |
| EXTERNAL mysecond,checktick,realsize |
| !$ INTEGER omp_get_num_threads |
| !$ EXTERNAL omp_get_num_threads |
| C .. |
| C .. Intrinsic Functions .. |
| C |
| INTRINSIC dble,max,min,nint,sqrt |
| C .. |
| C .. Arrays in Common .. |
| DOUBLE PRECISION a(ndim),b(ndim),c(ndim) |
| C .. |
| C .. Common blocks .. |
| * COMMON a,b,c |
| C .. |
| C .. Data statements .. |
| DATA avgtime/4*0.0D0/,mintime/4*1.0D+36/,maxtime/4*0.0D0/ |
| DATA label/'Copy: ','Scale: ','Add: ', |
| $ 'Triad: '/ |
| DATA bytes/2,2,3,3/ |
| C .. |
| |
| * --- SETUP --- determine precision and check timing --- |
| |
| nbpw = realsize() |
| |
| PRINT *,'----------------------------------------------' |
| PRINT *,'STREAM Version $Revision$' |
| PRINT *,'----------------------------------------------' |
| WRITE (*,FMT=9010) 'Array size = ',n |
| WRITE (*,FMT=9010) 'Offset = ',offset |
| WRITE (*,FMT=9020) 'The total memory requirement is ', |
| $ 3*nbpw*n/ (1024*1024),' MB' |
| WRITE (*,FMT=9030) 'You are running each test ',ntimes,' times' |
| WRITE (*,FMT=9030) '--' |
| WRITE (*,FMT=9030) 'The *best* time for each test is used' |
| WRITE (*,FMT=9030) '*EXCLUDING* the first and last iterations' |
| |
| !$OMP PARALLEL |
| !$OMP MASTER |
| PRINT *,'----------------------------------------------' |
| !$ PRINT *,'Number of Threads = ',OMP_GET_NUM_THREADS() |
| !$OMP END MASTER |
| !$OMP END PARALLEL |
| |
| PRINT *,'----------------------------------------------' |
| !$OMP PARALLEL |
| PRINT *,'Printing one line per active thread....' |
| !$OMP END PARALLEL |
| |
| !$OMP PARALLEL DO |
| DO 10 j = 1,n |
| a(j) = 2.0d0 |
| b(j) = 0.5D0 |
| c(j) = 0.0D0 |
| 10 CONTINUE |
| t = mysecond() |
| !$OMP PARALLEL DO |
| DO 20 j = 1,n |
| a(j) = 0.5d0*a(j) |
| 20 CONTINUE |
| t = mysecond() - t |
| PRINT *,'----------------------------------------------------' |
| quantum = checktick() |
| WRITE (*,FMT=9000) |
| $ 'Your clock granularity/precision appears to be ',quantum, |
| $ ' microseconds' |
| PRINT *,'----------------------------------------------------' |
| |
| * --- MAIN LOOP --- repeat test cases NTIMES times --- |
| scalar = 0.5d0*a(1) |
| DO 70 k = 1,ntimes |
| |
| t = mysecond() |
| a(1) = a(1) + t |
| !$OMP PARALLEL DO |
| DO 30 j = 1,n |
| c(j) = a(j) |
| 30 CONTINUE |
| t = mysecond() - t |
| c(n) = c(n) + t |
| times(1,k) = t |
| |
| t = mysecond() |
| c(1) = c(1) + t |
| !$OMP PARALLEL DO |
| DO 40 j = 1,n |
| b(j) = scalar*c(j) |
| 40 CONTINUE |
| t = mysecond() - t |
| b(n) = b(n) + t |
| times(2,k) = t |
| |
| t = mysecond() |
| a(1) = a(1) + t |
| !$OMP PARALLEL DO |
| DO 50 j = 1,n |
| c(j) = a(j) + b(j) |
| 50 CONTINUE |
| t = mysecond() - t |
| c(n) = c(n) + t |
| times(3,k) = t |
| |
| t = mysecond() |
| b(1) = b(1) + t |
| !$OMP PARALLEL DO |
| DO 60 j = 1,n |
| a(j) = b(j) + scalar*c(j) |
| 60 CONTINUE |
| t = mysecond() - t |
| a(n) = a(n) + t |
| times(4,k) = t |
| 70 CONTINUE |
| |
| * --- SUMMARY --- |
| DO 90 k = 2,ntimes |
| DO 80 j = 1,4 |
| avgtime(j) = avgtime(j) + times(j,k) |
| mintime(j) = min(mintime(j),times(j,k)) |
| maxtime(j) = max(maxtime(j),times(j,k)) |
| 80 CONTINUE |
| 90 CONTINUE |
| WRITE (*,FMT=9040) |
| DO 100 j = 1,4 |
| avgtime(j) = avgtime(j)/dble(ntimes-1) |
| WRITE (*,FMT=9050) label(j),n*bytes(j)*nbpw/mintime(j)/1.0D6, |
| $ avgtime(j),mintime(j),maxtime(j) |
| 100 CONTINUE |
| PRINT *,'----------------------------------------------------' |
| CALL checksums (a,b,c,n,ntimes) |
| PRINT *,'----------------------------------------------------' |
| |
| 9000 FORMAT (1x,a,i6,a) |
| 9010 FORMAT (1x,a,i10) |
| 9020 FORMAT (1x,a,i4,a) |
| 9030 FORMAT (1x,a,i3,a,a) |
| 9040 FORMAT ('Function',5x,'Rate (MB/s) Avg time Min time Max time' |
| $ ) |
| 9050 FORMAT (a,4 (f10.4,2x)) |
| END |
| |
| *------------------------------------- |
| * INTEGER FUNCTION dblesize() |
| * |
| * A semi-portable way to determine the precision of DOUBLE PRECISION |
| * in Fortran. |
| * Here used to guess how many bytes of storage a DOUBLE PRECISION |
| * number occupies. |
| * |
| INTEGER FUNCTION realsize() |
| * IMPLICIT NONE |
| |
| C .. Local Scalars .. |
| DOUBLE PRECISION result,test |
| INTEGER j,ndigits |
| C .. |
| C .. Local Arrays .. |
| DOUBLE PRECISION ref(30) |
| C .. |
| C .. External Subroutines .. |
| EXTERNAL confuse |
| C .. |
| C .. Intrinsic Functions .. |
| INTRINSIC abs,acos,log10,sqrt |
| C .. |
| |
| C Test #1 - compare single(1.0d0+delta) to 1.0d0 |
| |
| 10 DO 20 j = 1,30 |
| ref(j) = 1.0d0 + 10.0d0** (-j) |
| 20 CONTINUE |
| |
| DO 30 j = 1,30 |
| test = ref(j) |
| ndigits = j |
| CALL confuse(test,result) |
| IF (test.EQ.1.0D0) THEN |
| GO TO 40 |
| END IF |
| 30 CONTINUE |
| GO TO 50 |
| |
| 40 WRITE (*,FMT='(a)') |
| $ '----------------------------------------------' |
| WRITE (*,FMT='(1x,a,i2,a)') 'Double precision appears to have ', |
| $ ndigits,' digits of accuracy' |
| IF (ndigits.LE.8) THEN |
| realsize = 4 |
| ELSE |
| realsize = 8 |
| END IF |
| WRITE (*,FMT='(1x,a,i1,a)') 'Assuming ',realsize, |
| $ ' bytes per DOUBLE PRECISION word' |
| WRITE (*,FMT='(a)') |
| $ '----------------------------------------------' |
| RETURN |
| |
| 50 PRINT *,'Hmmmm. I am unable to determine the size.' |
| PRINT *,'Please enter the number of Bytes per DOUBLE PRECISION', |
| $ ' number : ' |
| READ (*,FMT=*) realsize |
| IF (realsize.NE.4 .AND. realsize.NE.8) THEN |
| PRINT *,'Your answer ',realsize,' does not make sense.' |
| PRINT *,'Try again.' |
| PRINT *,'Please enter the number of Bytes per ', |
| $ 'DOUBLE PRECISION number : ' |
| READ (*,FMT=*) realsize |
| END IF |
| PRINT *,'You have manually entered a size of ',realsize, |
| $ ' bytes per DOUBLE PRECISION number' |
| WRITE (*,FMT='(a)') |
| $ '----------------------------------------------' |
| END |
| |
| SUBROUTINE confuse(q,r) |
| * IMPLICIT NONE |
| C .. Scalar Arguments .. |
| DOUBLE PRECISION q,r |
| C .. |
| C .. Intrinsic Functions .. |
| INTRINSIC cos |
| C .. |
| r = cos(q) |
| RETURN |
| END |
| |
| * A semi-portable way to determine the clock granularity |
| * Adapted from a code by John Henning of Digital Equipment Corporation |
| * |
| INTEGER FUNCTION checktick() |
| * IMPLICIT NONE |
| |
| C .. Parameters .. |
| INTEGER n |
| PARAMETER (n=20) |
| C .. |
| C .. Local Scalars .. |
| DOUBLE PRECISION t1,t2 |
| INTEGER i,j,jmin |
| C .. |
| C .. Local Arrays .. |
| DOUBLE PRECISION timesfound(n) |
| C .. |
| C .. External Functions .. |
| DOUBLE PRECISION mysecond |
| EXTERNAL mysecond |
| C .. |
| C .. Intrinsic Functions .. |
| INTRINSIC max,min,nint |
| C .. |
| i = 0 |
| |
| 10 t2 = mysecond() |
| IF (t2.EQ.t1) GO TO 10 |
| |
| t1 = t2 |
| i = i + 1 |
| timesfound(i) = t1 |
| IF (i.LT.n) GO TO 10 |
| |
| jmin = 1000000 |
| DO 20 i = 2,n |
| j = nint((timesfound(i)-timesfound(i-1))*1d6) |
| jmin = min(jmin,max(j,0)) |
| 20 CONTINUE |
| |
| IF (jmin.GT.0) THEN |
| checktick = jmin |
| ELSE |
| PRINT *,'Your clock granularity appears to be less ', |
| $ 'than one microsecond' |
| checktick = 1 |
| END IF |
| RETURN |
| |
| * PRINT 14, timesfound(1)*1d6 |
| * DO 20 i=2,n |
| * PRINT 14, timesfound(i)*1d6, |
| * & nint((timesfound(i)-timesfound(i-1))*1d6) |
| * 14 FORMAT (1X, F18.4, 1X, i8) |
| * 20 CONTINUE |
| |
| END |
| |
| |
| |
| |
| SUBROUTINE checksums(a,b,c,n,ntimes) |
| * IMPLICIT NONE |
| C .. |
| C .. Arguments .. |
| DOUBLE PRECISION a(*),b(*),c(*) |
| INTEGER n,ntimes |
| C .. |
| C .. Local Scalars .. |
| DOUBLE PRECISION aa,bb,cc,scalar,suma,sumb,sumc,epsilon |
| INTEGER k |
| C .. |
| |
| C Repeat the main loop, but with scalars only. |
| C This is done to check the sum & make sure all |
| C iterations have been executed correctly. |
| |
| aa = 2.0D0 |
| bb = 0.5D0 |
| cc = 0.0D0 |
| aa = 0.5D0*aa |
| scalar = 0.5d0*aa |
| DO k = 1,ntimes |
| cc = aa |
| bb = scalar*cc |
| cc = aa + bb |
| aa = bb + scalar*cc |
| END DO |
| aa = aa*DBLE(n-2) |
| bb = bb*DBLE(n-2) |
| cc = cc*DBLE(n-2) |
| |
| C Now sum up the arrays, excluding the first and last |
| C elements, which are modified using the timing results |
| C to confuse aggressive optimizers. |
| |
| suma = 0.0d0 |
| sumb = 0.0d0 |
| sumc = 0.0d0 |
| !$OMP PARALLEL DO REDUCTION(+:suma,sumb,sumc) |
| DO 110 j = 2,n-1 |
| suma = suma + a(j) |
| sumb = sumb + b(j) |
| sumc = sumc + c(j) |
| 110 CONTINUE |
| |
| epsilon = 1.D-6 |
| |
| IF (ABS(suma-aa)/suma .GT. epsilon) THEN |
| PRINT *,'Failed Validation on array a()' |
| PRINT *,'Target Sum of a is = ',aa |
| PRINT *,'Computed Sum of a is = ',suma |
| ELSEIF (ABS(sumb-bb)/sumb .GT. epsilon) THEN |
| PRINT *,'Failed Validation on array b()' |
| PRINT *,'Target Sum of b is = ',bb |
| PRINT *,'Computed Sum of b is = ',sumb |
| ELSEIF (ABS(sumc-cc)/sumc .GT. epsilon) THEN |
| PRINT *,'Failed Validation on array c()' |
| PRINT *,'Target Sum of c is = ',cc |
| PRINT *,'Computed Sum of c is = ',sumc |
| ELSE |
| PRINT *,'Solution Validates!' |
| ENDIF |
| |
| END |
| |