tools/bin/src/stream/stream.f - hawq - Git at Google

 *=======================================================================
 * Program: STREAM
 * Programmer: John D. McCalpin
 *-----------------------------------------------------------------------
 * Copyright 1991-2003: John D. McCalpin
 *-----------------------------------------------------------------------
 * License:
 *  1. You are free to use this program and/or to redistribute
 *     this program.
 *  2. You are free to modify this program for your own use,
 *     including commercial use, subject to the publication
 *     restrictions in item 3.
 *  3. You are free to publish results obtained from running this
 *     program, or from works that you derive from this program,
 *     with the following limitations:
 *     3a. In order to be referred to as "STREAM benchmark results",
 *         published results must be in conformance to the STREAM
 *         Run Rules, (briefly reviewed below) published at
 *         http://www.cs.virginia.edu/stream/ref.html
 *         and incorporated herein by reference.
 *         As the copyright holder, John McCalpin retains the
 *         right to determine conformity with the Run Rules.
 *     3b. Results based on modified source code or on runs not in
 *         accordance with the STREAM Run Rules must be clearly
 *         labelled whenever they are published.  Examples of
 *         proper labelling include:
 *         "tuned STREAM benchmark results"
 *         "based on a variant of the STREAM benchmark code"
 *         Other comparable, clear and reasonable labelling is
 *         acceptable.
 *     3c. Submission of results to the STREAM benchmark web site
 *         is encouraged, but not required.
 *  4. Use of this program or creation of derived works based on this
 *     program constitutes acceptance of these licensing restrictions.
 *  5. Absolutely no warranty is expressed or implied.
 *-----------------------------------------------------------------------
 * This program measures sustained memory transfer rates in MB/s for
 * simple computational kernels coded in FORTRAN.
 *
 * The intent is to demonstrate the extent to which ordinary user
 * code can exploit the main memory bandwidth of the system under
 * test.
 *=======================================================================
 * The STREAM web page is at:
 *          http://www.streambench.org
 *
 * Most of the content is currently hosted at:
 *          http://www.cs.virginia.edu/stream/
 *
 * BRIEF INSTRUCTIONS:
 *       0) See http://www.cs.virginia.edu/stream/ref.html for details
 *       1) STREAM requires a timing function called mysecond().
 *          Several examples are provided in this directory.
 *          "CPU" timers are only allowed for uniprocessor runs.
 *          "Wall-clock" timers are required for all multiprocessor runs.
 *       2) The STREAM array sizes must be set to size the test.
 *          The value "N" must be chosen so that each of the three
 *          arrays is at least 4x larger than the sum of all the last-
 *          level caches used in the run, or 1 million elements, which-
 *          ever is larger.
 *          ------------------------------------------------------------
 *          Note that you are free to use any array length and offset
 *          that makes each array 4x larger than the last-level cache.
 *          The intent is to determine the *best* sustainable bandwidth
 *          available with this simple coding.  Of course, lower values
 *          are usually fairly easy to obtain on cached machines, but
 *          by keeping the test to the *best* results, the answers are
 *          easier to interpret.
 *          You may put the arrays in common or not, at your discretion.
 *          There is a commented-out COMMON statement below.
 *          Fortran90 "allocatable" arrays are fine, too.
 *          ------------------------------------------------------------
 *       3) Compile the code with full optimization.  Many compilers
 *          generate unreasonably bad code before the optimizer tightens
 *          things up.  If the results are unreasonably good, on the
 *          other hand, the optimizer might be too smart for me.
 *          Please let me know if this happens.
 *       4) Mail the results to mccalpin@cs.virginia.edu
 *          Be sure to include:
 *               a) computer hardware model number and software revision
 *               b) the compiler flags
 *               c) all of the output from the test case.
 *          Please let me know if you do not want your name posted along
 *          with the submitted results.
 *       5) See the web page for more comments about the run rules and
 *          about interpretation of the results.
 *
 * Thanks,
 *   Dr. Bandwidth
 *=========================================================================
 *
       PROGRAM stream
 *     Driver for the benchmark.  It initialises a, b and c, reports
 *     the timer granularity found by checktick, runs the Copy, Scale,
 *     Add and Triad kernels ntimes times each, and prints for every
 *     kernel the rate derived from its minimum time together with the
 *     average, minimum and maximum times.  Repetition 1 is left out
 *     of those statistics.  checksums is called last to validate the
 *     array contents.
       IMPLICIT NONE
 C     .. Parameters ..
 C     n      : number of elements processed by each kernel
 C     offset : extra elements added to the declared array length
 C     ndim   : declared array length, n+offset
 C     ntimes : number of repetitions of the four kernels
       INTEGER n,offset,ndim,ntimes
       PARAMETER (n=2000000,offset=0,ndim=n+offset,ntimes=10)
 C     ..
 C     .. Local Scalars ..
       DOUBLE PRECISION scalar,t
       INTEGER j,k,nbpw,quantum
 C     ..
 C     .. Local Arrays ..
 C     times(i,k) is the elapsed time of kernel i in repetition k.
 C     bytes(i) is the multiplier applied to n*nbpw to obtain the
 C     amount of data kernel i is credited with moving.
       DOUBLE PRECISION maxtime(4),mintime(4),avgtime(4),
      $                 times(4,ntimes)
       INTEGER bytes(4)
       CHARACTER label(4)*11
 C     ..
 C     .. External Functions ..
       DOUBLE PRECISION mysecond
       INTEGER checktick,realsize
       EXTERNAL mysecond,checktick,realsize
 !$    INTEGER omp_get_num_threads
 !$    EXTERNAL omp_get_num_threads
 C     ..
 C     .. Intrinsic Functions ..
       INTRINSIC dble,int,max,min
 C     ..
 C     .. Arrays in Common ..
       DOUBLE PRECISION a(ndim),b(ndim),c(ndim)
 C     ..
 C     .. Common blocks ..
 *     COMMON a,b,c
 C     ..
 C     .. Data statements ..
       DATA avgtime/4*0.0D0/,mintime/4*1.0D+36/,maxtime/4*0.0D0/
       DATA label/'Copy:      ','Scale:     ','Add:       ',
      $     'Triad:     '/
       DATA bytes/2,2,3,3/
 C     ..

 *       --- SETUP --- determine precision and check timing ---

       nbpw = realsize()

       PRINT *,'----------------------------------------------'
       PRINT *,'STREAM Version $Revision$'
       PRINT *,'----------------------------------------------'
       WRITE (*,FMT=9010) 'Array size = ',n
       WRITE (*,FMT=9010) 'Offset     = ',offset
 C     The size is formed in DOUBLE PRECISION and then truncated, so
 C     that 3*nbpw*n cannot overflow a default INTEGER for large n.
       WRITE (*,FMT=9020) 'The total memory requirement is ',
      $  int(3.0d0*dble(nbpw)*dble(n)/ (1024.0d0*1024.0d0)),' MB'
       WRITE (*,FMT=9030) 'You are running each test ',ntimes,' times'
       WRITE (*,FMT=9030) '--'
       WRITE (*,FMT=9030) 'The *best* time for each test is used'
 C     Only repetition 1 is skipped by the summary loop below.
       WRITE (*,FMT=9030) '*EXCLUDING* the first iteration'

 !$OMP PARALLEL
 !$OMP MASTER
       PRINT *,'----------------------------------------------'
 !$    PRINT *,'Number of Threads = ',OMP_GET_NUM_THREADS()
 !$OMP END MASTER
 !$OMP END PARALLEL

       PRINT *,'----------------------------------------------'
 !$OMP PARALLEL
       PRINT *,'Printing one line per active thread....'
 !$OMP END PARALLEL

 C     Initial contents of the three arrays.
 !$OMP PARALLEL DO
       DO 10 j = 1,n
           a(j) = 2.0d0
           b(j) = 0.5D0
           c(j) = 0.0D0
    10 CONTINUE
 C     Halve a once before the timed runs.  checksums repeats this
 C     step on scalars.  The elapsed time left in t is overwritten in
 C     the main loop without being used.
       t = mysecond()
 !$OMP PARALLEL DO
       DO 20 j = 1,n
           a(j) = 0.5d0*a(j)
    20 CONTINUE
       t = mysecond() - t
       PRINT *,'----------------------------------------------------'
       quantum = checktick()
       WRITE (*,FMT=9000)
      $  'Your clock granularity/precision appears to be ',quantum,
      $  ' microseconds'
       PRINT *,'----------------------------------------------------'

 *       --- MAIN LOOP --- repeat test cases NTIMES times ---
 C     Before and after each kernel the timer value is added to the
 C     first or last element of an array; checksums leaves those two
 C     elements out of its sums.
       scalar = 0.5d0*a(1)
       DO 70 k = 1,ntimes

 C         Kernel 1 (Copy)
           t = mysecond()
           a(1) = a(1) + t
 !$OMP PARALLEL DO
           DO 30 j = 1,n
               c(j) = a(j)
    30     CONTINUE
           t = mysecond() - t
           c(n) = c(n) + t
           times(1,k) = t

 C         Kernel 2 (Scale)
           t = mysecond()
           c(1) = c(1) + t
 !$OMP PARALLEL DO
           DO 40 j = 1,n
               b(j) = scalar*c(j)
    40     CONTINUE
           t = mysecond() - t
           b(n) = b(n) + t
           times(2,k) = t

 C         Kernel 3 (Add)
           t = mysecond()
           a(1) = a(1) + t
 !$OMP PARALLEL DO
           DO 50 j = 1,n
               c(j) = a(j) + b(j)
    50     CONTINUE
           t = mysecond() - t
           c(n) = c(n) + t
           times(3,k) = t

 C         Kernel 4 (Triad)
           t = mysecond()
           b(1) = b(1) + t
 !$OMP PARALLEL DO
           DO 60 j = 1,n
               a(j) = b(j) + scalar*c(j)
    60     CONTINUE
           t = mysecond() - t
           a(n) = a(n) + t
           times(4,k) = t
    70 CONTINUE

 *       --- SUMMARY ---
 C     Repetition 1 is skipped, hence the ntimes-1 divisor below.
       DO 90 k = 2,ntimes
           DO 80 j = 1,4
               avgtime(j) = avgtime(j) + times(j,k)
               mintime(j) = min(mintime(j),times(j,k))
               maxtime(j) = max(maxtime(j),times(j,k))
    80     CONTINUE
    90 CONTINUE
       WRITE (*,FMT=9040)
       DO 100 j = 1,4
           avgtime(j) = avgtime(j)/dble(ntimes-1)
 C         The byte count is formed in DOUBLE PRECISION so that
 C         n*bytes(j)*nbpw cannot overflow a default INTEGER.
           WRITE (*,FMT=9050) label(j),
      $      dble(n)*dble(bytes(j)*nbpw)/mintime(j)/1.0D6,
      $      avgtime(j),mintime(j),maxtime(j)
   100 CONTINUE
       PRINT *,'----------------------------------------------------'
       CALL checksums (a,b,c,n,ntimes)
       PRINT *,'----------------------------------------------------'

  9000 FORMAT (1x,a,i6,a)
  9010 FORMAT (1x,a,i10)
  9020 FORMAT (1x,a,i4,a)
  9030 FORMAT (1x,a,i3,a,a)
  9040 FORMAT ('Function',5x,'Rate (MB/s)  Avg time   Min time  Max time'
      $       )
  9050 FORMAT (a,4 (f10.4,2x))
       END

 *-------------------------------------
 * INTEGER FUNCTION realsize()
 *
 * A semi-portable way to determine the precision of DOUBLE PRECISION
 * in Fortran.
 * Here used to guess how many bytes of storage a DOUBLE PRECISION
 * number occupies.
 *
       INTEGER FUNCTION realsize()
 *     Returns the number of bytes assumed for one DOUBLE PRECISION
 *     value: 4 when at most 8 decimal digits are detected, otherwise
 *     8.  The digit count is the first j in 1..30 for which
 *     1.0d0 + 10.0d0**(-j) compares equal to 1.0d0.  When no such j
 *     is found the size is read from standard input, and the prompt
 *     is repeated until 4 or 8 is entered.
       IMPLICIT NONE

 C     .. Local Scalars ..
       DOUBLE PRECISION result,test
       INTEGER j,ndigits
 C     ..
 C     .. Local Arrays ..
       DOUBLE PRECISION ref(30)
 C     ..
 C     .. External Subroutines ..
       EXTERNAL confuse
 C     ..

 C       Test #1 - compare single(1.0d0+delta) to 1.0d0

       DO 20 j = 1,30
           ref(j) = 1.0d0 + 10.0d0** (-j)
    20 CONTINUE

       DO 30 j = 1,30
           test = ref(j)
           ndigits = j
 C         The value returned in result is not used; the call is
 C         presumably there to keep an optimizer from folding the
 C         comparison below -- TODO confirm.
           CALL confuse(test,result)
           IF (test.EQ.1.0D0) THEN
               GO TO 40
           END IF
    30 CONTINUE
       GO TO 50

 C     Automatic detection succeeded.
    40 WRITE (*,FMT='(a)')
      $  '----------------------------------------------'
       WRITE (*,FMT='(1x,a,i2,a)') 'Double precision appears to have ',
      $  ndigits,' digits of accuracy'
       IF (ndigits.LE.8) THEN
           realsize = 4
       ELSE
           realsize = 8
       END IF
       WRITE (*,FMT='(1x,a,i1,a)') 'Assuming ',realsize,
      $  ' bytes per DOUBLE PRECISION word'
       WRITE (*,FMT='(a)')
      $  '----------------------------------------------'
       RETURN

 C     Automatic detection failed: ask the user.
    50 PRINT *,'Hmmmm.  I am unable to determine the size.'
       PRINT *,'Please enter the number of Bytes per DOUBLE PRECISION',
      $  ' number : '
       READ (*,FMT=*) realsize
 C     Keep asking until the answer is 4 or 8, so that an invalid
 C     size is never returned to the caller.
       DO WHILE (realsize.NE.4 .AND. realsize.NE.8)
           PRINT *,'Your answer ',realsize,' does not make sense.'
           PRINT *,'Try again.'
           PRINT *,'Please enter the number of Bytes per ',
      $      'DOUBLE PRECISION number : '
           READ (*,FMT=*) realsize
       END DO
       PRINT *,'You have manually entered a size of ',realsize,
      $  ' bytes per DOUBLE PRECISION number'
       WRITE (*,FMT='(a)')
      $  '----------------------------------------------'
       END

       SUBROUTINE confuse(q,r)
 *     Stores the cosine of q in r.  q is left unchanged.
 *
 *     Arguments
 *       q  DOUBLE PRECISION, read only
 *       r  DOUBLE PRECISION, receives cos(q)
       DOUBLE PRECISION q,r
 C     .. Local Scalars ..
       DOUBLE PRECISION cosq
 C     .. Intrinsic Functions ..
       INTRINSIC cos
 C     ..
       cosq = cos(q)
       r = cosq
       END

 * A semi-portable way to determine the clock granularity
 * Adapted from a code by John Henning of Digital Equipment Corporation
 *
       INTEGER FUNCTION checktick()
 *     Returns an estimate, in whole microseconds, of the smallest
 *     step between distinct values returned by mysecond.  n distinct
 *     successive timer values are collected and the smallest
 *     difference between neighbours, scaled by 1d6 and rounded with
 *     nint, is the result.  If that value is zero a message is
 *     printed and 1 is returned instead.
       IMPLICIT NONE

 C     .. Parameters ..
 C     n : number of distinct timer readings to collect
       INTEGER n
       PARAMETER (n=20)
 C     ..
 C     .. Local Scalars ..
       DOUBLE PRECISION t1,t2
       INTEGER i,j,jmin
 C     ..
 C     .. Local Arrays ..
       DOUBLE PRECISION timesfound(n)
 C     ..
 C     .. External Functions ..
       DOUBLE PRECISION mysecond
       EXTERNAL mysecond
 C     ..
 C     .. Intrinsic Functions ..
       INTRINSIC max,min,nint
 C     ..
       i = 0
 C     Seed t1 with a real reading so that the first comparison below
 C     does not reference an undefined variable.
       t1 = mysecond()

 C     Poll the timer until its value changes, then record it.
    10 t2 = mysecond()
       IF (t2.EQ.t1) GO TO 10

       t1 = t2
       i = i + 1
       timesfound(i) = t1
       IF (i.LT.n) GO TO 10

 C     Smallest non-negative difference between consecutive readings.
       jmin = 1000000
       DO 20 i = 2,n
           j = nint((timesfound(i)-timesfound(i-1))*1d6)
           jmin = min(jmin,max(j,0))
    20 CONTINUE

       IF (jmin.GT.0) THEN
           checktick = jmin
       ELSE
           PRINT *,'Your clock granularity appears to be less ',
      $      'than one microsecond'
           checktick = 1
       END IF
       RETURN

 C     Disabled debugging output, kept for reference:
 *      PRINT 14, timesfound(1)*1d6
 *      DO 20 i=2,n
 *         PRINT 14, timesfound(i)*1d6,
 *     &       nint((timesfound(i)-timesfound(i-1))*1d6)
 *   14    FORMAT (1X, F18.4, 1X, i8)
 *   20 CONTINUE

       END


       SUBROUTINE checksums(a,b,c,n,ntimes)
 *     Validates the contents of a, b and c.  The kernel sequence is
 *     replayed ntimes times on scalars to get the value each interior
 *     element should hold; that value times n-2 is compared with the
 *     sum of elements 2..n-1 of the matching array.  Every array
 *     whose sum is off by more than a relative 1.D-6 is reported;
 *     'Solution Validates!' is printed only when all three pass.
 *     A sum that is NaN or infinite is reported as a failure.
 *
 *     Arguments (none are modified)
 *       a,b,c   arrays to check; elements 2..n-1 are read
 *       n       number of elements used in each array
 *       ntimes  number of kernel repetitions to replay
       IMPLICIT NONE
 C     ..
 C     .. Arguments ..
       DOUBLE PRECISION a(*),b(*),c(*)
       INTEGER n,ntimes
 C     ..
 C     .. Local Scalars ..
       DOUBLE PRECISION aa,bb,cc,scalar,suma,sumb,sumc,tol
       INTEGER j,k
       LOGICAL ok
 C     ..
 C     .. Intrinsic Functions ..
       INTRINSIC abs,dble
 C     ..

 C     Repeat the main loop, but with scalars only.
 C     This is done to check the sum & make sure all
 C     iterations have been executed correctly.

       aa = 2.0D0
       bb = 0.5D0
       cc = 0.0D0
       aa = 0.5D0*aa
       scalar = 0.5d0*aa
       DO k = 1,ntimes
           cc = aa
           bb = scalar*cc
           cc = aa + bb
           aa = bb + scalar*cc
       END DO
       aa = aa*DBLE(n-2)
       bb = bb*DBLE(n-2)
       cc = cc*DBLE(n-2)

 C     Now sum up the arrays, excluding the first and last
 C     elements, which are modified using the timing results
 C     to confuse aggressive optimizers.

       suma = 0.0d0
       sumb = 0.0d0
       sumc = 0.0d0
 !$OMP PARALLEL DO REDUCTION(+:suma,sumb,sumc)
       DO 110 j = 2,n-1
           suma = suma + a(j)
           sumb = sumb + b(j)
           sumc = sumc + c(j)
   110 CONTINUE

       tol = 1.D-6
       ok = .TRUE.

 C     Each test is written as .NOT.(error.LE.bound) so that a NaN
 C     sum, for which every comparison is false, counts as a failure.
 C     The bound is scaled by the target, which is always finite.
       IF (.NOT. (ABS(suma-aa).LE.tol*ABS(aa))) THEN
           ok = .FALSE.
           PRINT *,'Failed Validation on array a()'
           PRINT *,'Target   Sum of a is = ',aa
           PRINT *,'Computed Sum of a is = ',suma
       END IF
       IF (.NOT. (ABS(sumb-bb).LE.tol*ABS(bb))) THEN
           ok = .FALSE.
           PRINT *,'Failed Validation on array b()'
           PRINT *,'Target   Sum of b is = ',bb
           PRINT *,'Computed Sum of b is = ',sumb
       END IF
       IF (.NOT. (ABS(sumc-cc).LE.tol*ABS(cc))) THEN
           ok = .FALSE.
           PRINT *,'Failed Validation on array c()'
           PRINT *,'Target   Sum of c is = ',cc
           PRINT *,'Computed Sum of c is = ',sumc
       END IF
       IF (ok) THEN
           PRINT *,'Solution Validates!'
       END IF

       END
	*=======================================================================
	* Program: STREAM
	* Programmer: John D. McCalpin
	*-----------------------------------------------------------------------
	* Copyright 1991-2003: John D. McCalpin
	*-----------------------------------------------------------------------
	* License:
	* 1. You are free to use this program and/or to redistribute
	* this program.
	* 2. You are free to modify this program for your own use,
	* including commercial use, subject to the publication
	* restrictions in item 3.
	* 3. You are free to publish results obtained from running this
	* program, or from works that you derive from this program,
	* with the following limitations:
	* 3a. In order to be referred to as "STREAM benchmark results",
	* published results must be in conformance to the STREAM
	* Run Rules, (briefly reviewed below) published at
	* http://www.cs.virginia.edu/stream/ref.html
	* and incorporated herein by reference.
	* As the copyright holder, John McCalpin retains the
	* right to determine conformity with the Run Rules.
	* 3b. Results based on modified source code or on runs not in
	* accordance with the STREAM Run Rules must be clearly
	* labelled whenever they are published. Examples of
	* proper labelling include:
	* "tuned STREAM benchmark results"
	* "based on a variant of the STREAM benchmark code"
	* Other comparable, clear and reasonable labelling is
	* acceptable.
	* 3c. Submission of results to the STREAM benchmark web site
	* is encouraged, but not required.
	* 4. Use of this program or creation of derived works based on this
	* program constitutes acceptance of these licensing restrictions.
	* 5. Absolutely no warranty is expressed or implied.
	*-----------------------------------------------------------------------
	* This program measures sustained memory transfer rates in MB/s for
	* simple computational kernels coded in FORTRAN.
	*
	* The intent is to demonstrate the extent to which ordinary user
	* code can exploit the main memory bandwidth of the system under
	* test.
	*=======================================================================
	* The STREAM web page is at:
	* http://www.streambench.org
	*
	* Most of the content is currently hosted at:
	* http://www.cs.virginia.edu/stream/
	*
	* BRIEF INSTRUCTIONS:
	* 0) See http://www.cs.virginia.edu/stream/ref.html for details
	* 1) STREAM requires a timing function called mysecond().
	* Several examples are provided in this directory.
	* "CPU" timers are only allowed for uniprocessor runs.
	* "Wall-clock" timers are required for all multiprocessor runs.
	* 2) The STREAM array sizes must be set to size the test.
	* The value "N" must be chosen so that each of the three
	* arrays is at least 4x larger than the sum of all the last-
	* level caches used in the run, or 1 million elements, which-
	* ever is larger.
	* ------------------------------------------------------------
	* Note that you are free to use any array length and offset
	* that makes each array 4x larger than the last-level cache.
	* The intent is to determine the best sustainable bandwidth
	* available with this simple coding. Of course, lower values
	* are usually fairly easy to obtain on cached machines, but
	* by keeping the test to the best results, the answers are
	* easier to interpret.
	* You may put the arrays in common or not, at your discretion.
	* There is a commented-out COMMON statement below.
	* Fortran90 "allocatable" arrays are fine, too.
	* ------------------------------------------------------------
	* 3) Compile the code with full optimization. Many compilers
	* generate unreasonably bad code before the optimizer tightens
	* things up. If the results are unreasonably good, on the
	* other hand, the optimizer might be too smart for me.
	* Please let me know if this happens.
	* 4) Mail the results to mccalpin@cs.virginia.edu
	* Be sure to include:
	* a) computer hardware model number and software revision
	* b) the compiler flags
	* c) all of the output from the test case.
	* Please let me know if you do not want your name posted along
	* with the submitted results.
	* 5) See the web page for more comments about the run rules and
	* about interpretation of the results.
	*
	* Thanks,
	* Dr. Bandwidth
	*=========================================================================
	*
	PROGRAM stream
	* Driver for the benchmark. It initialises a, b and c, reports
	* the timer granularity found by checktick, runs the Copy, Scale,
	* Add and Triad kernels ntimes times each, and prints for every
	* kernel the rate derived from its minimum time together with the
	* average, minimum and maximum times. Repetition 1 is left out
	* of those statistics. checksums is called last to validate the
	* array contents.
	IMPLICIT NONE
	C .. Parameters ..
	C n : number of elements processed by each kernel
	C offset : extra elements added to the declared array length
	C ndim : declared array length, n+offset
	C ntimes : number of repetitions of the four kernels
	INTEGER n,offset,ndim,ntimes
	PARAMETER (n=2000000,offset=0,ndim=n+offset,ntimes=10)
	C ..
	C .. Local Scalars ..
	DOUBLE PRECISION scalar,t
	INTEGER j,k,nbpw,quantum
	C ..
	C .. Local Arrays ..
	C times(i,k) is the elapsed time of kernel i in repetition k.
	C bytes(i) is the multiplier applied to n*nbpw to obtain the
	C amount of data kernel i is credited with moving.
	DOUBLE PRECISION maxtime(4),mintime(4),avgtime(4),
	$ times(4,ntimes)
	INTEGER bytes(4)
	CHARACTER label(4)*11
	C ..
	C .. External Functions ..
	DOUBLE PRECISION mysecond
	INTEGER checktick,realsize
	EXTERNAL mysecond,checktick,realsize
	!$ INTEGER omp_get_num_threads
	!$ EXTERNAL omp_get_num_threads
	C ..
	C .. Intrinsic Functions ..
	INTRINSIC dble,int,max,min
	C ..
	C .. Arrays in Common ..
	DOUBLE PRECISION a(ndim),b(ndim),c(ndim)
	C ..
	C .. Common blocks ..
	* COMMON a,b,c
	C ..
	C .. Data statements ..
	DATA avgtime/4*0.0D0/,mintime/4*1.0D+36/,maxtime/4*0.0D0/
	DATA label/'Copy:      ','Scale:     ','Add:       ',
	$ 'Triad:     '/
	DATA bytes/2,2,3,3/
	C ..

	* --- SETUP --- determine precision and check timing ---

	nbpw = realsize()

	PRINT *,'----------------------------------------------'
	PRINT *,'STREAM Version $Revision$'
	PRINT *,'----------------------------------------------'
	WRITE (*,FMT=9010) 'Array size = ',n
	WRITE (*,FMT=9010) 'Offset     = ',offset
	C The size is formed in DOUBLE PRECISION and then truncated, so
	C that 3*nbpw*n cannot overflow a default INTEGER for large n.
	WRITE (*,FMT=9020) 'The total memory requirement is ',
	$ int(3.0d0*dble(nbpw)*dble(n)/ (1024.0d0*1024.0d0)),' MB'
	WRITE (*,FMT=9030) 'You are running each test ',ntimes,' times'
	WRITE (*,FMT=9030) '--'
	WRITE (*,FMT=9030) 'The *best* time for each test is used'
	C Only repetition 1 is skipped by the summary loop below.
	WRITE (*,FMT=9030) '*EXCLUDING* the first iteration'

	!$OMP PARALLEL
	!$OMP MASTER
	PRINT *,'----------------------------------------------'
	!$ PRINT *,'Number of Threads = ',OMP_GET_NUM_THREADS()
	!$OMP END MASTER
	!$OMP END PARALLEL

	PRINT *,'----------------------------------------------'
	!$OMP PARALLEL
	PRINT *,'Printing one line per active thread....'
	!$OMP END PARALLEL

	C Initial contents of the three arrays.
	!$OMP PARALLEL DO
	DO 10 j = 1,n
	    a(j) = 2.0d0
	    b(j) = 0.5D0
	    c(j) = 0.0D0
	10 CONTINUE
	C Halve a once before the timed runs. checksums repeats this
	C step on scalars. The elapsed time left in t is overwritten in
	C the main loop without being used.
	t = mysecond()
	!$OMP PARALLEL DO
	DO 20 j = 1,n
	    a(j) = 0.5d0*a(j)
	20 CONTINUE
	t = mysecond() - t
	PRINT *,'----------------------------------------------------'
	quantum = checktick()
	WRITE (*,FMT=9000)
	$ 'Your clock granularity/precision appears to be ',quantum,
	$ ' microseconds'
	PRINT *,'----------------------------------------------------'

	* --- MAIN LOOP --- repeat test cases NTIMES times ---
	C Before and after each kernel the timer value is added to the
	C first or last element of an array; checksums leaves those two
	C elements out of its sums.
	scalar = 0.5d0*a(1)
	DO 70 k = 1,ntimes

	    C Kernel 1 (Copy)
	    t = mysecond()
	    a(1) = a(1) + t
	!$OMP PARALLEL DO
	    DO 30 j = 1,n
	        c(j) = a(j)
	30  CONTINUE
	    t = mysecond() - t
	    c(n) = c(n) + t
	    times(1,k) = t

	    C Kernel 2 (Scale)
	    t = mysecond()
	    c(1) = c(1) + t
	!$OMP PARALLEL DO
	    DO 40 j = 1,n
	        b(j) = scalar*c(j)
	40  CONTINUE
	    t = mysecond() - t
	    b(n) = b(n) + t
	    times(2,k) = t

	    C Kernel 3 (Add)
	    t = mysecond()
	    a(1) = a(1) + t
	!$OMP PARALLEL DO
	    DO 50 j = 1,n
	        c(j) = a(j) + b(j)
	50  CONTINUE
	    t = mysecond() - t
	    c(n) = c(n) + t
	    times(3,k) = t

	    C Kernel 4 (Triad)
	    t = mysecond()
	    b(1) = b(1) + t
	!$OMP PARALLEL DO
	    DO 60 j = 1,n
	        a(j) = b(j) + scalar*c(j)
	60  CONTINUE
	    t = mysecond() - t
	    a(n) = a(n) + t
	    times(4,k) = t
	70 CONTINUE

	* --- SUMMARY ---
	C Repetition 1 is skipped, hence the ntimes-1 divisor below.
	DO 90 k = 2,ntimes
	    DO 80 j = 1,4
	        avgtime(j) = avgtime(j) + times(j,k)
	        mintime(j) = min(mintime(j),times(j,k))
	        maxtime(j) = max(maxtime(j),times(j,k))
	80  CONTINUE
	90 CONTINUE
	WRITE (*,FMT=9040)
	DO 100 j = 1,4
	    avgtime(j) = avgtime(j)/dble(ntimes-1)
	    C The byte count is formed in DOUBLE PRECISION so that
	    C n*bytes(j)*nbpw cannot overflow a default INTEGER.
	    WRITE (*,FMT=9050) label(j),
	$ dble(n)*dble(bytes(j)*nbpw)/mintime(j)/1.0D6,
	$ avgtime(j),mintime(j),maxtime(j)
	100 CONTINUE
	PRINT *,'----------------------------------------------------'
	CALL checksums (a,b,c,n,ntimes)
	PRINT *,'----------------------------------------------------'

	9000 FORMAT (1x,a,i6,a)
	9010 FORMAT (1x,a,i10)
	9020 FORMAT (1x,a,i4,a)
	9030 FORMAT (1x,a,i3,a,a)
	9040 FORMAT ('Function',5x,'Rate (MB/s)  Avg time   Min time  Max time'
	$ )
	9050 FORMAT (a,4 (f10.4,2x))
	END

	*-------------------------------------
	* INTEGER FUNCTION realsize()
	*
	* A semi-portable way to determine the precision of DOUBLE PRECISION
	* in Fortran.
	* Here used to guess how many bytes of storage a DOUBLE PRECISION
	* number occupies.
	*
	INTEGER FUNCTION realsize()
	* Returns the number of bytes assumed for one DOUBLE PRECISION
	* value: 4 when at most 8 decimal digits are detected, otherwise
	* 8. The digit count is the first j in 1..30 for which
	* 1.0d0 + 10.0d0**(-j) compares equal to 1.0d0. When no such j
	* is found the size is read from standard input, and the prompt
	* is repeated until 4 or 8 is entered.
	IMPLICIT NONE

	C .. Local Scalars ..
	DOUBLE PRECISION result,test
	INTEGER j,ndigits
	C ..
	C .. Local Arrays ..
	DOUBLE PRECISION ref(30)
	C ..
	C .. External Subroutines ..
	EXTERNAL confuse
	C ..

	C Test #1 - compare single(1.0d0+delta) to 1.0d0

	DO 20 j = 1,30
	    ref(j) = 1.0d0 + 10.0d0** (-j)
	20 CONTINUE

	DO 30 j = 1,30
	    test = ref(j)
	    ndigits = j
	    C The value returned in result is not used; the call is
	    C presumably there to keep an optimizer from folding the
	    C comparison below -- TODO confirm.
	    CALL confuse(test,result)
	    IF (test.EQ.1.0D0) THEN
	        GO TO 40
	    END IF
	30 CONTINUE
	GO TO 50

	C Automatic detection succeeded.
	40 WRITE (*,FMT='(a)')
	$ '----------------------------------------------'
	WRITE (*,FMT='(1x,a,i2,a)') 'Double precision appears to have ',
	$ ndigits,' digits of accuracy'
	IF (ndigits.LE.8) THEN
	    realsize = 4
	ELSE
	    realsize = 8
	END IF
	WRITE (*,FMT='(1x,a,i1,a)') 'Assuming ',realsize,
	$ ' bytes per DOUBLE PRECISION word'
	WRITE (*,FMT='(a)')
	$ '----------------------------------------------'
	RETURN

	C Automatic detection failed: ask the user.
	50 PRINT *,'Hmmmm.  I am unable to determine the size.'
	PRINT *,'Please enter the number of Bytes per DOUBLE PRECISION',
	$ ' number : '
	READ (*,FMT=*) realsize
	C Keep asking until the answer is 4 or 8, so that an invalid
	C size is never returned to the caller.
	DO WHILE (realsize.NE.4 .AND. realsize.NE.8)
	    PRINT *,'Your answer ',realsize,' does not make sense.'
	    PRINT *,'Try again.'
	    PRINT *,'Please enter the number of Bytes per ',
	$ 'DOUBLE PRECISION number : '
	    READ (*,FMT=*) realsize
	END DO
	PRINT *,'You have manually entered a size of ',realsize,
	$ ' bytes per DOUBLE PRECISION number'
	WRITE (*,FMT='(a)')
	$ '----------------------------------------------'
	END

	SUBROUTINE confuse(q,r)
	* Stores the cosine of q in r. q is left unchanged.
	*
	* Arguments
	* q DOUBLE PRECISION, read only
	* r DOUBLE PRECISION, receives cos(q)
	DOUBLE PRECISION q,r
	C .. Local Scalars ..
	DOUBLE PRECISION cosq
	C .. Intrinsic Functions ..
	INTRINSIC cos
	C ..
	cosq = cos(q)
	r = cosq
	END

	* A semi-portable way to determine the clock granularity
	* Adapted from a code by John Henning of Digital Equipment Corporation
	*
	INTEGER FUNCTION checktick()
	* Returns an estimate, in whole microseconds, of the smallest
	* step between distinct values returned by mysecond. n distinct
	* successive timer values are collected and the smallest
	* difference between neighbours, scaled by 1d6 and rounded with
	* nint, is the result. If that value is zero a message is
	* printed and 1 is returned instead.
	IMPLICIT NONE

	C .. Parameters ..
	C n : number of distinct timer readings to collect
	INTEGER n
	PARAMETER (n=20)
	C ..
	C .. Local Scalars ..
	DOUBLE PRECISION t1,t2
	INTEGER i,j,jmin
	C ..
	C .. Local Arrays ..
	DOUBLE PRECISION timesfound(n)
	C ..
	C .. External Functions ..
	DOUBLE PRECISION mysecond
	EXTERNAL mysecond
	C ..
	C .. Intrinsic Functions ..
	INTRINSIC max,min,nint
	C ..
	i = 0
	C Seed t1 with a real reading so that the first comparison below
	C does not reference an undefined variable.
	t1 = mysecond()

	C Poll the timer until its value changes, then record it.
	10 t2 = mysecond()
	IF (t2.EQ.t1) GO TO 10

	t1 = t2
	i = i + 1
	timesfound(i) = t1
	IF (i.LT.n) GO TO 10

	C Smallest non-negative difference between consecutive readings.
	jmin = 1000000
	DO 20 i = 2,n
	    j = nint((timesfound(i)-timesfound(i-1))*1d6)
	    jmin = min(jmin,max(j,0))
	20 CONTINUE

	IF (jmin.GT.0) THEN
	    checktick = jmin
	ELSE
	    PRINT *,'Your clock granularity appears to be less ',
	$ 'than one microsecond'
	    checktick = 1
	END IF
	RETURN

	C Disabled debugging output, kept for reference:
	* PRINT 14, timesfound(1)*1d6
	* DO 20 i=2,n
	* PRINT 14, timesfound(i)*1d6,
	* & nint((timesfound(i)-timesfound(i-1))*1d6)
	* 14 FORMAT (1X, F18.4, 1X, i8)
	* 20 CONTINUE

	END




	SUBROUTINE checksums(a,b,c,n,ntimes)
	* Validates the contents of a, b and c. The kernel sequence is
	* replayed ntimes times on scalars to get the value each interior
	* element should hold; that value times n-2 is compared with the
	* sum of elements 2..n-1 of the matching array. Every array
	* whose sum is off by more than a relative 1.D-6 is reported;
	* 'Solution Validates!' is printed only when all three pass.
	* A sum that is NaN or infinite is reported as a failure.
	*
	* Arguments (none are modified)
	* a,b,c arrays to check; elements 2..n-1 are read
	* n number of elements used in each array
	* ntimes number of kernel repetitions to replay
	IMPLICIT NONE
	C ..
	C .. Arguments ..
	DOUBLE PRECISION a(*),b(*),c(*)
	INTEGER n,ntimes
	C ..
	C .. Local Scalars ..
	DOUBLE PRECISION aa,bb,cc,scalar,suma,sumb,sumc,tol
	INTEGER j,k
	LOGICAL ok
	C ..
	C .. Intrinsic Functions ..
	INTRINSIC abs,dble
	C ..

	C Repeat the main loop, but with scalars only.
	C This is done to check the sum & make sure all
	C iterations have been executed correctly.

	aa = 2.0D0
	bb = 0.5D0
	cc = 0.0D0
	aa = 0.5D0*aa
	scalar = 0.5d0*aa
	DO k = 1,ntimes
	    cc = aa
	    bb = scalar*cc
	    cc = aa + bb
	    aa = bb + scalar*cc
	END DO
	aa = aa*DBLE(n-2)
	bb = bb*DBLE(n-2)
	cc = cc*DBLE(n-2)

	C Now sum up the arrays, excluding the first and last
	C elements, which are modified using the timing results
	C to confuse aggressive optimizers.

	suma = 0.0d0
	sumb = 0.0d0
	sumc = 0.0d0
	!$OMP PARALLEL DO REDUCTION(+:suma,sumb,sumc)
	DO 110 j = 2,n-1
	    suma = suma + a(j)
	    sumb = sumb + b(j)
	    sumc = sumc + c(j)
	110 CONTINUE

	tol = 1.D-6
	ok = .TRUE.

	C Each test is written as .NOT.(error.LE.bound) so that a NaN
	C sum, for which every comparison is false, counts as a failure.
	C The bound is scaled by the target, which is always finite.
	IF (.NOT. (ABS(suma-aa).LE.tol*ABS(aa))) THEN
	    ok = .FALSE.
	    PRINT *,'Failed Validation on array a()'
	    PRINT *,'Target   Sum of a is = ',aa
	    PRINT *,'Computed Sum of a is = ',suma
	END IF
	IF (.NOT. (ABS(sumb-bb).LE.tol*ABS(bb))) THEN
	    ok = .FALSE.
	    PRINT *,'Failed Validation on array b()'
	    PRINT *,'Target   Sum of b is = ',bb
	    PRINT *,'Computed Sum of b is = ',sumb
	END IF
	IF (.NOT. (ABS(sumc-cc).LE.tol*ABS(cc))) THEN
	    ok = .FALSE.
	    PRINT *,'Failed Validation on array c()'
	    PRINT *,'Target   Sum of c is = ',cc
	    PRINT *,'Computed Sum of c is = ',sumc
	END IF
	IF (ok) THEN
	    PRINT *,'Solution Validates!'
	END IF

	END