/*
 * Run the four STREAM kernels (Copy, Scale, Add, Triad) on the file-scope
 * vectors a, b and c and report the sustained rate of each one.
 *
 * params    run parameters; nowASCII is printed and, under OpenMP,
 *           StreamThreads is written.
 * doIO      non-zero to print the report (currently always to stdout).
 * copyGBs, scaleGBs, addGBs, triadGBs
 *           outputs: rate of each kernel, computed from the minimum time as
 *           1e-9 * bytes[j] * VectorSize / mintime[j]; the value is negative
 *           when mintime[j] is not positive.
 * failure   forwarded unchanged to checkSTREAMresults().
 *
 * Returns 1 when the output stream or the vectors cannot be set up,
 * 0 otherwise.
 *
 * NOTE: avgtime[], mintime[] and maxtime[] are defined outside this function
 * and are accumulated here without being reset.
 */
int HPCC_Stream(HPCC_Params *params, int doIO, double *copyGBs, double *scaleGBs, double *addGBs,
                double *triadGBs, int *failure) {
  int quantum;
  int BytesPerWord;
  register int j, k;
  double scalar, t, times[4][NTIMES];
  /* Initialised so that a well-defined value reaches checkSTREAMresults()
   * even when doIO is zero. */
  FILE *outFile = NULL;
  double GiBs = 1073741824.0, curGBs;

  if (doIO) {
    // outFile = fopen( params->outFname, "w+" );
    outFile = stdout;
    if (! outFile) {
      outFile = stderr;
      fprintf( outFile, "Cannot open output file.\n" );
      return 1;
    }
  }

  // VectorSize = HPCC_LocalVectorSize( params, 3, sizeof(double), 0 ); /* Need 3 vectors */
  // HARDCODED VectorSize
  // params->StreamVectorSize = VectorSize;

  a = HPCC_XMALLOC( double, VectorSize );
  b = HPCC_XMALLOC( double, VectorSize );
  c = HPCC_XMALLOC( double, VectorSize );

  if (!a || !b || !c) {
    if (c) HPCC_free(c);
    if (b) HPCC_free(b);
    if (a) HPCC_free(a);
    if (doIO) {
      fprintf( outFile, "Failed to allocate memory (%lu).\n", VectorSize );
      fflush( outFile );
      /* Never close the process-wide standard streams: later output from
       * the rest of the program would be lost. */
      if (outFile != stdout && outFile != stderr) fclose( outFile );
    }
    return 1;
  }

  /* --- SETUP --- determine precision and check timing --- */

  if (doIO) {
    fprintf (outFile, "Generated on %s\n", params->nowASCII);
    fprintf( outFile, HLINE);
    BytesPerWord = sizeof(double);
    fprintf( outFile, "This system uses %d bytes per DOUBLE PRECISION word.\n",
             BytesPerWord);

    fprintf( outFile, HLINE);
    fprintf( outFile, "Array size = %lu, Offset = %d\n" , VectorSize, OFFSET);
    fprintf( outFile, "Total memory required = %.4f GiB.\n",
             (3.0 * BytesPerWord) * ( (double) VectorSize / GiBs));
    fprintf( outFile, "Each test is run %d times, but only\n", NTIMES);
    fprintf( outFile, "the *best* time for each is used.\n");
    fflush ( outFile);
  }

#ifdef _OPENMP
  if (doIO) fprintf( outFile, HLINE);
#pragma omp parallel private(k)
  {
#pragma omp single nowait
    {
      k = omp_get_num_threads();
      if (doIO) fprintf( outFile, "Number of Threads requested = %i\n",k);
      params->StreamThreads = k;
    }
  }
#endif

  /* Get initial value for system clock. */
#ifdef _OPENMP
#pragma omp parallel for
#endif
  for (j=0; j<VectorSize; j++) {
    a[j] = 1.0;
    b[j] = 2.0;
    c[j] = 0.0;
  }

  if (doIO) fprintf( outFile, HLINE);

  if ( (quantum = checktick()) >= 1) {
    if (doIO) fprintf( outFile, "Your clock granularity/precision appears to be "
                       "%d microseconds.\n", quantum);
  } else {
    if (doIO) fprintf( outFile, "Your clock granularity appears to be "
                       "less than one microsecond.\n");
    /* Clamp so the tick estimate below never divides by zero. */
    quantum = 1;
  }

  /* Time one pass over a[] to estimate how long each kernel will take. */
  t = mysecond();
#ifdef _OPENMP
#pragma omp parallel for
#endif
  for (j = 0; j < VectorSize; j++)
    a[j] = 2.0E0 * a[j];
  t = 1.0E6 * (mysecond() - t);

  if (doIO) {
    fprintf( outFile, "Each test below will take on the order"
             " of %d microseconds.\n", (int) t );
    fprintf( outFile, " (= %d clock ticks)\n", (int) (t/quantum) );
    fprintf( outFile, "Increase the size of the arrays if this shows that\n");
    fprintf( outFile, "you are not getting at least 20 clock ticks per test.\n");

    fprintf( outFile, HLINE);

    fprintf( outFile, "WARNING -- The above is only a rough guideline.\n");
    fprintf( outFile, "For best results, please be sure you know the\n");
    fprintf( outFile, "precision of your system timer.\n");
    fprintf( outFile, HLINE);
  }

  /* --- MAIN LOOP --- repeat test cases NTIMES times --- */

  scalar = 3.0;
  for (k=0; k<NTIMES; k++) {
    /* Copy: c = a */
    times[0][k] = mysecond();
#ifdef TUNED
    tuned_STREAM_Copy();
#else
#ifdef _OPENMP
#pragma omp parallel for
#endif
    for (j=0; j<VectorSize; j++)
      c[j] = a[j];
#endif
    times[0][k] = mysecond() - times[0][k];

    /* Scale: b = scalar * c */
    times[1][k] = mysecond();
#ifdef TUNED
    tuned_STREAM_Scale(scalar);
#else
#ifdef _OPENMP
#pragma omp parallel for
#endif
    for (j=0; j<VectorSize; j++)
      b[j] = scalar*c[j];
#endif
    times[1][k] = mysecond() - times[1][k];

    /* Add: c = a + b */
    times[2][k] = mysecond();
#ifdef TUNED
    tuned_STREAM_Add();
#else
#ifdef _OPENMP
#pragma omp parallel for
#endif
    for (j=0; j<VectorSize; j++)
      c[j] = a[j]+b[j];
#endif
    times[2][k] = mysecond() - times[2][k];

    /* Triad: a = b + scalar * c */
    times[3][k] = mysecond();
#ifdef TUNED
    tuned_STREAM_Triad(scalar);
#else
#ifdef _OPENMP
#pragma omp parallel for
#endif
    for (j=0; j<VectorSize; j++)
      a[j] = b[j]+scalar*c[j];
#endif
    times[3][k] = mysecond() - times[3][k];
  }

  /* --- SUMMARY --- */

  for (k=1; k<NTIMES; k++) /* note -- skip first iteration */
  {
    for (j=0; j<4; j++) {
      avgtime[j] = avgtime[j] + times[j][k];
      mintime[j] = Mmin(mintime[j], times[j][k]);
      maxtime[j] = Mmax(maxtime[j], times[j][k]);
    }
  }

  if (doIO)
    fprintf( outFile, "Function Rate (GB/s) Avg time Min time Max time\n");
  for (j=0; j<4; j++) {
    avgtime[j] /= (double)(NTIMES - 1); /* note -- skip first iteration */

    /* make sure no division by zero */
    curGBs = (mintime[j] > 0.0 ? 1.0 / mintime[j] : -1.0);
    curGBs *= 1e-9 * bytes[j] * VectorSize;
    if (doIO)
      fprintf( outFile, "%s%11.4f %11.4f %11.4f %11.4f\n", label[j],
               curGBs,
               avgtime[j],
               mintime[j],
               maxtime[j]);
    switch (j) {
      case 0: *copyGBs = curGBs; break;
      case 1: *scaleGBs = curGBs; break;
      case 2: *addGBs = curGBs; break;
      case 3: *triadGBs = curGBs; break;
    }
  }
  if (doIO) fprintf( outFile, HLINE);

  /* --- Check Results --- */
  checkSTREAMresults( outFile, doIO, failure );
  if (doIO) fprintf( outFile, HLINE);

  HPCC_free(c);
  HPCC_free(b);
  HPCC_free(a);

  if (doIO) {
    fflush( outFile );
    /* outFile is stdout today; only close a stream this function owns. */
    if (outFile != stdout && outFile != stderr) fclose( outFile );
  }

  return 0;
}
/*
 * SHMEM RandomAccess (GUPS) driver.
 *
 * Sizes a power-of-two global table from params->HPLMaxProcMem (overridden
 * here to 200000) and the number of PEs, runs the timed update kernel, then
 * runs the verification kernel and sums the per-PE error counts. PE 0 prints
 * the report to stdout and records the results in params; SHMEMGUPs is then
 * broadcast from PE 0.
 *
 * Only power-of-two PE counts are supported: otherwise PE 0 prints a message
 * and every PE returns before anything is allocated.
 *
 * Always returns 0. A failed run is signalled through params:
 * SHMEMGUPs stays at -1 when the benchmark did not run, and Failure is set
 * to 1 on PE 0 when more than 1% of the table entries are wrong.
 */
int HPCC_SHMEMRandomAccess(HPCC_Params *params) {
  s64Int i;
  static s64Int NumErrors, GlbNumErrors;

  int NumProcs, logNumProcs, MyProc;
  u64Int GlobalStartMyProc;
  int Remainder;            /* Number of processors with (LocalTableSize + 1) entries */
  u64Int Top;               /* Number of table entries in top of Table */
  s64Int LocalTableSize;    /* Local table width */
  u64Int MinLocalTableSize; /* Integer ratio TableSize/NumProcs */
  u64Int logTableSize, TableSize;

  double RealTime;          /* Real time to update table */

  double TotalMem;
  static int sAbort, rAbort;
  int PowerofTwo;

  u64Int NumUpdates_Default; /* Number of updates to table (suggested: 4x number of table entries) */
  u64Int NumUpdates;         /* actual number of updates to table - may be smaller than
                              * NumUpdates_Default due to execution time bounds */
  s64Int ProcNumUpdates;     /* number of updates per processor */

  /* NOTE(review): llpSync and ipSync are sized with _SHMEM_BCAST_SYNC_SIZE
   * but are also passed as pSync to the sum_to_all reductions below, and the
   * pWrk arrays are sized with _SHMEM_REDUCE_SYNC_SIZE. Confirm these sizes
   * against the OpenSHMEM requirements for reduction pSync/pWrk arrays. */
  static long llpSync[_SHMEM_BCAST_SYNC_SIZE];
  static long long int llpWrk[_SHMEM_REDUCE_SYNC_SIZE];

  static long ipSync[_SHMEM_BCAST_SYNC_SIZE];
  static int ipWrk[_SHMEM_REDUCE_SYNC_SIZE];

  FILE *outFile = NULL;
  double *GUPs;
  double *temp_GUPs;

  for (i = 0; i < _SHMEM_BCAST_SYNC_SIZE; i += 1){
    ipSync[i] = _SHMEM_SYNC_VALUE;
    llpSync[i] = _SHMEM_SYNC_VALUE;
  }

  params->SHMEMGUPs = -1;
  GUPs = &params->SHMEMGUPs;

  NumProcs = shmem_n_pes();
  MyProc = shmem_my_pe();

  /* Only PE 0 reports; its output goes unbuffered to stdout. */
  if (0 == MyProc) {
    outFile = stdout;
    setbuf(outFile, NULL);
  }

  params->HPLMaxProcMem = 200000;

  TotalMem = params->HPLMaxProcMem; /* max single node memory */
  TotalMem *= NumProcs;             /* max memory in NumProcs nodes */
  TotalMem /= sizeof(u64Int);

  /* calculate TableSize --- the size of update array (must be a power of 2) */
  for (TotalMem *= 0.5, logTableSize = 0, TableSize = 1;
       TotalMem >= 1.0;
       TotalMem *= 0.5, logTableSize++, TableSize <<= 1)
    ; /* EMPTY */

  /* determine whether the number of processors is a power of 2 */
  if ( (NumProcs & (NumProcs -1)) == 0) {
    PowerofTwo = HPCC_TRUE;
    Remainder = 0;
    Top = 0;
    /* log2(NumProcs): printed in the report and passed to the update and
     * check kernels, so it must hold a defined value. */
    for (logNumProcs = 0; (1 << logNumProcs) < NumProcs; logNumProcs++)
      ; /* EMPTY */
    MinLocalTableSize = (TableSize / NumProcs);
    LocalTableSize = MinLocalTableSize;
    GlobalStartMyProc = (MinLocalTableSize * MyProc);
  }
  else {
    if(MyProc == 0) {
      printf("Number of processes must be power of 2\n");
    }
    return 0;
  }

  sAbort = 0;
  HPCC_Table = HPCC_XMALLOC( s64Int, LocalTableSize );
  if (! HPCC_Table) sAbort = 1;

  /* Agree across all PEs on whether any allocation failed. */
  shmem_barrier_all();
  shmem_int_sum_to_all(&rAbort, &sAbort, 1, 0, 0, NumProcs, ipWrk, ipSync);
  shmem_barrier_all();

  if (rAbort > 0) {
    if (MyProc == 0) fprintf(outFile, "Failed to allocate memory for the main table.\n");
    /* check all allocations in case there are new added and their order changes */
    if (HPCC_Table) HPCC_free( HPCC_Table );
    goto failed_table;
  }

  params->SHMEMRandomAccess_N = (s64Int)TableSize;

  /* Default number of global updates to table: 4x number of table entries */
  NumUpdates_Default = 4 * TableSize;
  ProcNumUpdates = 4*LocalTableSize;
  NumUpdates = NumUpdates_Default;

  if (MyProc == 0) {
    fprintf( outFile, "Running on %d processors%s\n", NumProcs, PowerofTwo ? " (PowerofTwo)" : "");
    fprintf( outFile, "Total Main table size = 2^" FSTR64 " = " FSTR64 " words\n",
             logTableSize, TableSize );
    if (PowerofTwo)
      fprintf( outFile, "PE Main table size = 2^" FSTR64 " = " FSTR64 " words/PE\n",
               (logTableSize - logNumProcs), TableSize/NumProcs );
    else
      fprintf( outFile, "PE Main table size = (2^" FSTR64 ")/%d = " FSTR64 " words/PE MAX\n",
               logTableSize, NumProcs, LocalTableSize);

    fprintf( outFile, "Default number of updates (RECOMMENDED) = " FSTR64 "\n", NumUpdates_Default);
    params->SHMEMRandomAccess_ExeUpdates = NumUpdates;
  }

  /* Initialize main table */
  for (i=0; i<LocalTableSize; i++)
    HPCC_Table[i] = i + GlobalStartMyProc;

  shmem_barrier_all();

  /* Begin timed section */
  RealTime = -RTSEC();

  Power2NodesRandomAccessUpdate(logTableSize, TableSize, LocalTableSize,
                                MinLocalTableSize, GlobalStartMyProc, Top,
                                logNumProcs, NumProcs, Remainder,
                                MyProc, ProcNumUpdates);

  shmem_barrier_all();

  /* End timed section */
  RealTime += RTSEC();

  /* Print timing results */
  if (MyProc == 0){
    params->SHMEMRandomAccess_time = RealTime;
    *GUPs = 1e-9*NumUpdates / RealTime;
    fprintf( outFile, "Real time used = %.6f seconds\n", RealTime );
    fprintf( outFile, "%.9f Billion(10^9) Updates per second [GUP/s]\n",
             *GUPs );
    fprintf( outFile, "%.9f Billion(10^9) Updates/PE per second [GUP/s]\n",
             *GUPs / NumProcs );
    /* No longer reporting per CPU number */
    /* *GUPs /= NumProcs; */
  }

  /* distribute result to all nodes */
  /* NOTE(review): source and target are the same address inside *params;
   * confirm that params is symmetric memory and that the SHMEM library in
   * use accepts an aliased source/target for broadcast. */
  temp_GUPs = GUPs;
  shmem_barrier_all();
  shmem_broadcast64(GUPs,temp_GUPs,1,0,0,0,NumProcs,llpSync);
  shmem_barrier_all();

  /* Verification phase */

  /* Begin timing here */
  RealTime = -RTSEC();

  HPCC_Power2NodesSHMEMRandomAccessCheck(logTableSize, TableSize, LocalTableSize,
                                         GlobalStartMyProc,
                                         logNumProcs, NumProcs,
                                         MyProc, ProcNumUpdates,
                                         &NumErrors);

  shmem_barrier_all();
  shmem_longlong_sum_to_all( &GlbNumErrors, &NumErrors, 1, 0,0, NumProcs,llpWrk, llpSync);
  shmem_barrier_all();

  /* End timed section */
  RealTime += RTSEC();

  if(MyProc == 0){
    params->SHMEMRandomAccess_CheckTime = RealTime;

    fprintf( outFile, "Verification: Real time used = %.6f seconds\n", RealTime);
    fprintf( outFile, "Found " FSTR64 " errors in " FSTR64 " locations (%s).\n",
             GlbNumErrors, TableSize, (GlbNumErrors <= 0.01*TableSize) ?
             "passed" : "failed");
    /* The run is considered failed when more than 1% of the entries are wrong. */
    if (GlbNumErrors > 0.01*TableSize) params->Failure = 1;
    params->SHMEMRandomAccess_Errors = (s64Int)GlbNumErrors;
    params->SHMEMRandomAccess_ErrorsFraction = (double)GlbNumErrors / (double)TableSize;
    params->SHMEMRandomAccess_Algorithm = 1;
  }
  /* End verification phase */

  /* Deallocate memory (in reverse order of allocation which should help fragmentation) */
  HPCC_free( HPCC_Table );

 failed_table:

  if (0 == MyProc && outFile) {
    /* outFile is stdout on PE 0; closing a standard stream would silence
     * every later report in the process, so only flush it. */
    if (outFile != stdout && outFile != stderr)
      fclose( outFile );
    else
      fflush( outFile );
  }

  shmem_barrier_all();

  return 0;
}