//////////////////////////////////////////////////////////////////////////// // This function is executed at library load time. // Initilize HBW arena by making a dummy allocation/free at library load // time. Until HBW initialization is complete, we must not call any // allocation routines with HBW as kind. //////////////////////////////////////////////////////////////////////////// void __attribute__ ((constructor)) autohbw_load(void) { // First set the default memory type this library allocates. This can // be overridden by env variable // Note: 'memkind_hbw_preferred' will allow falling back to DDR but // 'memkind_hbw will not' // Note: If HBM is not installed on a system, memkind_hbw_preferred call // woudl fail. Therefore, we need to check for availability first. // int ret = 0; if (memkind_check_available(MEMKIND_HBW) == 0) { ret = memkind_get_kind_by_name("memkind_hbw_preferred", &HBW_Type); } else { printf("WARN: *** No HBM found in system. Will use default (DDR) " "OR user specifid type ***\n"); ret = memkind_get_kind_by_name("memkind_default", &HBW_Type); } assert(!ret && "FATAL: Could not find default memory type\n"); // Read any env variables. This has to be done first because DbgLevel // is set using env variables and debug printing is used below // setEnvValues(); // read any env variables DBG(1) printf("INFO: autohbw.so loaded!\n"); // dummy HBW call to initialize HBW arena // void *pp = memkind_malloc(HBW_Type, 16); // if (pp) { // We have successfully initilized HBW arena // DBG(2) printf("\t-HBW int call succeeded\n"); memkind_free(0, pp); MemkindInitDone = TRUE; // enable HBW allocation } else { errPrn("\t-HBW init call FAILED. Is required memory type present on your system?\n"); assert(0 && "HBW/memkind initialization faild"); } }
int main(int argc, char **argv) { int quantum, checktick(); int BytesPerWord; int k; ssize_t j; STREAM_TYPE scalar; double t, times[4][NTIMES]; #ifdef ENABLE_DYNAMIC_ALLOC int err = 0; memkind_t kind; char err_msg[ERR_MSG_SIZE]; if (argc > 1 && (strncmp("--help", argv[1], strlen("--help")) == 0 || strncmp("-h", argv[1], strlen("-h")) == 0)) { printf("Usage: %s [memkind_default | memkind_hbw | memkind_hbw_hugetlb | \n" " memkind_hbw_preferred | memkind_hbw_preferred_hugetlb | \n" " memkind_hbw_gbtlb | memkind_hbw_preferred_gbtlb | memkind_gbtlb | \n" " memkind_hbw_interleave | memkind_interleave]\n", argv[0]); return 0; } #endif /* --- SETUP --- determine precision and check timing --- */ printf(HLINE); printf("STREAM version $Revision: 5.10 $\n"); #ifdef ENABLE_DYNAMIC_ALLOC printf("Variant that uses the memkind library for dynamic memory allocation.\n"); #endif printf(HLINE); BytesPerWord = sizeof(STREAM_TYPE); printf("This system uses %d bytes per array element.\n", BytesPerWord); printf(HLINE); #ifdef N printf("***** WARNING: ******\n"); printf(" It appears that you set the preprocessor variable N when compiling this code.\n"); printf(" This version of the code uses the preprocesor variable STREAM_ARRAY_SIZE to control the array size\n"); printf(" Reverting to default value of STREAM_ARRAY_SIZE=%llu\n",(unsigned long long) STREAM_ARRAY_SIZE); printf("***** WARNING: ******\n"); #endif printf("Array size = %llu (elements), Offset = %d (elements)\n" , (unsigned long long) STREAM_ARRAY_SIZE, OFFSET); printf("Memory per array = %.1f MiB (= %.1f GiB).\n", BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0), BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0/1024.0)); printf("Total memory required = %.1f MiB (= %.1f GiB).\n", (3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.), (3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024./1024.)); printf("Each kernel will be executed %d times.\n", NTIMES); printf(" The *best* time for each kernel (excluding the first iteration)\n"); printf(" will be used to compute the reported bandwidth.\n"); #ifdef _OPENMP printf(HLINE); #pragma omp parallel { #pragma omp master { k = omp_get_num_threads(); printf ("Number of Threads requested = %i\n",k); } } #endif #ifdef _OPENMP k = 0; #pragma omp parallel #pragma omp atomic k++; printf ("Number of Threads counted = %i\n",k); #endif #ifdef ENABLE_DYNAMIC_ALLOC if (argc > 1) { err = memkind_get_kind_by_name(argv[1], &kind); } else { err = memkind_get_kind_by_name("memkind_default", &kind); } if (err) { memkind_error_message(err, err_msg, ERR_MSG_SIZE); fprintf(stderr, "ERROR: %s\n", err_msg); return -1; } err = memkind_posix_memalign(kind, (void **)&a, 2097152, BytesPerWord * (STREAM_ARRAY_SIZE + OFFSET)); if (err) { fprintf(stderr, "ERROR: Unable to allocate stream array a\n"); return -err; } err = memkind_posix_memalign(kind, (void **)&b, 2097152, BytesPerWord * (STREAM_ARRAY_SIZE + OFFSET)); if (err) { fprintf(stderr, "ERROR: Unable to allocate stream array b\n"); return -err; } err = memkind_posix_memalign(kind, (void **)&c, 2097152, BytesPerWord * (STREAM_ARRAY_SIZE + OFFSET)); if (err) { fprintf(stderr, "ERROR: Unable to allocate stream array c\n"); return -err; } #endif /* Get initial value for system clock. */ #pragma omp parallel for for (j=0; j<STREAM_ARRAY_SIZE; j++) { a[j] = 1.0; b[j] = 2.0; c[j] = 0.0; } printf(HLINE); if ( (quantum = checktick()) >= 1) printf("Your clock granularity/precision appears to be " "%d microseconds.\n", quantum); else { printf("Your clock granularity appears to be " "less than one microsecond.\n"); quantum = 1; } t = mysecond(); #pragma omp parallel for for (j = 0; j < STREAM_ARRAY_SIZE; j++) a[j] = 2.0E0 * a[j]; t = 1.0E6 * (mysecond() - t); printf("Each test below will take on the order" " of %d microseconds.\n", (int) t ); printf(" (= %d clock ticks)\n", (int) (t/quantum) ); printf("Increase the size of the arrays if this shows that\n"); printf("you are not getting at least 20 clock ticks per test.\n"); printf(HLINE); printf("WARNING -- The above is only a rough guideline.\n"); printf("For best results, please be sure you know the\n"); printf("precision of your system timer.\n"); printf(HLINE); /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ scalar = 3.0; for (k=0; k<NTIMES; k++) { times[0][k] = mysecond(); #ifdef TUNED tuned_STREAM_Copy(); #else #pragma omp parallel for for (j=0; j<STREAM_ARRAY_SIZE; j++) c[j] = a[j]; #endif times[0][k] = mysecond() - times[0][k]; times[1][k] = mysecond(); #ifdef TUNED tuned_STREAM_Scale(scalar); #else #pragma omp parallel for for (j=0; j<STREAM_ARRAY_SIZE; j++) b[j] = scalar*c[j]; #endif times[1][k] = mysecond() - times[1][k]; times[2][k] = mysecond(); #ifdef TUNED tuned_STREAM_Add(); #else #pragma omp parallel for for (j=0; j<STREAM_ARRAY_SIZE; j++) c[j] = a[j]+b[j]; #endif times[2][k] = mysecond() - times[2][k]; times[3][k] = mysecond(); #ifdef TUNED tuned_STREAM_Triad(scalar); #else #pragma omp parallel for for (j=0; j<STREAM_ARRAY_SIZE; j++) a[j] = b[j]+scalar*c[j]; #endif times[3][k] = mysecond() - times[3][k]; } /* --- SUMMARY --- */ for (k=1; k<NTIMES; k++) /* note -- skip first iteration */ { for (j=0; j<4; j++) { avgtime[j] = avgtime[j] + times[j][k]; mintime[j] = MIN(mintime[j], times[j][k]); maxtime[j] = MAX(maxtime[j], times[j][k]); } } printf("Function Best Rate MB/s Avg time Min time Max time\n"); for (j=0; j<4; j++) { avgtime[j] = avgtime[j]/(double)(NTIMES-1); printf("%s%12.1f %11.6f %11.6f %11.6f\n", label[j], 1.0E-06 * bytes[j]/mintime[j], avgtime[j], mintime[j], maxtime[j]); } printf(HLINE); /* --- Check Results --- */ checkSTREAMresults(); printf(HLINE); #ifdef ENABLE_DYNAMIC_ALLOC memkind_free(kind, c); memkind_free(kind, b); memkind_free(kind, a); #endif return 0; }