int main(int argc, char *argv[]) { pthread_t *threads; pthread_attr_t attr; uint32_t **ranks; void *status; #if defined(PAPI_ENABLED) && !defined(DEBUG) int num_sets; PAPI_event_set_wrapper_t* event_sets; #endif int rc; uint32_t t; printf("Optimized Stream benchmark (using SSE intrinsics)\n"); init_flush_cache_array(); malloc_arrays(argv); print_array_parameters(); select_code_variant(argv); print_code_variant_parameters(); threads = (pthread_t *) malloc(numThreads * sizeof(pthread_t)); ranks = (uint32_t **) malloc(numThreads * sizeof(uint32_t *)); #if !defined(DEBUG) #if defined(PAPI_ENABLED) papi_init(desired_events, num_desired, &event_sets, &num_sets); // initialize threaded PAPI if (PAPI_thread_init((unsigned long (*)(void)) (pthread_self)) != PAPI_OK) { printf("Error with PAPI_thread_init().\n"); exit(EXIT_FAILURE); } results = (double *) malloc(num_sets * numThreads * NUM_TRIALS * sizeof(double)); if (results==NULL) { printf("Error on array results malloc.\n"); exit(EXIT_FAILURE); } #else results = (double *) malloc(numThreads * NUM_TRIALS * sizeof(double)); if (results==NULL) { printf("Error on array results malloc.\n"); exit(EXIT_FAILURE); } #if defined(CYCLE_TIME) // calculate clock rate GET_CLOCK_RATE(results, NUM_TRIALS); median_counts_per_sec = find_median(results, NUM_TRIALS); //printf("Median ticks per second = %e\n", median_counts_per_sec); #else timer_init(); median_counts_per_sec = 1.0; #endif #endif #endif pthread_attr_init(&attr); pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); barrier_init(&my_barrier, numThreads); #if defined(AFFINITY_ENABLED) Affinity_Init(); #endif // run stream tests for (t=0; t < numThreads; t++) { ranks[t] = (uint32_t *) malloc(sizeof(uint32_t)); *ranks[t] = t; } for (t=1; t < numThreads; t++) { #if defined(DEBUG) printf("Creating thread %u\n", t); #endif rc = pthread_create(&threads[t], &attr, pthreads_each, (void *) ranks[t]); if (rc) { printf("ERROR; return code from pthread_create() is %d\n", rc); exit(EXIT_FAILURE); } } pthreads_each((void *) ranks[0]); // join the other threads for (t=1; t < numThreads; t++) { pthread_join(threads[t], &status); } #if defined(PAPI_ENABLED) && !defined(DEBUG) papi_cleanup(event_sets, num_sets); #endif pthread_attr_destroy(&attr); pthread_exit(NULL); barrier_destroy(&my_barrier); free_arrays(); return EXIT_SUCCESS; }
int main(int argc, char *argv[]) { double results[NUM_TRIALS]; #if !defined(DEBUG) #if defined(PAPI_ENABLED) int papi_setnum, num_desired, num_sets; #else double median_counts_per_sec; #endif #endif int i; printf("7-point stencil, no add, naive C code with non-periodic boundary conditions\n"); #if !defined(DEBUG) #if defined(PAPI_ENABLED) // initialize papi int desired_events[] = {PAPI_TOT_CYC, PAPI_FP_INS, PAPI_L2_DCA, PAPI_L2_DCM, PAPI_L3_DCM, PAPI_TLB_DM, PAPI_LD_INS, PAPI_SR_INS}; num_desired = 9; PAPI_event_set_wrapper_t* event_sets; papi_init(desired_events, num_desired, &event_sets, &num_sets); #else // calculate clock rate GET_CLOCK_RATE(results, NUM_TRIALS); median_counts_per_sec = find_median(results, NUM_TRIALS); #endif #endif // initialize arrays init_flush_cache_array(); malloc_grids(argv); printf("\n"); #if defined(DEBUG) init_grids(); printf("SINGLY NESTED LOOP:\n"); printf("\nGRID A BEFORE:"); print_grid(A); printf("\nGRID B BEFORE:"); print_grid(B); naive_singly_nested_loop(); printf("\nGRID A AFTER:"); print_grid(A); printf("\nGRID B AFTER:"); print_grid(B); init_grids(); printf("TRIPLY NESTED LOOPS:\n"); printf("\nGRID A BEFORE:"); print_grid(A); printf("\nGRID B BEFORE:"); print_grid(B); naive_triply_nested_loops(); printf("\nGRID A AFTER:"); print_grid(A); printf("\nGRID B AFTER:"); print_grid(B); #else #if defined(PAPI_ENABLED) printf("SINGLY NESTED LOOP:\n"); for (papi_setnum=0; papi_setnum < num_sets; papi_setnum++) { PAPI_MAKE_MEASUREMENTS(event_sets[papi_setnum].set, naive_singly_nested_loop(), NUM_TRIALS, results); print_papi_measurements(&(event_sets[papi_setnum]), results, NUM_TRIALS); } printf("\n"); printf("TRIPLY NESTED LOOPS:\n"); for (papi_setnum=0; papi_setnum < num_sets; papi_setnum++) { PAPI_MAKE_MEASUREMENTS(event_sets[papi_setnum].set, naive_triply_nested_loops(), NUM_TRIALS, results); print_papi_measurements(&(event_sets[papi_setnum]), results, NUM_TRIALS); } printf("\n"); papi_cleanup(event_sets, num_sets); #else printf("SINGLY NESTED LOOP:\n"); TIMER_MAKE_MEASUREMENTS(naive_singly_nested_loop(), results, NUM_TRIALS); print_timer_measurements(results, NUM_TRIALS, median_counts_per_sec); printf("\n"); printf("TRIPLY NESTED LOOPS:\n"); TIMER_MAKE_MEASUREMENTS(naive_triply_nested_loops(), results, NUM_TRIALS); print_timer_measurements(results, NUM_TRIALS, median_counts_per_sec); printf("\n"); printf("\n"); #endif #endif printf("\nFinal interior values: A[%lu, %lu, %lu] = %4.2e, B[%lu, %lu, %lu] = %4.2e\n", nx/2, ny/2, nz/2, A[Index3D(nx/2, ny/2, nz/2)], nx/2, ny/2, nz/2, B[Index3D(nx/2, ny/2, nz/2)]); fc_checksum(); free(A); free(B); return EXIT_SUCCESS; }