Beispiel #1
0
/*
 * Parse one command-line argument that holds a base-2 logarithm.
 *
 * text  - the argument string to convert
 * label - name of the parameter, used only in the error message
 *
 * Returns the parsed value. Terminates the program with EXIT_FAILURE when
 * the text is not a plain decimal integer in [0, 30]. The upper bound keeps
 * 2^value representable in a 32-bit signed int; the derived sizes are
 * printed with %d in main, so they are treated as int here.
 */
static int parse_log2_arg(const char *text, const char *label) {
  char *end;
  long value = strtol(text, &end, 10);

  // an out-of-range strtol result saturates to LONG_MIN/LONG_MAX, which the
  // range test below rejects as well
  if (end == text || *end != '\0' || value < 0 || value > 30) {
    fprintf(stderr, "Invalid %s '%s': expected an integer in [0, 30].\n", label, text);
    exit(EXIT_FAILURE);
  }
  return (int) value;
}

/*
 * Benchmark driver.
 *
 * Usage: <program> <log2_stanzaLength> <log2_numIterations>
 *
 * Derives stanzaLength and numIterations as powers of two from the two
 * arguments, allocates the working arrays A and B (arrayLength doubles
 * each), and then, for every PAPI event set returned by papi_init(), runs
 * cacheBenchmark() through PAPI_MAKE_MEASUREMENTS and prints the results.
 *
 * Returns 0 on success; exits with EXIT_FAILURE on bad arguments or
 * allocation failure.
 */
int main(int argc, char *argv[]) {
  double results[NUM_TRIALS];
  int i, papi_setnum;

  // validate the command line before touching argv[1] / argv[2]
  if (argc < 3) {
    fprintf(stderr, "Usage: %s <log2_stanzaLength> <log2_numIterations>\n",
            (argc > 0) ? argv[0] : "benchmark");
    exit(EXIT_FAILURE);
  }

  // input parameters
  int log2_stanzaLength = parse_log2_arg(argv[1], "log2_stanzaLength");
  int log2_numIterations = parse_log2_arg(argv[2], "log2_numIterations");

  // initialize papi
  int desired_events[] = {PAPI_TOT_CYC, PAPI_FP_INS, PAPI_L2_DCA, PAPI_L2_DCM, PAPI_L3_DCA, PAPI_L3_DCM, PAPI_TLB_DM, PAPI_LD_INS, PAPI_SR_INS};
  int num_desired = (int) (sizeof(desired_events) / sizeof(desired_events[0]));
  PAPI_event_set_wrapper_t* event_sets;
  int num_sets;
  papi_init(desired_events, num_desired, &event_sets, &num_sets);

  // compute actual values from base 2 logs
  stanzaLength = 1;
  for (i=0; i<log2_stanzaLength; i++) {
    stanzaLength *= 2;
  }

  numIterations = 1;
  for (i=0; i<log2_numIterations; i++) {
    numIterations *= 2;
  }

  int arrayLength = stanzaLength;

  printf("\nstanzaLength = %d\n", stanzaLength);
  printf("arrayLength = %d\n", arrayLength);
  printf("numIterations = %d\n", numIterations);
  printf("num_sets = %d\n\n", num_sets);

  // allocate working arrays
  A = (double *) malloc(arrayLength * sizeof(double));
  B = (double *) malloc(arrayLength * sizeof(double));

  if (A==NULL) {
    printf("Error on array A malloc.\n");
    exit(EXIT_FAILURE);
  }
  if (B==NULL) {
    printf("Error on array B malloc.\n");
    exit(EXIT_FAILURE);
  }

  // initialize arrays
  init_flush_cache_array();
  initArrays();

  // one measurement pass per PAPI event set
  for (papi_setnum=0; papi_setnum < num_sets; papi_setnum++) {
    PAPI_MAKE_MEASUREMENTS(event_sets[papi_setnum].set, cacheBenchmark(), NUM_TRIALS, results);
    print_measurements(&(event_sets[papi_setnum]), results, NUM_TRIALS);
  }

  papi_cleanup(event_sets, num_sets);

  // release the working arrays allocated above
  free(A);
  free(B);

  return 0;
}
Beispiel #2
0
int main(int argc, char** argv)
{
	/* declearation */
	DT *a, b;
	long arr_bytes, arr_size, p;
	int i, samples;
	double t_start, t_end, deltaT;
	/* initialization */
	b = 0.0;
	arr_bytes = 1024 * 1024 * 1024; // 1GB
	arr_size = arr_bytes/sizeof(DT);
	samples = 3;
	/* memory allocation */
#ifndef MIC
	a = (DT *)malloc(arr_bytes);
#else
	a = (DT *)_mm_malloc(arr_bytes, 64);
#endif
	if(a==NULL)
	{
		DB(RT_LVL, "array 'a' allocation failed");
	}
	fill(a, arr_size, 5.0);
#pragma omp parallel
{
	init_flush_cache_array();
}
	
	/* measurement */
	for(i=0; i<samples; i++)
	{
#pragma omp parallel
{
	flush_cache();
}
		t_start = timer();
	#pragma omp parallel for private(p) shared(a, arr_size)
		for(p=0; p<arr_size; p++)
		{
			//b += a[p];
			a[p] = b;
		}
		b = b + 1.0;
		t_end = timer();						
		if(i==(samples-1))
		{
			deltaT = t_end - t_start;
			SAVE_DATA("%lf\t", arr_bytes/deltaT)
			printf("bw: %lf\t", arr_bytes/deltaT);
		}
	}
	b = a[0] + a[arr_size-1];
	save_results(&b, 1); // save results to avoid the aggressive optimizations	
	SAVE_DATA("\n")
	
	/* post-process */
#ifndef MIC
	if(a!=NULL) free(a);
#else
	if(a!=NULL) _mm_free(a);
#endif
	return 0;
}
Beispiel #3
0
/*
 * Driver for the threaded stream benchmark.
 *
 * Sets up the arrays and code variant from argv (via malloc_arrays() and
 * select_code_variant()), prepares the measurement back end selected at
 * compile time (PAPI, cycle counter, or timer; none when DEBUG is defined),
 * then runs pthreads_each() on numThreads threads: ranks 1..numThreads-1 in
 * newly created joinable threads and rank 0 on the calling thread.
 *
 * Returns EXIT_SUCCESS after all threads have been joined and resources
 * released; exits with EXIT_FAILURE on allocation or thread-creation errors.
 */
int main(int argc, char *argv[]) {
  pthread_t *threads;
  pthread_attr_t attr;
  uint32_t **ranks;
  void *status;

#if defined(PAPI_ENABLED) && !defined(DEBUG)
  int num_sets;
  PAPI_event_set_wrapper_t* event_sets;
#endif
  int rc;
  uint32_t t;

  printf("Optimized Stream benchmark (using SSE intrinsics)\n");

  init_flush_cache_array();
  malloc_arrays(argv);
  print_array_parameters();
  select_code_variant(argv);
  print_code_variant_parameters();

  // per-thread bookkeeping: one handle and one heap-allocated rank per thread
  threads = (pthread_t *) malloc(numThreads * sizeof(pthread_t));
  ranks = (uint32_t **) malloc(numThreads * sizeof(uint32_t *));
  if (threads==NULL || ranks==NULL) {
    printf("Error on thread bookkeeping malloc.\n");
    exit(EXIT_FAILURE);
  }

#if !defined(DEBUG)
#if defined(PAPI_ENABLED)
  papi_init(desired_events, num_desired, &event_sets, &num_sets);

  // initialize threaded PAPI
  if (PAPI_thread_init((unsigned long (*)(void)) (pthread_self)) != PAPI_OK) {
    printf("Error with PAPI_thread_init().\n");
    exit(EXIT_FAILURE);
  }

  results = (double *) malloc(num_sets * numThreads * NUM_TRIALS * sizeof(double));
  if (results==NULL) {
    printf("Error on array results malloc.\n");
    exit(EXIT_FAILURE);
  }
#else
  results = (double *) malloc(numThreads * NUM_TRIALS * sizeof(double));
  if (results==NULL) {
    printf("Error on array results malloc.\n");
    exit(EXIT_FAILURE);
  }
#if defined(CYCLE_TIME)
  // calculate clock rate
  GET_CLOCK_RATE(results, NUM_TRIALS);
  median_counts_per_sec = find_median(results, NUM_TRIALS);
  //printf("Median ticks per second = %e\n", median_counts_per_sec);

#else
  timer_init();
  median_counts_per_sec = 1.0;
#endif
#endif
#endif

  pthread_attr_init(&attr);
  pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
  barrier_init(&my_barrier, numThreads);
#if defined(AFFINITY_ENABLED)
  Affinity_Init();
#endif

  // run stream tests
  for (t=0; t < numThreads; t++) {
    ranks[t] = (uint32_t *) malloc(sizeof(uint32_t));
    if (ranks[t]==NULL) {
      printf("Error on thread rank malloc.\n");
      exit(EXIT_FAILURE);
    }
    *ranks[t] = t;
  }

  // ranks 1..numThreads-1 get their own thread; rank 0 runs on this thread
  for (t=1; t < numThreads; t++) {
#if defined(DEBUG)
    printf("Creating thread %u\n", t);
#endif
    rc = pthread_create(&threads[t], &attr, pthreads_each, (void *) ranks[t]);
    if (rc) {
      printf("ERROR; return code from pthread_create() is %d\n", rc);
      exit(EXIT_FAILURE);
    }
  }
  pthreads_each((void *) ranks[0]);

  // join the other threads
  for (t=1; t < numThreads; t++) {
    rc = pthread_join(threads[t], &status);
    if (rc) {
      printf("ERROR; return code from pthread_join() is %d\n", rc);
    }
  }

#if defined(PAPI_ENABLED) && !defined(DEBUG)
  papi_cleanup(event_sets, num_sets);
#endif
  pthread_attr_destroy(&attr);
  // No pthread_exit() here: every worker has been joined, and calling it
  // would end this thread before the cleanup below could run.
  barrier_destroy(&my_barrier);
  free_arrays();

  // release the per-thread bookkeeping allocated in this function
  // NOTE(review): 'results' is not freed here; confirm whether free_arrays()
  // owns it.
  for (t=0; t < numThreads; t++) {
    free(ranks[t]);
  }
  free(ranks);
  free(threads);

  return EXIT_SUCCESS;
}
Beispiel #4
0
/*
 * Driver for the naive 7-point stencil benchmark.
 *
 * Behavior depends on compile-time switches:
 *   - DEBUG:        initializes the grids and prints A and B before and
 *                   after each of the two kernel variants.
 *   - PAPI_ENABLED: measures each variant once per PAPI event set.
 *   - otherwise:    measures each variant with TIMER_MAKE_MEASUREMENTS,
 *                   passing the median of GET_CLOCK_RATE's samples to the
 *                   result printer.
 *
 * The two variants are naive_singly_nested_loop() and
 * naive_triply_nested_loops(). Grid dimensions come from argv via
 * malloc_grids(). Returns EXIT_SUCCESS.
 */
int main(int argc, char *argv[]) {
  double results[NUM_TRIALS];
#if !defined(DEBUG)
#if defined(PAPI_ENABLED)
  int papi_setnum, num_desired, num_sets;
#else
  double median_counts_per_sec;
#endif
#endif
  // NOTE(review): 'i' is not referenced directly in this function; kept in
  // case one of the measurement macros expands to code that uses it.
  int i;

  printf("7-point stencil, no add, naive C code with non-periodic boundary conditions\n");

#if !defined(DEBUG)
#if defined(PAPI_ENABLED)
  // initialize papi
  int desired_events[] = {PAPI_TOT_CYC, PAPI_FP_INS, PAPI_L2_DCA, PAPI_L2_DCM, PAPI_L3_DCM, PAPI_TLB_DM, PAPI_LD_INS, PAPI_SR_INS};
  // derive the count from the array itself so it can never exceed the
  // number of initializers
  num_desired = (int) (sizeof(desired_events) / sizeof(desired_events[0]));
  PAPI_event_set_wrapper_t* event_sets;
  papi_init(desired_events, num_desired, &event_sets, &num_sets);
#else
  // calculate clock rate
  GET_CLOCK_RATE(results, NUM_TRIALS);
  median_counts_per_sec = find_median(results, NUM_TRIALS);
#endif
#endif

  // initialize arrays
  init_flush_cache_array();
  malloc_grids(argv);
  printf("\n");

#if defined(DEBUG)
  // debug build: run each variant once and dump both grids around it
  init_grids();
  printf("SINGLY NESTED LOOP:\n");
  printf("\nGRID A BEFORE:");
  print_grid(A);
  printf("\nGRID B BEFORE:");
  print_grid(B);

  naive_singly_nested_loop();

  printf("\nGRID A AFTER:");
  print_grid(A);
  printf("\nGRID B AFTER:");
  print_grid(B);

  init_grids();
  printf("TRIPLY NESTED LOOPS:\n");
  printf("\nGRID A BEFORE:");
  print_grid(A);
  printf("\nGRID B BEFORE:");
  print_grid(B);

  naive_triply_nested_loops();

  printf("\nGRID A AFTER:");
  print_grid(A);
  printf("\nGRID B AFTER:");
  print_grid(B);
#else
#if defined(PAPI_ENABLED)
  // PAPI build: one measurement pass per event set, for each variant
  printf("SINGLY NESTED LOOP:\n");
  for (papi_setnum=0; papi_setnum < num_sets; papi_setnum++) {
    PAPI_MAKE_MEASUREMENTS(event_sets[papi_setnum].set, naive_singly_nested_loop(), NUM_TRIALS, results);
    print_papi_measurements(&(event_sets[papi_setnum]), results, NUM_TRIALS);
  }
  printf("\n");
  printf("TRIPLY NESTED LOOPS:\n");
  for (papi_setnum=0; papi_setnum < num_sets; papi_setnum++) {
    PAPI_MAKE_MEASUREMENTS(event_sets[papi_setnum].set, naive_triply_nested_loops(), NUM_TRIALS, results);
    print_papi_measurements(&(event_sets[papi_setnum]), results, NUM_TRIALS);
  }
  printf("\n");
  papi_cleanup(event_sets, num_sets);
#else
  // timer build: measure each variant and print using the clock rate
  printf("SINGLY NESTED LOOP:\n");
  TIMER_MAKE_MEASUREMENTS(naive_singly_nested_loop(), results, NUM_TRIALS);
  print_timer_measurements(results, NUM_TRIALS, median_counts_per_sec);
  printf("\n");
  printf("TRIPLY NESTED LOOPS:\n");
  TIMER_MAKE_MEASUREMENTS(naive_triply_nested_loops(), results, NUM_TRIALS);
  print_timer_measurements(results, NUM_TRIALS, median_counts_per_sec);
  printf("\n");
  printf("\n");
#endif
#endif

  // print the value at the grid midpoint of A and B
  printf("\nFinal interior values: A[%lu, %lu, %lu] = %4.2e, B[%lu, %lu, %lu] = %4.2e\n", nx/2, ny/2, nz/2, A[Index3D(nx/2, ny/2, nz/2)], nx/2, ny/2, nz/2, B[Index3D(nx/2, ny/2, nz/2)]);
  fc_checksum();
  free(A);
  free(B);

  return EXIT_SUCCESS;
}