Example #1
0
int main(int argc, char *argv[])
{
    volatile uint64_t workcnt = 0;
    int nthreads;

    debug_printf("bomptest started.\n");

    bench_init();

#if CONFIG_TRACE
    errval_t err = trace_control(TRACE_EVENT(TRACE_SUBSYS_ROUTE,
                                             TRACE_EVENT_ROUTE_BENCH_START, 0),
                                 TRACE_EVENT(TRACE_SUBSYS_ROUTE,
                                             TRACE_EVENT_ROUTE_BENCH_STOP, 0), 0);
    assert(err_is_ok(err));
#endif

    if(argc == 2) {
        nthreads = atoi(argv[1]);
        backend_span_domain(nthreads, STACK_SIZE);
        bomp_custom_init(NULL);
        omp_set_num_threads(nthreads);
    } else {
        assert(!"Specify number of threads");
    }

    trace_event(TRACE_SUBSYS_ROUTE, TRACE_EVENT_ROUTE_BENCH_START, 0);

    uint64_t start = bench_tsc();
#pragma omp parallel
    while(rdtsc() < start + 805000000ULL) {
        workcnt++;
    }
    uint64_t end = bench_tsc();

    trace_event(TRACE_SUBSYS_ROUTE, TRACE_EVENT_ROUTE_BENCH_STOP, 0);

    printf("done. time taken: %" PRIu64 " cycles.\n", end - start);

#if CONFIG_TRACE
        char *buf = malloc(4096*4096);
        trace_dump(buf, 4096*4096, NULL);
        printf("%s\n", buf);
#endif

    for(;;);
    return 0;
}
Example #2
0
int main(int argc, char *argv[])
{
        uint64_t begin, end;
        int i;
        static int a[N];

#ifndef POSIX
	bomp_custom_init();
#endif
        assert(argc == 2);
        omp_set_num_threads(atoi(argv[1]));

        for (i=0;i<N;i++) a[i]= 2*i;

        begin = rdtsc();

#pragma omp parallel for
        for (i=0;i<N;i++) a[i]= 2*i;

        end = rdtsc();

	printf("Value of sum is %d, time taken %lu\n", 0, end - begin);
}
int main(int argc, char *argv[])
{
  int nthreads = omp_get_max_threads();

  if(argc == 2) {
      nthreads = atoi(argv[1]);
      backend_span_domain(nthreads, STACK_SIZE);
      bomp_custom_init(NULL);
      omp_set_num_threads(nthreads);
  }

  printf("threads %d, CPUs %d\n", nthreads, omp_get_num_procs());

  volatile uint64_t exittime[ITERATIONS] = { 0 };

  // Do some work
#pragma omp parallel
  {
#ifdef GANG_SCHEDULING
      bomp_synchronize();
#endif
      for(int i = 0; i < ITERATIONS; i++) {
          uint64_t start = rdtsc();
          uint64_t workcn = 0;

          for(uint64_t n = 0;; n++) {
#pragma omp barrier
              workcn++;

              if(omp_get_thread_num() == 0 && exittime[i] == 0
                 && rdtsc() >= start + PERIOD) {
                  exittime[i] = n + 3;
              }

              if(exittime[i] != 0 && exittime[i] == n) {
                  n++;
                  break;
              }
          }

          /* char buf[64]; */
          /* sprintf(buf, "%d: %lu(%lu)\n", omp_get_thread_num(), workcn, */
          /*         stuck[omp_get_thread_num()]); */
          /* sys_print(buf, strlen(buf)); */
          /* stuck[omp_get_thread_num()] = 0; */
          workcnt[omp_get_thread_num()][i] = workcn;
      }
  }

  char buf[64];
  for(int i = 0; i < ITERATIONS; i++) {
      for(int n = 0; n < nthreads; n++) {
          sprintf(buf, "%lu ", workcnt[n][i]);
          sys_print(buf, strlen(buf));
      }
      sys_print("\n", 1);
  }

      /* sys_print("\n", 1); */

      /* char buf[128], buf1[128]; */
      /* sprintf(buf, "iterations in %lu ticks: ", PERIOD); */
      /* for(int i = 0; i < nthreads; i++) { */
      /*     sprintf(buf1, "%lu ", workcnt[i]); */
      /*     strcat(buf, buf1); */
      /* } */
      /* sprintf(buf1, "\n"); */
      /* strcat(buf, buf1); */
      /* sys_print(buf, strlen(buf)); */
  /* } */

  for(;;);
  return 0;
}
Example #4
0
/*
c   This is the serial version of the APP Benchmark 1,
c   the "embarassingly parallel" benchmark.
c
c   M is the Log_2 of the number of complex pairs of uniform (0, 1) random
c   numbers.  MK is the Log_2 of the size of each batch of uniform random
c   numbers.  MK can be set for convenience on a given system, since it does
c   not affect the results.
*/
int main(int argc, char **argv) {

    double Mops, t1, t2, t3, t4, x1, x2, sx, sy, tm, an, tt, gc;
    double dum[3] = { 1.0, 1.0, 1.0 };
    int np, ierr, node, no_nodes, i, ik, kk, l, k, nit, ierrcode,
	no_large_nodes, np_add, k_offset, j;
    int nthreads = 1;
    boolean verified;
    char size[13+1];	/* character*13 */

/*
c   Because the size of the problem is too large to store in a 32-bit
c   integer for some classes, we put it into a string (for printing).
c   Have to strip off the decimal point put in there by the floating
c   point print statement (internal file)
*/
#ifndef POSIX
#ifndef NOBOMP
    bomp_custom_init();
#endif
#endif
    omp_set_num_threads(1);
    printf("\n\n NAS Parallel Benchmarks 2.3 OpenMP C version"
	   " - EP Benchmark\n");
    sprintf(size, "%12.0f", pow(2.0, M+1));
    for (j = 13; j >= 1; j--) {
	if (size[j] == '.') size[j] = ' ';
    }
    printf(" Number of random numbers generated: %13s\n", size);

    verified = FALSE;

/*
c   Compute the number of "batches" of random number pairs generated 
c   per processor. Adjust if the number of processors does not evenly 
c   divide the total number
*/
    np = NN;

/*
c   Call the random number generator functions and initialize
c   the x-array to reduce the effects of paging on the timings.
c   Also, call all mathematical functions that are used. Make
c   sure these initializations cannot be eliminated as dead code.
*/
    vranlc(0, &(dum[0]), dum[1], &(dum[2]));
    dum[0] = randlc(&(dum[1]), dum[2]);

    for (i = 0; i < 2*NK; i++) 
    {
	x[i] = -1.0e99;
    }        

    printf("Reached here ");
    Mops = log(sqrt(fabs(max(1.0, 1.0))));

    timer_clear(1);
    timer_clear(2);
    timer_clear(3);
    timer_start(1);

    vranlc(0, &t1, A, x);

/*   Compute AN = A ^ (2 * NK) (mod 2^46). */

    t1 = A;

    for ( i = 1; i <= MK+1; i++) {
	t2 = randlc(&t1, t1);
    }

    an = t1;
    tt = S;
    gc = 0.0;
    sx = 0.0;
    sy = 0.0;

    for ( i = 0; i <= NQ - 1; i++) {
	q[i] = 0.0;
    }
      
/*
c   Each instance of this loop may be performed independently. We compute
c   the k offsets separately to take into account the fact that some nodes
c   have more numbers to generate than others
*/
    k_offset = -1;

#pragma omp parallel copyin(x)
{
    double t1, t2, t3, t4, x1, x2;
    int kk, i, ik, l;
    double qq[NQ];		/* private copy of q[0:NQ-1] */

    for (i = 0; i < NQ; i++) qq[i] = 0.0;

#pragma omp for reduction(+:sx,sy) schedule(static)  
    for (k = 1; k <= np; k++) {
	kk = k_offset + k;
	t1 = S;
	t2 = an;

/*      Find starting seed t1 for this kk. */

	for (i = 1; i <= 100; i++) {
            ik = kk / 2;
            if (2 * ik != kk) t3 = randlc(&t1, t2);
            if (ik == 0) break;
            t3 = randlc(&t2, t2);
            kk = ik;
	}

/*      Compute uniform pseudorandom numbers. */

	if (TIMERS_ENABLED == TRUE) timer_start(3);
	vranlc(2*NK, &t1, A, x-1);
	if (TIMERS_ENABLED == TRUE) timer_stop(3);

/*
c       Compute Gaussian deviates by acceptance-rejection method and 
c       tally counts in concentric square annuli.  This loop is not 
c       vectorizable.
*/
	if (TIMERS_ENABLED == TRUE) timer_start(2);

	for ( i = 0; i < NK; i++) {
            x1 = 2.0 * x[2*i] - 1.0;
            x2 = 2.0 * x[2*i+1] - 1.0;
            t1 = pow2(x1) + pow2(x2);
            if (t1 <= 1.0) {
		t2 = sqrt(-2.0 * log(t1) / t1);
		t3 = (x1 * t2);				/* Xi */
		t4 = (x2 * t2);				/* Yi */
		l = max(fabs(t3), fabs(t4));
		qq[l] += 1.0;				/* counts */
		sx = sx + t3;				/* sum of Xi */
		sy = sy + t4;				/* sum of Yi */
            }
	}
	if (TIMERS_ENABLED == TRUE) timer_stop(2);
    }
#pragma omp critical
    {
      for (i = 0; i <= NQ - 1; i++) q[i] += qq[i];
    }
#if defined(_OPENMP)
#pragma omp master
    nthreads = omp_get_num_threads();
#endif /* _OPENMP */    
} /* end of parallel region */    

    for (i = 0; i <= NQ-1; i++) {
        gc = gc + q[i];
    }

    timer_stop(1);
    tm = timer_read(1);

    nit = 0;
    if (M == 24) {
	if((fabs((sx- (-3.247834652034740e3))/sx) <= EPSILON) &&
	   (fabs((sy- (-6.958407078382297e3))/sy) <= EPSILON)) {
	    verified = TRUE;
	}
    } else if (M == 25) {
	if ((fabs((sx- (-2.863319731645753e3))/sx) <= EPSILON) &&
	    (fabs((sy- (-6.320053679109499e3))/sy) <= EPSILON)) {
	    verified = TRUE;
	}
    } else if (M == 28) {
	if ((fabs((sx- (-4.295875165629892e3))/sx) <= EPSILON) &&
	    (fabs((sy- (-1.580732573678431e4))/sy) <= EPSILON)) {
	    verified = TRUE;
	}
    } else if (M == 30) {
	if ((fabs((sx- (4.033815542441498e4))/sx) <= EPSILON) &&
	    (fabs((sy- (-2.660669192809235e4))/sy) <= EPSILON)) {
	    verified = TRUE;
	}
    } else if (M == 32) {
	if ((fabs((sx- (4.764367927995374e4))/sx) <= EPSILON) &&
	    (fabs((sy- (-8.084072988043731e4))/sy) <= EPSILON)) {
	    verified = TRUE;
	}
    }

    Mops = pow(2.0, M+1)/tm/1000000.0;

    printf("EP Benchmark Results: \n"
	   "CPU Time = %10.4f\n"
	   "N = 2^%5d\n"
	   "No. Gaussian Pairs = %15.0f\n"
	   "Sums = %25.15e %25.15e\n"
	   "Counts:\n",
	   tm, M, gc, sx, sy);
    for (i = 0; i  <= NQ-1; i++) {
	printf("%3d %15.0f\n", i, q[i]);
    }
	  
    c_print_results("EP", CLASS, M+1, 0, 0, nit, nthreads,
		  tm, Mops, 	
		  "Random numbers generated",
		  verified, NPBVERSION, COMPILETIME,
		  CS1, CS2, CS3, CS4, CS5, CS6, CS7);

    if (TIMERS_ENABLED == TRUE) {
	printf("Total time:     %f", timer_read(1));
	printf("Gaussian pairs: %f", timer_read(2));
	printf("Random numbers: %f", timer_read(3));
    }
}