Пример #1
0
/*
 * Measures a matrix multiplication with PAPI's high-level counters.
 *
 * Usage: prog m k n
 *
 * Three float buffers are allocated (m*k, k*n and m*n elements), handed to
 * initialize(), and then multiply() is run between PAPI_start_counters() and
 * PAPI_stop_counters(). The two counter values (PAPI_FP_INS, PAPI_TOT_CYC)
 * are printed on stdout.
 *
 * Exit status: 0 on success, 1 for bad arguments or allocation failure,
 * 10 when PAPI refuses to start or stop the counters.
 */
int main(int argc, char **argv) {
  /* argv[1..3] are dereferenced below, so all three must be present. */
  if (argc < 4) {
    fprintf(stderr, "Usage: %s m k n\n", argc > 0 ? argv[0] : "prog");
    return 1;
  }

  int m = atoi(argv[1]);
  int k = atoi(argv[2]);
  int n = atoi(argv[3]);

  /* atoi() yields 0 for non-numeric text; negative sizes make no sense. */
  if (m <= 0 || k <= 0 || n <= 0) {
    fprintf(stderr, "m, k and n must be positive integers\n");
    return 1;
  }

  /* Widen to size_t before multiplying so large dimensions cannot overflow int. */
  float *A = (float*) malloc((size_t) m * k * sizeof(float));
  float *B = (float*) malloc((size_t) k * n * sizeof(float));
  float *C = (float*) malloc((size_t) m * n * sizeof(float));
  if (A == NULL || B == NULL || C == NULL) {
    fprintf(stderr, "Out of memory\n");
    free(A);
    free(B);
    free(C);
    return 1;
  }

  int Events[] = {PAPI_FP_INS, PAPI_TOT_CYC};
  long_long values[2];
#define NUM_EVENTS 2

  initialize(m, k, n, A, B, C);

  /* Start counting events */
  if (PAPI_start_counters(Events, NUM_EVENTS) != PAPI_OK)
	  exit(10);

  multiply(m, k, n, A, B, C);

  /* Stop counting events */
  if (PAPI_stop_counters(values, NUM_EVENTS) != PAPI_OK)
	  exit(10);

  /* The counters are 64-bit; %lld with an explicit cast is correct on every ABI. */
  printf("Counter values: %lld, %lld\n",
         (long long) values[0], (long long) values[1]);

  free(A);
  free(B);
  free(C);
  return 0;
}
Пример #2
0
/*
 * Copies stdin to stdout while the events listed in names[] are being
 * counted, then prints one "name: value" line per event unless TESTS_QUIET
 * is set.
 *
 * Exit status: 1 on a PAPI library version mismatch or when the counters
 * cannot be started, 2 when an event name cannot be resolved, 0 otherwise.
 */
int main(int argc, char** argv) {
  int Events[NUM_EVENTS]; 
  const char* names[NUM_EVENTS] = {"OPEN_CALLS", "OPEN_FDS", "READ_CALLS", "READ_BYTES", "READ_USEC", "READ_ERR", "READ_INTERRUPTED", "READ_WOULD_BLOCK", "WRITE_CALLS","WRITE_BYTES","WRITE_USEC", "WRITE_WOULD_BLOCK"};
  /* Zero-filled so that a failed PAPI_stop_counters() below leads to zeros
     being printed rather than indeterminate stack contents. */
  long long values[NUM_EVENTS] = {0};

  /* Set TESTS_QUIET variable */
  tests_quiet( argc, argv );

  int version = PAPI_library_init (PAPI_VER_CURRENT);
  if (version != PAPI_VER_CURRENT) {
    fprintf(stderr, "PAPI_library_init version mismatch\n");
    exit(1);
  }

  if (!TESTS_QUIET) fprintf(stderr, "This program will read from stdin and echo it to stdout\n");
  int retval;
  int e;
  /* Translate every event name into the numeric code PAPI expects. */
  for (e=0; e<NUM_EVENTS; e++) {
    retval = PAPI_event_name_to_code((char*)names[e], &Events[e]);
    if (retval != PAPI_OK) {
      fprintf(stderr, "Error getting code for %s\n", names[e]);
      exit(2);
    } 
  }

  /* Start counting events */
  if (PAPI_start_counters(Events, NUM_EVENTS) != PAPI_OK) {
    fprintf(stderr, "Error in PAPI_start_counters\n");
    exit(1);
  }

  int bytes = 0;
  char buf[1024];

  /* The measured workload: echo descriptor 0 to descriptor 1 until EOF or error. */
  while ((bytes = read(0, buf, 1024)) > 0) {
    write(1, buf, bytes);
  }


  /* Stop counting events */
  if (PAPI_stop_counters(values, NUM_EVENTS) != PAPI_OK) {
    fprintf(stderr, "Error in PAPI_stop_counters\n");
  }
 
  if (!TESTS_QUIET) { 
    printf("----\n");
    for (e=0; e<NUM_EVENTS; e++)  
      printf("%s: %lld\n", names[e], values[e]);
  }
  test_pass( __FILE__, NULL, 0 );
  return 0;
}
Пример #3
0
/*
 * Counts PAPI_TOT_INS over NUM_RUNS calls of instructions_million() and
 * prints the average, maximum and minimum of the per-run readings.
 *
 * Each reading is taken with PAPI_read_counters() right after a run and is
 * stored in results[i]. Any PAPI failure is reported on stderr and ends the
 * program with status 1.
 */
int main(int argc, char **argv) {

	int retval;

	retval = PAPI_library_init(PAPI_VER_CURRENT);
	if (retval != PAPI_VER_CURRENT) {
		fprintf(stderr,"Error! PAPI_library_init %d\n", retval);
		/* No later PAPI call can succeed without an initialized library. */
		exit(1);
	}

	retval = PAPI_query_event(PAPI_TOT_INS);
	if (retval != PAPI_OK) {
		fprintf(stderr,"PAPI_TOT_INS not supported\n");
		exit(1);
	}

	int i;
	int events[1],result;
	long long counts[1]={0};

	long long total=0,average,max=0,min=0x7ffffffffffffffULL;

	events[0]=PAPI_TOT_INS;

	retval = PAPI_start_counters(events,1);
	if (retval != PAPI_OK) {
		fprintf(stderr,"Error! PAPI_start_counters %d\n", retval);
		exit(1);
	}

	for(i=0;i<NUM_RUNS;i++) {

		result=instructions_million();

		/* A failed read would otherwise store a stale value as this run's count. */
		retval = PAPI_read_counters(counts,1);
		if (retval != PAPI_OK) {
			fprintf(stderr,"Error! PAPI_read_counters %d\n", retval);
			exit(1);
		}

		results[i]=counts[0];

 	}

	retval = PAPI_stop_counters(counts,1);
	if (retval != PAPI_OK) {
		fprintf(stderr,"Error! PAPI_stop_counters %d\n", retval);
	}

	PAPI_shutdown();

	/* Reduce the per-run readings to sum, maximum and minimum. */
	for(i=0;i<NUM_RUNS;i++) {
		total+=results[i];
		if (results[i]>max) max=results[i];
		if (results[i]<min) min=results[i];
	}

	average=total/NUM_RUNS;
	printf("Average=%lld max=%lld min=%lld\n",average,max,min);

	/* The workload's return value is deliberately unused. */
	(void) result;

	return 0;
}
Пример #4
0
/*
 * Thread entry point: copies the file named by arg to /dev/null while the
 * events listed in the file-scope names[] table are being counted, then
 * prints one line per event unless TESTS_QUIET is set.
 *
 * arg     NUL-terminated path of the file to read (passed as void *).
 * returns always NULL.
 *
 * The whole process exits with status 2 when an event name cannot be
 * resolved and with status 1 when the counters cannot be started.
 */
void *ThreadIO(void *arg) {
  unsigned long tid = (unsigned long)pthread_self();
  if (!TESTS_QUIET) printf("\nThread %#lx: will read %s and write it to /dev/null\n", tid,(const char*) arg);
  int Events[NUM_EVENTS]; 
  long long values[NUM_EVENTS];
  int retval;
  int e;
  /* Translate every event name into the numeric code PAPI expects. */
  for (e=0; e<NUM_EVENTS; e++) {
    retval = PAPI_event_name_to_code((char*)names[e], &Events[e]);
    if (retval != PAPI_OK) {
      fprintf(stderr, "Error getting code for %s\n", names[e]);
      exit(2);
    } 
  }

  /* Start counting events */
  if (PAPI_start_counters(Events, NUM_EVENTS) != PAPI_OK) {
    fprintf(stderr, "Error in PAPI_start_counters\n");
    exit(1);
  }

  int fdin = open((const char*)arg, O_RDONLY);
  if (fdin < 0) perror("Could not open file for reading: \n");

  int bytes = 0;
  char buf[1024];

  int fdout = open("/dev/null", O_WRONLY);
  if (fdout < 0) perror("Could not open /dev/null for writing: \n");
  while ((bytes = read(fdin, buf, 1024)) > 0) {
    write(fdout, buf, bytes);
  }

  /* Release both descriptors; the input side used to be leaked, one
     descriptor per thread. Only descriptors that were actually opened
     are closed. */
  if (fdout >= 0) close(fdout);
  if (fdin >= 0) close(fdin);

  /* Stop counting events */
  if (PAPI_stop_counters(values, NUM_EVENTS) != PAPI_OK) {
    fprintf(stderr, "Error in PAPI_stop_counters\n");
  }

  if (!TESTS_QUIET) {
    for (e=0; e<NUM_EVENTS; e++)  
      printf("Thread %#lx: %s: %lld\n", tid, names[e], values[e]);
  }
  return(NULL);
}
Пример #5
0
void papi_base::stop( ) {

	if( values_.size() == 0 ) { return; }
	if( papi_started_ ) {
		long long v[ counters_.size() ];
		int retval = PAPI_stop_counters( &v[0], counters_.size() );
		if( retval != PAPI_OK ) handle_error( retval );
		for( uint32_t i = 0; i < values_.size(); ++i ) {
			values_[ i ] += v[ i ];
		}
	}
	else {
		for ( auto it = values_.begin(); it != values_.end(); ++it ) {
			*it = -1;
		}
	}
	papi_started_ = false;
}
Пример #6
0
/*
 * Takes a final PAPI_flips() reading, stops the high-level counters and
 * returns the reading packed by R_papi_setret().
 *
 * The values written by PAPI_stop_counters() are discarded; only the
 * PAPI_flips() outputs and its return code are forwarded.
 */
SEXP papi_flips_off()
{
  long_long discarded[NUM_EVENTS];
  long long fp_instructions;
  float real_elapsed, proc_elapsed, rate;

  const int status = PAPI_flips(&real_elapsed, &proc_elapsed, &fp_instructions, &rate);

  PAPI_stop_counters(discarded, NUM_EVENTS);

  return R_papi_setret(status, real_elapsed, proc_elapsed, fp_instructions, "flpins", rate, "mflips");
}
Пример #7
0
//
// This method should be placed at the end of instrumented code.
//
// It stops the PAPI counters and prints one value per event: a formatted
// table on stdout when DBG is defined, otherwise the raw numbers separated
// by spaces on stderr. If the counters cannot be stopped, debug builds abort
// through assert(); builds with NDEBUG report the error code on stderr and
// return without printing any values.
//
void stopPapiCounters(){
#ifdef DBG
    printf("********* STOPING COUNTERS *************\n");
#endif

    long long counters[NUM_EVENTS];
    int i;
    //*******  Stop Counters ******
    // The call is kept outside assert() so that it still happens when
    // NDEBUG removes the assertion.
    const int retval = PAPI_stop_counters(counters, NUM_EVENTS);
    assert(retval >= PAPI_OK);
    if (retval < PAPI_OK) {
        // Only reachable with NDEBUG: counters[] was not filled in.
        fprintf(stderr, "PAPI_stop_counters failed: %d\n", retval);
        return;
    }
    // get the counter information for each event.
    // currently printing on stdout.
    for( i = 0; i < NUM_EVENTS; ++i ) {
        PAPI_event_info_t info;
        PAPI_get_event_info(_G_EVENTS[i], &info);
#ifdef DBG
        printf("%20lld %-15s %s\n", counters[i], info.symbol, info.long_descr);
#else
        fprintf(stderr, "%lld ", counters[i]);
#endif
    }
}
Пример #8
0
/*
 * Test driver for the PAPI_SYC_INS preset.
 *
 * Initializes PAPI, skips when the event is unavailable, starts and
 * immediately stops a one-event counter set, fails when the reading is
 * below 1, and finally reports the test as unimplemented.
 */
int main(int argc, char **argv) {

	char test_string[]="Testing PAPI_SYC_INS predefined event...";
	int event_list[1]={PAPI_SYC_INS};
	long long tally[1];
	int status,silent;

	silent=test_quiet();

	/* Bring up the library; a version mismatch fails the test. */
	status = PAPI_library_init(PAPI_VER_CURRENT);
	if (status != PAPI_VER_CURRENT) {
		if (!silent) {
			printf("Error! PAPI_library_init %d\n",status);
		}
		test_fail(test_string);
	}

	/* The preset may not exist on this machine. */
	status = PAPI_query_event(PAPI_SYC_INS);
	if (status != PAPI_OK) {
		if (!silent) {
			printf("PAPI_SYC_INS not available\n");
		}
		test_skip(test_string);
	}

	/* Start and stop back to back; no workload runs in between. */
	PAPI_start_counters(event_list,1);
	PAPI_stop_counters(tally,1);

	if (!(tally[0]>=1)) {
		if (!silent) {
			printf("Error! Count too low\n");
		}
		test_fail(test_string);
	}

	PAPI_shutdown();

	test_unimplemented(test_string);

	return 0;
}
Пример #9
0
/*
 * JNI bridge for PAPI_stop_counters().
 *
 * valuesarr  Java long[] that receives one value per running counter.
 * returns    PAPI_EINVAL when the array is null or empty, PAPI_ENOMEM when
 *            the JVM cannot provide native access to the array, otherwise
 *            the return code of PAPI_stop_counters().
 *
 * The Java array is only updated when PAPI_stop_counters() reports PAPI_OK.
 */
JNIEXPORT jint JNICALL Java_papi_Wrapper_stopCounters
		(JNIEnv *env, jobject UNUSED_ARG(self), jlongArray valuesarr) {
	if (valuesarr == NULL) {
		return PAPI_EINVAL;
	}

	int values_count = (*env)->GetArrayLength(env, valuesarr);
	if (values_count == 0) {
		return PAPI_EINVAL;
	}

	jlong *valuesj = (*env)->GetLongArrayElements(env, valuesarr, NULL);
	if (valuesj == NULL) {
		/* The JVM could not pin or copy the array; an exception is
		 * pending on the Java side. */
		return PAPI_ENOMEM;
	}
	long long *values = (long long *) valuesj;

	int rc = PAPI_stop_counters(values, values_count);

	/* Mode 0 copies the elements back and frees the native buffer;
	 * JNI_ABORT frees it without copying. JNI_COMMIT would copy back but
	 * keep the buffer allocated, leaking it whenever the JVM handed out a
	 * copy instead of a direct pointer. */
	(*env)->ReleaseLongArrayElements(env, valuesarr, valuesj,
			rc == PAPI_OK ? 0 : JNI_ABORT);

	return rc;
}
Пример #10
0
/*
 * Switches the high-level PAPI counters over to the single event named by
 * metric: stops whatever is running, resolves the name to an event code,
 * starts counting that event and performs one initial read.
 *
 * metric  PAPI event name, passed to PAPI_event_name_to_code().
 *
 * Problems are reported through papi_eprintf().
 */
void
papi_set_events(char *metric)
{
  const size_t n = 1;

  int max;
  long_long *papi_tmp;
  int papi_events[1];
  /* Starts at 0 so the "code == 0" test below never reads an indeterminate
     value when the name lookup fails. */
  int code = 0;

  max = PAPI_num_counters();

  /* A negative result is an error code; without the explicit sign check it
     would be converted to a huge unsigned value and slip past the test. */
  if (max < 0 || n > (size_t) max)
    papi_eprintf("Too many counters requested.\n");

  papi_tmp = malloc(sizeof(*papi_tmp) * n);
  if (papi_tmp == NULL) {
    papi_eprintf("Out of memory %s:%d.\n", __FILE__, __LINE__);
    return;
  }

  /* NOTE(review): PAPI_reset() is documented as taking an EventSet handle,
     yet it receives the counter count here -- confirm this is intended. */
  PAPI_reset(max);

  /* Stop any counters that are still running; the values are discarded. */
  PAPI_stop_counters(papi_tmp, n);

  if (PAPI_event_name_to_code(metric, &code) != PAPI_OK)
    papi_eprintf("Unknown PAPI event %s.\n", metric);

  if (code == 0)
    papi_eprintf("Unknown PAPI event %s.\n", metric);

  papi_events[0] = code;

  PAPI_start_counters(papi_events, n);

  if (PAPI_read_counters(papi_tmp, n) != PAPI_OK)
    papi_eprintf("Problem reading counters %s:%d.\n", __FILE__, __LINE__);

  free(papi_tmp);
}
Пример #11
0
/*
 * Prints the current reading of every counter, stops the counters and
 * releases the file-scope `values` buffer.
 *
 * events      event codes, printed next to their readings.
 * NUM_EVENTS  number of entries in events and in `values`.
 *
 * Exits the process with status 1 when reading or stopping fails.
 */
void my_papi_stop(int *events, int NUM_EVENTS)
{
	int j;
	/* Read the counters */
	if (PAPI_read_counters(values, NUM_EVENTS) != PAPI_OK) {
			fprintf(stderr, "PAPI_read_counters - FAILED\n");
			exit(1);
	}

	for (j=0; j<NUM_EVENTS; j++) 
	{
    printf("GG: %d : %lld\n", events[j], values[j]);
	}
	/* Stop counting events */
	if (PAPI_stop_counters(values, NUM_EVENTS) != PAPI_OK) {
			fprintf(stderr, "PAPI_stoped_counters - FAILED\n");
			exit(1);
	}

	/* free(NULL) is a no-op, so no guard is needed. Clearing the pointer
	   keeps a second call from freeing or reading released memory. */
	free(values);
	values = NULL;
	
}
Пример #12
0
  /**
   * The main host function called from outside, as part of the API for a single node.
   *
   * Allocates the output matrix ff (nqx * nqy * nqz elements, zero-filled) with
   * new[], splits the 4D problem (qx, qy, qz, triangles) into hyperblocks and
   * runs the form-factor kernel once per hyperblock.
   *
   * Only kernel_time is written here (elapsed milliseconds of the hyperblock
   * loop); red_time and mem_time are left untouched by this function.
   *
   * Returns the number of triangles on success, or 0 when shape_def holds no
   * complete triangle or when a buffer allocation fails. ff is not freed by
   * this function on success.
   */
  unsigned int NumericFormFactorC::compute_form_factor(int rank,
//            #ifndef __SSE3__
              real_vec_t &shape_def,
//            #else
//              real_t* shape_def, unsigned int num_triangles,
//            #endif
            complex_t* &ff,
            real_t* &qx, int nqx, real_t* &qy, int nqy, complex_t* &qz, int nqz,
            real_t* &rot,
            real_t& kernel_time, real_t& red_time, real_t& mem_time
            #ifdef FINDBLOCK
              , const int block_x, const int block_y, const int block_z, const int block_t
            #endif
            ) {
    double temp_mem_time = 0.0, total_mem_time = 0.0;
    #ifdef _OPENMP
      if(rank == 0)
        std::cout << "++      Number of OpenMP threads: " << omp_get_max_threads() << std::endl;
    #endif
  
//    #ifndef __SSE3__
      unsigned int num_triangles = shape_def.size() / CPU_T_PROP_SIZE_;
//    #endif
    if(num_triangles < 1) return 0;

//    #ifdef INTEL_SB_AVX
//      unsigned int shape_padding = (32 - (num_triangles & 31)) & 31;
//    #elif defined __SSE3__
//      unsigned int shape_padding = (16 - (num_triangles & 15)) & 15;
//    #endif
  
    //#ifndef FF_NUM_CPU_PADDING
      // widen before multiplying so the product is not computed in int
      unsigned long int total_qpoints = (unsigned long int) nqx * nqy * nqz;
      unsigned long int host_mem_usage = ((unsigned long int) nqx + nqy) * sizeof(real_t) +
                        nqz * sizeof(complex_t);
    //#else
      // padding to 16 bytes
      //const unsigned int PAD_LINE_ = 16;
      //unsigned int pad_x = 0;
      //if(nqx != 1) pad_x = (PAD_LINE_ - (nqx % PAD_LINE_)) % PAD_LINE_;
      //unsigned int pad_y = (PAD_LINE_ - (nqy % PAD_LINE_)) % PAD_LINE_;
      //unsigned int pad_z = (PAD_LINE_ - (nqz % PAD_LINE_)) % PAD_LINE_;
      //unsigned int pnqx = nqx + pad_x, pnqy = nqy + pad_y, pnqz = nqz + pad_z;
      //unsigned long int total_qpoints = pnqx * pnqy * pnqz;
      //unsigned long int host_mem_usage = ((unsigned long int) pnqx + pnqy) * sizeof(real_t) +
      //                  pnqz * sizeof(complex_t);
    //#endif
  
    // allocate memory for the final FF 3D matrix
    ff = new (std::nothrow) complex_t[total_qpoints];
    // nothrow new reports failure through a null pointer, so it has to be
    // checked before the buffer is touched
    if(ff == NULL) {
      std::cerr << "Memory allocation failed for ff. Size = "
            << total_qpoints * sizeof(complex_t) << " b" << std::endl;
      return 0;
    } // if
    memset(ff, 0, total_qpoints * sizeof(complex_t));  // initialize to 0
    host_mem_usage += total_qpoints * sizeof(complex_t);
  
    //unsigned long int matrix_size = (unsigned long int) nqx * nqy * nqz * num_triangles;
    
    // do hyperblocking to use less memory
    unsigned int b_nqx = 0, b_nqy = 0, b_nqz = 0, b_num_triangles = 0;
    #ifndef FF_NUM_CPU_AUTOTUNE_HB
      compute_block_size(nqx, nqy, nqz, num_triangles,
                b_nqx, b_nqy, b_nqz, b_num_triangles
                #ifdef FINDBLOCK
                  , block_x, block_y, block_z, block_t
                #endif
                );
    #else
      // time one hyperblock for every candidate size and keep the size whose
      // projected total (time of one block * number of blocks) is smallest
      std::cout << "-- Autotuning hyperblock size ... " << std::endl;
      double min_time_hb = 1000000.0;
      unsigned int min_b_nqx = 1, min_b_nqy = 1, min_b_nqz = 1, min_b_num_triangles = 1;
      woo::BoostChronoTimer at_kernel_timer, at_overhead_timer;
      at_overhead_timer.start();
      complex_t* ff_temp;
      ff_temp = new (std::nothrow) complex_t[nqx * nqy * nqz];
      for(int b_nqx_i = 1; b_nqx_i <= nqx; ++ b_nqx_i) {
        for(int b_nqy_i = 10; b_nqy_i <= nqy; b_nqy_i += 10) {
          for(int b_nqz_i = 10; b_nqz_i <= nqz; b_nqz_i += 10) {
            for(int b_nt_i = 10; b_nt_i <= num_triangles; b_nt_i += 10) {
              at_kernel_timer.start();

              // compute the number of sub-blocks, along each of the 4 dimensions
              unsigned int nb_x = (unsigned int) ceil((float) nqx / b_nqx_i);
              unsigned int nb_y = (unsigned int) ceil((float) nqy / b_nqy_i);
              unsigned int nb_z = (unsigned int) ceil((float) nqz / b_nqz_i);
              unsigned int nb_t = (unsigned int) ceil((float) num_triangles / b_nt_i);
              unsigned int num_blocks = nb_x * nb_y * nb_z * nb_t;

              form_factor_kernel_fused_nqx1(qx, qy, qz, shape_def,
                  b_nqx_i, b_nqy_i, b_nqz_i, b_nt_i,
                  b_nqx_i, b_nqy_i, b_nqz_i, b_nt_i,
                  nqx, nqy, nqz, num_triangles,
                  0, 0, 0, 0,
                  rot,
                  ff);

              at_kernel_timer.stop();
              double curr_time = at_kernel_timer.elapsed_msec();
              double tot_time = curr_time * num_blocks;
              std::cout << "## " << b_nqx_i << " x " << b_nqy_i << " x " << b_nqz_i
                    << " x " << b_nt_i << "\t" << num_blocks << "\t:\t"
                    << curr_time << "\t" << tot_time << std::endl;
              if(tot_time < min_time_hb) {
                min_time_hb = tot_time;
                min_b_nqx = b_nqx_i; min_b_nqy = b_nqy_i; min_b_nqz = b_nqz_i;
                min_b_num_triangles = b_nt_i;
              } // if
            } // for
          } // for
        } // for
      } // for
      delete[] ff_temp;
      at_overhead_timer.stop();

      b_nqx = min_b_nqx; b_nqy = min_b_nqy; b_nqz = min_b_nqz; b_num_triangles = min_b_num_triangles;
      if(rank == 0) {
        std::cout << "##    HBlock Autotuner overhead: " << at_overhead_timer.elapsed_msec()
              << " ms." << std::endl;
      } // if
    #endif
  
    unsigned long int blocked_3d_matrix_size = (unsigned long int) b_nqx * b_nqy * b_nqz;
    
    //size_t estimated_host_mem_need = host_mem_usage + blocked_matrix_size * sizeof(complex_t);
    //if(rank == 0) {
    //  std::cout << "++    Estimated host memory need: " << (float) estimated_host_mem_need / 1024 / 1024
    //        << " MB" << std::endl;
    //} // if
    #ifndef FF_NUM_CPU_FUSED
      unsigned long int blocked_matrix_size =
                    (unsigned long int) blocked_3d_matrix_size * b_num_triangles;
      host_mem_usage += blocked_matrix_size * sizeof(complex_t);
      complex_t *fq_buffer = new (std::nothrow) complex_t[blocked_matrix_size]();
      if(fq_buffer == NULL) {
        std::cerr << "Memory allocation failed for fq_buffer. blocked_matrix_size = "
              << blocked_matrix_size << std::endl
              << "Host memory usage = " << (float) host_mem_usage / 1024 / 1024 << " MB"
              << std::endl;
        delete[] ff;
        return 0;
      } // if
    #endif
    if(rank == 0) {
      std::cout << "++             Host memory usage: " << (float) host_mem_usage / 1024 / 1024
            << " MB" << std::endl << std::flush;
    } // if

    // compute the number of sub-blocks, along each of the 4 dimensions
    // formulate loops over each dimension, to go over each sub block
    unsigned int nb_x = (unsigned int) ceil((float) nqx / b_nqx);
    unsigned int nb_y = (unsigned int) ceil((float) nqy / b_nqy);
    unsigned int nb_z = (unsigned int) ceil((float) nqz / b_nqz);
    unsigned int nb_t = (unsigned int) ceil((float) num_triangles / b_num_triangles);

    unsigned int curr_b_nqx = b_nqx, curr_b_nqy = b_nqy, curr_b_nqz = b_nqz;
    unsigned int curr_b_num_triangles = b_num_triangles;
    unsigned int num_blocks = nb_x * nb_y * nb_z * nb_t;

    #ifdef TIME_DETAIL_2
      if(rank == 0) {
        std::cout << "++               Hyperblock size: " << b_nqx << " x " << b_nqy
              << " x " << b_nqz << " x " << b_num_triangles << std::endl;
        std::cout << "++  Number of decomposed Hblocks: " << num_blocks
              << " [" << nb_x << " x " << nb_y << " x " << nb_z << " x " << nb_t << "]"
              << std::endl;
      } // if
    #endif // TIME_DETAIL_2

    unsigned int block_num = 0;

    #ifdef PROFILE_PAPI
      long long int papi_total_cycles = 0, papi_total_inst = 0, papi_total_flop = 0;
      double overall_ipc = 0.0;
    #endif

    if(rank == 0) std::cout << "-- Computing form factor on CPU ... " << std::flush;

    woo::BoostChronoTimer kernel_timer;
    kernel_timer.start();

    // compute for each hyperblock
    // (the last block along each dimension is shrunk to the remaining extent)
    curr_b_nqx = b_nqx;
    for(unsigned int ib_x = 0; ib_x < nb_x; ++ ib_x) {
      if(ib_x == nb_x - 1) curr_b_nqx = nqx - b_nqx * ib_x;
      curr_b_nqy = b_nqy;
      for(unsigned int ib_y = 0; ib_y < nb_y; ++ ib_y) {
        if(ib_y == nb_y - 1) curr_b_nqy = nqy - b_nqy * ib_y;
        curr_b_nqz = b_nqz;
        for(unsigned int ib_z = 0; ib_z < nb_z; ++ ib_z) {
          if(ib_z == nb_z - 1) curr_b_nqz = nqz - b_nqz * ib_z;
          curr_b_num_triangles = b_num_triangles;
          for(unsigned int ib_t = 0; ib_t < nb_t; ++ ib_t) {
            if(ib_t == nb_t - 1)
              curr_b_num_triangles = num_triangles - b_num_triangles * ib_t;

            #ifdef PROFILE_PAPI
              // PAPI_L1_DCM  0x80000000  No   Level 1 data cache misses
              // PAPI_L1_ICM  0x80000001  No   Level 1 instruction cache misses
              // PAPI_L2_DCM  0x80000002  No   Level 2 data cache misses
              // PAPI_L2_ICM  0x80000003  No   Level 2 instruction cache misses
              // PAPI_L1_TCM  0x80000006  Yes  Level 1 cache misses
              // PAPI_L2_TCM  0x80000007  No   Level 2 cache misses
              // PAPI_FPU_IDL 0x80000012  No   Cycles floating point units are idle
              // PAPI_TLB_DM  0x80000014  No   Data translation lookaside buffer misses
              // PAPI_TLB_IM  0x80000015  No   Instruction translation lookaside buffer misses
              // PAPI_TLB_TL  0x80000016  Yes  Total translation lookaside buffer misses
              // PAPI_STL_ICY 0x80000025  No   Cycles with no instruction issue
              // PAPI_HW_INT  0x80000029  No   Hardware interrupts
              // PAPI_BR_TKN  0x8000002c  No   Conditional branch instructions taken
              // PAPI_BR_MSP  0x8000002e  No   Conditional branch instructions mispredicted
              // PAPI_TOT_INS 0x80000032  No   Instructions completed
              // PAPI_FP_INS  0x80000034  No   Floating point instructions
              // PAPI_BR_INS  0x80000037  No   Branch instructions
              // PAPI_VEC_INS 0x80000038  No   Vector/SIMD instructions (could include integer)
              // PAPI_RES_STL 0x80000039  No   Cycles stalled on any resource
              // PAPI_TOT_CYC 0x8000003b  No   Total cycles
              // PAPI_L1_DCH  0x8000003e  Yes  Level 1 data cache hits
              // PAPI_L2_DCH  0x8000003f  Yes  Level 2 data cache hits
              // PAPI_L1_DCA  0x80000040  No   Level 1 data cache accesses
              // PAPI_L2_DCA  0x80000041  No   Level 2 data cache accesses
              // PAPI_L1_ICH  0x80000049  Yes  Level 1 instruction cache hits
              // PAPI_L2_ICH  0x8000004a  No   Level 2 instruction cache hits
              // PAPI_L1_ICA  0x8000004c  No   Level 1 instruction cache accesses
              // PAPI_L2_ICA  0x8000004d  No   Level 2 instruction cache accesses
              // PAPI_L1_ICR  0x8000004f  No   Level 1 instruction cache reads
              // PAPI_L1_TCH  0x80000055  Yes  Level 1 total cache hits
              // PAPI_L2_TCH  0x80000056  Yes  Level 2 total cache hits
              // PAPI_L1_TCA  0x80000058  Yes  Level 1 total cache accesses
              // PAPI_L2_TCA  0x80000059  No   Level 2 total cache accesses
              // PAPI_FML_INS 0x80000061  No   Floating point multiply instructions
              // PAPI_FAD_INS 0x80000062  No   Floating point add instructions
              //                               (Also includes subtract instructions)
              // PAPI_FDV_INS 0x80000063  No   Floating point divide instructions
              //                               (Counts both divide and square root instructions)
              // PAPI_FSQ_INS 0x80000064  No   Floating point square root instructions
              //                               (Counts both divide and square root instructions)
              // PAPI_FP_OPS  0x80000066  No   Floating point operations
              // PAPI_SP_OPS  0x80000067  No   Floating point operations; optimized to count
              //                               scaled single precision vector operations
              // PAPI_DP_OPS  0x80000068  No   Floating point operations; optimized to count
              //                               scaled double precision vector operations

              int papi_events[3] = { PAPI_TOT_CYC, PAPI_TOT_INS, PAPI_FP_OPS };
              //int papi_events[3] = { PAPI_FML_INS, PAPI_FAD_INS, PAPI_FDV_INS };
              //int papi_events[3] = { PAPI_FP_OPS, PAPI_SP_OPS, PAPI_DP_OPS };
              long long  papi_counter_values[3];
              PAPI_start_counters(papi_events, 3);
            #endif

            // call the main kernel
            #ifndef FF_NUM_CPU_FUSED // DO NOT USE THIS
              form_factor_kernel(qx, qy, qz, shape_def,
                  curr_b_nqx, curr_b_nqy, curr_b_nqz, curr_b_num_triangles,
                  b_nqx, b_nqy, b_nqz, b_num_triangles,
                  ib_x, ib_y, ib_z, ib_t,
                  fq_buffer);
            #else
              if(nqx == 1) {
                form_factor_kernel_fused_nqx1(qx, qy, qz, shape_def,
                //form_factor_kernel_fused_nqx1_unroll4(qx, qy, qz, shape_def,
                    curr_b_nqx, curr_b_nqy, curr_b_nqz, curr_b_num_triangles,
                    b_nqx, b_nqy, b_nqz, b_num_triangles,
                    nqx, nqy, nqz, num_triangles,
                    ib_x, ib_y, ib_z, ib_t,
                    rot,
                    ff);
              } else {
//                #ifdef __SSE3__
//                  if(rank == 0)
//                    std::cout << "uh-oh: no SSE3 version!" << std::endl;
//                #else
                  form_factor_kernel_fused_unroll4(qx, qy, qz, shape_def,
                    curr_b_nqx, curr_b_nqy, curr_b_nqz, curr_b_num_triangles,
                    b_nqx, b_nqy, b_nqz, b_num_triangles,
                    nqx, nqy, nqz, num_triangles,
                    ib_x, ib_y, ib_z, ib_t,
                    rot,
                    ff);
//                #endif // __SSE3__
              } // if-else
            #endif

            #ifndef FF_NUM_CPU_FUSED // DO NOT USE THIS
              // call the reduction kernel
              reduction_kernel(curr_b_nqx, curr_b_nqy, curr_b_nqz,
                  curr_b_num_triangles, blocked_matrix_size,
                  b_nqx, b_nqy, b_nqz, num_triangles,
                  nqx, nqy, nqz,
                  ib_x, ib_y, ib_z, ib_t,
                  fq_buffer, ff);
            #endif

            #ifdef PROFILE_PAPI
              // accumulate this hyperblock's readings into the running totals
              PAPI_stop_counters(papi_counter_values, 3);
              papi_total_cycles += papi_counter_values[0];
              papi_total_inst += papi_counter_values[1];
              papi_total_flop += papi_counter_values[2];
            #endif
          } // for ib_t
        } // for ib_z
      } // for ib_y
    } // for ib_x

    kernel_timer.stop();
    kernel_time = kernel_timer.elapsed_msec();

    #ifndef FF_NUM_CPU_FUSED
      delete[] fq_buffer;
    #endif

    if(rank == 0) std::cout << "done." << std::endl;

    #ifdef PROFILE_PAPI
      if(rank == 0) {
        std::cout << "++                  PAPI_TOT_CYC: " << papi_total_cycles << std::endl;
        std::cout << "++                  PAPI_TOT_INS: " << papi_total_inst << std::endl;
        std::cout << "++                   PAPI_FP_OPS: " << papi_total_flop << std::endl;
        std::cout << "++                           IPC: "
              << (double) papi_total_inst / papi_total_cycles << std::endl;
      } // if
    #endif

    return num_triangles;
  } // NumericFormFactorC::compute_form_factor()
Пример #13
0
/*
 * Performs a strided read of /etc/group (32 bytes read, then 16 bytes
 * skipped with lseek) and writes what was read to stdout, while the events
 * listed in names[] are being counted. One "name: value" line per event is
 * printed afterwards unless TESTS_QUIET is set.
 *
 * Exit status: 1 on a PAPI library version mismatch or when the counters
 * cannot be started, 2 when an event name cannot be resolved, 0 otherwise.
 */
int main(int argc, char** argv) {
  int Events[NUM_EVENTS]; 
  const char* names[NUM_EVENTS] = {"READ_CALLS", "READ_BYTES", "READ_BLOCK_SIZE", "READ_USEC", "SEEK_CALLS", "SEEK_USEC", "SEEK_ABS_STRIDE_SIZE"};
  /* Zero-filled so that a failed PAPI_stop_counters() below leads to zeros
     being printed rather than indeterminate stack contents. */
  long long values[NUM_EVENTS] = {0};

  /* Points at a string literal, which must not be modified. */
  const char *infile = "/etc/group";

  /* Set TESTS_QUIET variable */
  tests_quiet( argc, argv );

  int version = PAPI_library_init (PAPI_VER_CURRENT);
  if (version != PAPI_VER_CURRENT) {
    fprintf(stderr, "PAPI_library_init version mismatch\n");
    exit(1);
  }

  int fdin;
  if (!TESTS_QUIET) printf("This program will do a strided read %s and write it to stdout\n", infile);
  int retval;
  int e;
  /* Translate every event name into the numeric code PAPI expects. */
  for (e=0; e<NUM_EVENTS; e++) {
    retval = PAPI_event_name_to_code((char*)names[e], &Events[e]);
    if (retval != PAPI_OK) {
      fprintf(stderr, "Error getting code for %s\n", names[e]);
      exit(2);
    } 
  }

  /* Start counting events */
  if (PAPI_start_counters(Events, NUM_EVENTS) != PAPI_OK) {
    fprintf(stderr, "Error in PAPI_start_counters\n");
    exit(1);
  }

  fdin=open(infile, O_RDONLY);
  if (fdin < 0) perror("Could not open file for reading: \n");
  int bytes = 0;
  char buf[1024];

  /* The measured workload: read 32 bytes, echo them, skip the next 16. */
  while ((bytes = read(fdin, buf, 32)) > 0) {
    write(1, buf, bytes);
    lseek(fdin, 16, SEEK_CUR);
  }

  /* Closing the descriptors before doing the PAPI_stop
     means, OPEN_FDS will be reported as zero, which is
     right, since at the time of PAPI_stop, the descriptors
     we opened have been closed */
  close (fdin);

  /* Stop counting events */
  if (PAPI_stop_counters(values, NUM_EVENTS) != PAPI_OK) {
    fprintf(stderr, "Error in PAPI_stop_counters\n");
  }
 
  if (!TESTS_QUIET) { 
    printf("----\n");
    for (e=0; e<NUM_EVENTS; e++)  
      printf("%s: %lld\n", names[e], values[e]);
  }
  test_pass( __FILE__, NULL, 0 );
  return 0;
}
Пример #14
0
/*
 * Test driver for the PAPI_BR_MSP preset.
 *
 * Phase 1 counts PAPI_BR_MSP around branches_testcode() and fails when the
 *         average over num_runs exceeds 1000.
 * Phase 2 counts PAPI_BR_CN around random_branches_testcode() to obtain the
 *         number of conditional branches used in the report.
 * Phase 3 counts PAPI_BR_MSP around random_branches_testcode() and fails
 *         when the average lies outside [1/4, 3/4] of num_random_branches.
 *
 * Every start/stop return code is checked, and PAPI_BR_CN is queried before
 * it is used, so that no average is ever computed from a counter value that
 * PAPI did not actually deliver.
 */
int main(int argc, char **argv) {
   
   int retval,quiet,result;

   int num_runs=100;
   long long high=0,low=0,average=0,expected=1000000;
   int num_random_branches=500000;

   int i;
   int events[1];
   long long counts[1]={0},total=0;

   char test_string[]="Testing PAPI_BR_MSP predefined event...";
   
   quiet=test_quiet();

   retval = PAPI_library_init(PAPI_VER_CURRENT);
   if (retval != PAPI_VER_CURRENT) {
      if (!quiet) printf("ERROR:PAPI_library_init %d\n", retval);
      test_fail(test_string);
   }

   retval = PAPI_query_event(PAPI_BR_MSP);
   if (retval != PAPI_OK) {
      if (!quiet) printf("PAPI_BR_MSP not supported %d\n", retval);
      test_skip(test_string);
   }

   /* Phase 2 depends on this second preset as well. */
   retval = PAPI_query_event(PAPI_BR_CN);
   if (retval != PAPI_OK) {
      if (!quiet) printf("PAPI_BR_CN not supported %d\n", retval);
      test_skip(test_string);
   }

   if (!quiet) {
      printf("\n");   

      printf("Testing a loop with %lld branches (%d times):\n",
          expected,num_runs);
   }

   /* Phase 1: mispredicts of a simple loop. */
   events[0]=PAPI_BR_MSP;
   high=0;
   low=0;

   for(i=0;i<num_runs;i++) {

     retval = PAPI_start_counters(events,1);
     if (retval != PAPI_OK) {
       if (!quiet) printf("ERROR:PAPI_start_counters %d\n", retval);
       test_fail(test_string);
     }

     result=branches_testcode();

     retval = PAPI_stop_counters(counts,1);
     if (retval != PAPI_OK) {
       if (!quiet) printf("ERROR:PAPI_stop_counters %d\n", retval);
       test_fail(test_string);
     }

     if (result==CODE_UNIMPLEMENTED) {
       if (!quiet) printf("\tNo test code for this architecture\n");
       test_skip(test_string);
     }

     if (counts[0]>high) high=counts[0];
     if ((low==0) || (counts[0]<low)) low=counts[0];
     total+=counts[0];
   }

   average=total/num_runs;

   if (!quiet) {

      printf("\tFound %lld mispredicts out of %lld branches\n",
	  average,expected);
      printf("\tA simple loop like this should have very few mispredicts\n");
   }


   if (average>1000) {
      if (!quiet) printf("Too many mispredicts\n");
      test_fail(test_string);
   }
   if (!quiet) printf("\n");

   /*******************/

   /* Phase 2: count the conditional branches of the random workload. */
   high=0; low=0; total=0;

   events[0]=PAPI_BR_CN;

   for(i=0;i<num_runs;i++) {

     retval = PAPI_start_counters(events,1);
     if (retval != PAPI_OK) {
       if (!quiet) printf("ERROR:PAPI_start_counters %d\n", retval);
       test_fail(test_string);
     }

     result=random_branches_testcode(num_random_branches,1);

     retval = PAPI_stop_counters(counts,1);
     if (retval != PAPI_OK) {
       if (!quiet) printf("ERROR:PAPI_stop_counters %d\n", retval);
       test_fail(test_string);
     }

     if (counts[0]>high) high=counts[0];
     if ((low==0) || (counts[0]<low)) low=counts[0];
     total+=counts[0];
   }

   average=total/num_runs;

   expected=average;

   if (!quiet) {
      printf("\nTesting a function that branches based on a random number\n");
      printf("   The loop has %lld conditional branches.\n",expected);
      printf("   %d are random branches; %d of those were taken\n",num_random_branches,result);
   }


   /* Phase 3: mispredicts of the random workload. */
   high=0; low=0; total=0;

   events[0]=PAPI_BR_MSP;

   for(i=0;i<num_runs;i++) {

     retval = PAPI_start_counters(events,1);
     if (retval != PAPI_OK) {
       if (!quiet) printf("ERROR:PAPI_start_counters %d\n", retval);
       test_fail(test_string);
     }

     result=random_branches_testcode(num_random_branches,1);

     retval = PAPI_stop_counters(counts,1);
     if (retval != PAPI_OK) {
       if (!quiet) printf("ERROR:PAPI_stop_counters %d\n", retval);
       test_fail(test_string);
     }

     if (counts[0]>high) high=counts[0];
     if ((low==0) || (counts[0]<low)) low=counts[0];
     total+=counts[0];
   }

   average=total/num_runs;

   if (!quiet) {

      printf("\nOut of %lld branches, %lld were mispredicted\n",expected,average);
      printf("Assuming a good random number generator and no freaky luck\n");
      printf("The mispredicts should be roughly between %d and %d\n",
	     num_random_branches/4,(num_random_branches/4)*3);
   }

   if ( average < (num_random_branches/4)) {
     if (!quiet) printf("Mispredicts too low\n");
     test_fail(test_string);
   }

   if (average > (num_random_branches/4)*3) { 

     if (!quiet) printf("Mistpredicts too high\n");
     test_fail(test_string);
   }
   if (!quiet) printf("\n");

   PAPI_shutdown();

   test_pass(test_string);
   
   return 0;
}
Пример #15
0
/*
 * Test driver for the PAPI_HW_INT preset.
 *
 * Counts PAPI_HW_INT across NUM_RUNS calls of instructions_million() and
 * fails when the workload is unimplemented or when the count is zero.
 */
int main(int argc, char **argv) {

   char test_string[]="Testing PAPI_HW_INT predefined event...";
   long long tally[1];
   int event_list[1];
   int status,silent,outcome;
   int remaining;

   silent=test_quiet();

   /* Bring up the library; a version mismatch fails the test. */
   status = PAPI_library_init(PAPI_VER_CURRENT);
   if (status != PAPI_VER_CURRENT) {
      if (!silent) {
         printf("Error: PAPI_library_init: %d\n", status);
      }
      test_fail(test_string);
   }

   /* The preset may not exist on this machine. */
   status = PAPI_query_event(PAPI_HW_INT);
   if (status != PAPI_OK) {
      if (!silent) {
         printf("PAPI_HW_INT not supported");
      }
      test_skip(test_string);
   }

   event_list[0]=PAPI_HW_INT;

   if (!silent) {
      printf("\n");
      printf("Testing a loop of 1 million instructions (%d times):\n",
          NUM_RUNS);
      printf("A certain number of interrupts should happen (mostly timer)\n");
   }

   PAPI_start_counters(event_list,1);

   /* Run the workload NUM_RUNS times; only the last return value is kept. */
   remaining=NUM_RUNS;
   while (remaining>0) {
      outcome=instructions_million();
      remaining--;
   }

   PAPI_stop_counters(tally,1);

   if (outcome==CODE_UNIMPLEMENTED) {
      fprintf(stderr,"\tCode unimplemented\n");
      test_fail(test_string);
   }

   if (!silent) {
      printf("   Expected: >0\n");
      printf("   Obtained: %lld\n",tally[0]);
      printf("\n");
   }

   if (tally[0] == 0) {
      if (!silent) {
         printf("Error: Interrupt count was zero\n");
      }
      test_fail(test_string);
   }

   PAPI_shutdown();

   test_pass(test_string);

   return 0;
}
/**
 * GCCG solver driver instrumented with PAPI hardware counters.
 *
 * Usage: <program> input_format input_file output_prefix
 *   input_format   "bin" or "text"; selects read_binary() or read_formatted()
 *   input_file     file handed to the selected reader
 *   output_prefix  prefix of the "<prefix>pstats.dat" profile file; also
 *                  passed to write_vtk() for the VAR/CGUP/SU outputs
 *
 * The run has three phases (INPUT, CALC, OUTPUT); log_counters() is called
 * after each one with the profile file, the phase name, the wall-clock
 * reference `tic` and the counter buffer.
 *
 * Returns EXIT_SUCCESS on completion and EXIT_FAILURE on wrong arguments,
 * an unopenable profile file, a failed read, a failed allocation, or a
 * reference residual below 1e-15.
 */
int main( int argc, char *argv[] ) {
    int Events[] = {
#ifdef CACHE_PROFILE
        PAPI_L2_TCM,
        PAPI_L3_TCM,
        PAPI_L2_TCA,
        PAPI_L3_TCA
#else
        PAPI_FP_OPS
#endif
    };
    long long values[SIZE( Events )];
    long long tic;

    if( argc != 4 ) {
        printf( "Usage: %s input_format input_file output_prefix\n", argv[0] );
        return EXIT_FAILURE;
    }

    char *input_format = argv[1];
    char *input_file = argv[2];
    char *output_prefix = argv[3];

    int status = 0;

    /** internal cells start and end index*/
    int nintci, nintcf;
    /** external cells start and end index.
     * The external cells are only ghost cells. They are accessed only through internal cells*/
    int nextci, nextcf;
    /** link cell-to-cell array. Stores topology information*/
    int **lcc;
    /** red-black colouring of the cells*/
    int *nboard;

    /** boundary coefficients for each volume cell */
    double *bs, *be, *bn, *bw, *bl, *bh, *bp, *su;

    /* Buffer is sized for prefix + "pstats.dat" + terminating NUL. */
    char pstats_filename[strlen( output_prefix ) + strlen( "pstats.dat" ) + 1];
    strcpy( pstats_filename, output_prefix );
    strcat( pstats_filename, "pstats.dat" );

    FILE *pstats = fopen( pstats_filename, "w" );
    if( pstats == NULL ) {
        printf( "Cannot open file for writing: %s\n", pstats_filename );
        return EXIT_FAILURE;
    }

    /* Start counting events */
    if( PAPI_start_counters( Events, SIZE( Events ) ) != PAPI_OK ) {
        handle_error( 1 );
    }

    /** start measuring wall clock time */
    tic = PAPI_get_real_usec();

    /* initialization  */
    // read-in the input file
    if( !strcmp( "bin", input_format ) ) {
        status = read_binary( input_file, &nintci, &nintcf, &nextci, &nextcf, &lcc,
                              &bs, &be, &bn, &bw, &bl, &bh, &bp, &su, &nboard );
    } else if( !strcmp( "text", input_format ) ) {
        status = read_formatted( input_file, &nintci, &nintcf, &nextci, &nextcf, &lcc,
                                 &bs, &be, &bn, &bw, &bl, &bh, &bp, &su, &nboard );
    } else {
        printf( "valid input_format values: text, bin\n" );
        fclose( pstats );
        return EXIT_FAILURE;
    }

    if( status != 0 ) {
        printf( "failed to initialize data!\n" );
        fclose( pstats );
        return EXIT_FAILURE;
    }

    /* Print profile data for phase INPUT */
    log_counters( pstats, "INPUT", &tic, values );

    // allocate arrays used in gccg
    int nomax = 3;
    /** the reference residual*/
    double resref = 0.0;
    /** the ratio between the reference and the current residual*/
    double ratio;

    /* oc and cnorm are initialised at indices 0..10 below regardless of the
     * mesh size, so they need at least 11 slots even for tiny meshes. */
    const int norm_slots = 11;
    int norm_len = ( nintcf + 1 > norm_slots ) ? ( nintcf + 1 ) : norm_slots;

    /** array storing residuals */
    double *resvec = ( double * ) calloc( ( nintcf + 1 ), sizeof( double ) );
    /** the variation vector -> keeps the result in the end */
    double *var = ( double * ) calloc( ( nextcf + 1 ), sizeof( double ) );

    /** the computation vectors */
    double *direc1 = ( double * ) calloc( ( nextcf + 1 ), sizeof( double ) );
    double *direc2 = ( double * ) calloc( ( nextcf + 1 ), sizeof( double ) );

    /** additional vectors */
    double *cgup = ( double * ) calloc( ( nextcf + 1 ), sizeof( double ) );
    double *oc = ( double * ) calloc( norm_len, sizeof( double ) );
    double *cnorm = ( double * ) calloc( norm_len, sizeof( double ) );
    double *adxor1 = ( double * ) calloc( ( nintcf + 1 ), sizeof( double ) );
    double *adxor2 = ( double * ) calloc( ( nintcf + 1 ), sizeof( double ) );
    double *dxor1 = ( double * ) calloc( ( nintcf + 1 ), sizeof( double ) );
    double *dxor2 = ( double * ) calloc( ( nintcf + 1 ), sizeof( double ) );

    if( !resvec || !var || !direc1 || !direc2 || !cgup || !oc || !cnorm
        || !adxor1 || !adxor2 || !dxor1 || !dxor2 ) {
        printf( "failed to allocate solver arrays!\n" );
        fclose( pstats );
        return EXIT_FAILURE;
    }

    // initialize the reference residual
    for( int nc = nintci; nc <= nintcf; nc++ ) {
        resvec[nc] = su[nc];
        resref = resref + resvec[nc] * resvec[nc];
    }
    resref = sqrt( resref );
    if( resref < 1.0e-15 ) {
        printf( "i/o - error: residue sum less than 1.e-15 - %lf\n", resref );
        fclose( pstats );
        return EXIT_FAILURE;
    }

    // initialize the arrays
    for( int nc = 0; nc < norm_slots; nc++ ) {
        oc[nc] = 0.0;
        cnorm[nc] = 1.0;
    }

    for( int nc = nintci; nc <= nintcf; nc++ ) {
        cgup[nc] = 0.0;
        var[nc] = 0.0;
    }

    for( int nc = nextci; nc <= nextcf; nc++ ) {
        var[nc] = 0.0;
        cgup[nc] = 0.0;
        direc1[nc] = 0.0;
        bs[nc] = 0.0;
        be[nc] = 0.0;
        bn[nc] = 0.0;
        bw[nc] = 0.0;
        bl[nc] = 0.0;
        bh[nc] = 0.0;
    }

    for( int nc = nintci; nc <= nintcf; nc++ ) {
        cgup[nc] = 1.0 / bp[nc];
    }

    int iter = 1;
    /* nor cycles 1..nomax; nor1 = nor - 1 selects how many stored
     * direction pairs (dxor/adxor) are projected out in phase 2. */
    int nor = 1;
    int nor1 = nor - 1;
    /* finished initalization */

    /* start computation loop */
    while( iter < 10000 ) {
        /* start phase 1 */

        // update the old values of direc
        for( int nc = nintci; nc <= nintcf; nc++ ) {
            direc1[nc] = direc1[nc] + resvec[nc] * cgup[nc];
        }

        // compute new guess (approximation) for direc
        for( int nc = nintci; nc <= nintcf; nc++ ) {
            direc2[nc] = bp[nc] * direc1[nc] - bs[nc] * direc1[lcc[0][nc]]
                         - bw[nc] * direc1[lcc[3][nc]] - bl[nc] * direc1[lcc[4][nc]]
                         - bn[nc] * direc1[lcc[2][nc]] - be[nc] * direc1[lcc[1][nc]]
                         - bh[nc] * direc1[lcc[5][nc]];
        } /* end phase 1 */

        /*  start phase 2 */
        // execute normalization steps
        double oc1, oc2, occ;
        if( nor1 == 1 ) {
            oc1 = 0;
            occ = 0;
            for( int nc = nintci; nc <= nintcf; nc++ ) {
                occ = occ + adxor1[nc] * direc2[nc];
            }
            oc1 = occ / cnorm[1];
            for( int nc = nintci; nc <= nintcf; nc++ ) {
                direc2[nc] = direc2[nc] - oc1 * adxor1[nc];
                direc1[nc] = direc1[nc] - oc1 * dxor1[nc];
            }

        } else if( nor1 == 2 ) {
            oc1 = 0;
            occ = 0;
            for( int nc = nintci; nc <= nintcf; nc++ ) {
                occ = occ + adxor1[nc] * direc2[nc];
            }

            oc1 = occ / cnorm[1];
            oc2 = 0;
            occ = 0;
            for( int nc = nintci; nc <= nintcf; nc++ ) {
                occ = occ + adxor2[nc] * direc2[nc];
            }

            oc2 = occ / cnorm[2];
            for( int nc = nintci; nc <= nintcf; nc++ ) {
                direc2[nc] = direc2[nc] - oc1 * adxor1[nc] - oc2 * adxor2[nc];
                direc1[nc] = direc1[nc] - oc1 * dxor1[nc] - oc2 * dxor2[nc];
            }
        }

        cnorm[nor] = 0;
        double omega = 0;

        // compute the new residual
        for( int nc = nintci; nc <= nintcf; nc++ ) {
            cnorm[nor] = cnorm[nor] + direc2[nc] * direc2[nc];
            omega = omega + resvec[nc] * direc2[nc];
        }
        omega = omega / cnorm[nor];

        double resnew = 0.0;
        for( int nc = nintci; nc <= nintcf; nc++ ) {
            var[nc] = var[nc] + omega * direc1[nc];
            resvec[nc] = resvec[nc] - omega * direc2[nc];
            resnew = resnew + resvec[nc] * resvec[nc];
        }
        resnew = sqrt( resnew );
        ratio = resnew / resref;

        // stop once the residual ratio has dropped to 1e-10 or below
        if( ratio <= 1.0e-10 ) {
            break;
        }

        iter++;

        // prepare additional arrays for the next iteration step
        if( nor == nomax ) {
            nor = 1;
        } else {
            if( nor == 1 ) {
                for( int nc = nintci; nc <= nintcf; nc++ ) {
                    dxor1[nc] = direc1[nc];
                    adxor1[nc] = direc2[nc];
                }

            } else if( nor == 2 ) {
                for( int nc = nintci; nc <= nintcf; nc++ ) {
                    dxor2[nc] = direc1[nc];
                    adxor2[nc] = direc2[nc];
                }
            }
            nor++;
        }
        nor1 = nor - 1;
    }/* end phase 2 */

    /* finished computation loop */

    /* Print profile data for phase CALC */
    log_counters( pstats, "CALC", &tic, values );

    /* write output file  */
    int nodeCnt;
    int **points, **elems;

    /* The mesh outputs are only meaningful when the conversion succeeded;
     * writing VTK files from unset nodeCnt/points/elems would read
     * indeterminate values. */
    if( vol2mesh( nintci, nintcf, lcc, &nodeCnt, &points, &elems ) != 0 ) {
        printf( "error during conversion from volume to mesh\n" );
    } else {
        write_vtk( output_prefix, "VAR.vtk", nintci, nintcf, nodeCnt, points, elems, var );
        write_vtk( output_prefix, "CGUP.vtk", nintci, nintcf, nodeCnt, points, elems, cgup );
        write_vtk( output_prefix, "SU.vtk", nintci, nintcf, nodeCnt, points, elems, su );
    }

    /* Print profile data for phase OUTPUT */
    log_counters( pstats, "OUTPUT", &tic, values );

    /* Stop counting events */
    if( PAPI_stop_counters( values, SIZE( values ) ) != PAPI_OK ) {
        handle_error( 1 );
    }

    fclose( pstats );

    /* Free the arrays allocated in this function. */
    free( direc2 );
    free( direc1 );
    free( dxor2 );
    free( dxor1 );
    free( adxor2 );
    free( adxor1 );
    free( cnorm );
    free( oc );
    free( var );
    free( cgup );
    free( resvec );
    /* NOTE(review): su, bp, bh, bl, bw, bn, be and bs are allocated by the
     * reader; how they must be released is not visible here, so they are
     * left to process teardown - confirm against read_binary/read_formatted. */

    printf( "Simulation completed successfully!\n" );
    return EXIT_SUCCESS;
}
Пример #17
0
/*
 * appio component test: counts I/O events while copying a file to /dev/null.
 *
 * Resolves each name in `names` to an event code, starts the counters,
 * copies /etc/group to /dev/null in 1024-byte chunks, stops the counters
 * and, unless TESTS_QUIET is set, prints one "name: value" line per event.
 *
 * argc/argv are not used.  Exits with status 2 when an event name cannot
 * be resolved and with status 1 on any other failure; otherwise reports
 * the test as passed and returns 0.
 */
int main(int argc, char** argv) {
  int Events[NUM_EVENTS]; 
  /* NOTE(review): 11 names are listed; if NUM_EVENTS is larger the remaining
     entries are NULL and would be passed to PAPI_event_name_to_code - confirm
     that NUM_EVENTS matches this list. */
  const char* names[NUM_EVENTS] = {"OPEN_CALLS", "OPEN_FDS", "READ_CALLS", "READ_BYTES", "READ_USEC", "READ_ERR", "READ_INTERRUPTED", "READ_WOULD_BLOCK", "WRITE_CALLS","WRITE_BYTES","WRITE_USEC"};
  long long values[NUM_EVENTS];

  char *infile = "/etc/group";

  (void)argc;
  (void)argv;

  int version = PAPI_library_init (PAPI_VER_CURRENT);
  if (version != PAPI_VER_CURRENT) {
    fprintf(stderr, "PAPI_library_init version mismatch\n");
    exit(1);
  }

  int fdin;
  if (!TESTS_QUIET) fprintf(stderr, "This program will read %s and write it to /dev/null\n", infile);
  int retval;
  int e;
  for (e=0; e<NUM_EVENTS; e++) {
    retval = PAPI_event_name_to_code((char*)names[e], &Events[e]);
    if (retval != PAPI_OK) {
      fprintf(stderr, "Error getting code for %s\n", names[e]);
      exit(2);
    } 
  }

  /* Start counting events */
  if (PAPI_start_counters(Events, NUM_EVENTS) != PAPI_OK) {
    fprintf(stderr, "Error in PAPI_start_counters\n");
    exit(1);
  }

  /* The files are opened after the counters start so that the open calls
     themselves are part of the measured region. */
  fdin=open(infile, O_RDONLY);
  if (fdin < 0) {
    /* perror appends ": <reason>\n" itself. */
    perror("Could not open file for reading");
    exit(1);
  }
  int fdout;
  fdout=open("/dev/null", O_WRONLY);
  if (fdout < 0) {
    perror("Could not open file for writing");
    close(fdin);
    exit(1);
  }
  int bytes = 0;
  char buf[1024];

  while ((bytes = read(fdin, buf, 1024)) > 0) {
    /* Without a complete write the byte counters would not describe the
       copy this test claims to perform. */
    if (write(fdout, buf, bytes) != bytes) {
      fprintf(stderr, "Error writing to /dev/null\n");
      close(fdin);
      close(fdout);
      exit(1);
    }
  }


  /* Stop counting events */
  if (PAPI_stop_counters(values, NUM_EVENTS) != PAPI_OK) {
    /* values[] was not filled in, so there is nothing valid to report. */
    fprintf(stderr, "Error in PAPI_stop_counters\n");
    close(fdin);
    close(fdout);
    exit(1);
  }
  close (fdin);
  close (fdout);
 
  if (!TESTS_QUIET) { 
    printf("----\n");
    for (e=0; e<NUM_EVENTS; e++)  
      printf("%s: %lld\n", names[e], values[e]);
  }
  test_pass( __FILE__, NULL, 0 );
  return 0;
}
Пример #18
0
/*
 * Validation test for the PAPI_L2_DCM preset event.
 *
 * Allocates an array of doubles sized from the value returned by
 * get_cachesize(L2_CACHE, ...), then counts PAPI_L2_DCM once while writing
 * every element and once while summing every element.  Unless quiet, the
 * obtained counts are printed next to a rough expectation derived from the
 * L1 data-cache line size.
 *
 * argc/argv are not used.  Returns 0 when control reaches the end of main.
 */
int main(int argc, char **argv) {

   int events[1],i;
   /* Zero-initialised so a failed counter read can never expose garbage. */
   long long counts[1]={0};
   
   int retval,quiet;
   int l2_size,l1_linesize;
   int arraysize;

   char test_string[]="Testing PAPI_L2_DCM predefined event...";

   (void)argc;
   (void)argv;
   
   quiet=test_quiet();

   retval = PAPI_library_init(PAPI_VER_CURRENT);
   if (retval != PAPI_VER_CURRENT) {
      if (!quiet) printf("Error! PAPI_library_init %d\n",retval);
      test_fail(test_string);
   }

   retval = PAPI_query_event(PAPI_L2_DCM);
   if (retval != PAPI_OK) {
      if (!quiet) printf("PAPI_L2_DCM not available\n");
      test_skip(test_string);
   }

   events[0]=PAPI_L2_DCM;

   /* The L1 size and L2 entry count are not used below; the queries are
      kept in their original order in case the helpers report or abort on
      missing cache information. */
   (void)get_cachesize(L1D_CACHE,quiet,test_string);
   l1_linesize=get_linesize(L1D_CACHE,quiet,test_string);
   l2_size=get_cachesize(L2_CACHE,quiet,test_string);
   (void)get_entries(L2_CACHE,quiet,test_string);

   /*******************************************************************/
   /* Test if the C compiler uses a sane number of data cache acceess */
   /*******************************************************************/

   arraysize=l2_size/sizeof(double);

   /* Doubles per L1 line; zero when the reported line size is smaller than
      one double, in which case no expectation can be computed. */
   size_t doubles_per_line = l1_linesize/sizeof(double);

   double *array;
   double aSumm = 0.0;

   if (!quiet) {
      printf("Allocating %zu bytes of memory (%d doubles)\n",
          arraysize*sizeof(double),arraysize);
   }

   array=calloc(arraysize,sizeof(double));
   if (array==NULL) {
      if (!quiet) printf("Error! Can't allocate memory\n");
      test_fail(test_string);
   }

   if (!quiet) printf("Write test:\n");
   retval = PAPI_start_counters(events,1);
   if (retval != PAPI_OK) {
      if (!quiet) printf("Error! PAPI_start_counters %d\n",retval);
      test_fail(test_string);
   }
   
   for(i=0; i<arraysize; i++) { 
      array[i]=(double)i;
   }
     
   retval = PAPI_stop_counters(counts,1);
   if (retval != PAPI_OK) {
      if (!quiet) printf("Error! PAPI_stop_counters %d\n",retval);
      test_fail(test_string);
   }

   if (!quiet) {
      printf("\tL2 D misses: %lld\n",counts[0]);
      if (doubles_per_line > 0) {
         printf("\tShould be roughly (%d/(%d/%zu)): %zu\n",
             arraysize,l1_linesize,sizeof(double),
             arraysize/doubles_per_line);
      }
   }

   retval = PAPI_start_counters(events,1);
   if (retval != PAPI_OK) {
      if (!quiet) printf("Error! PAPI_start_counters %d\n",retval);
      test_fail(test_string);
   }
   
   for(i=0; i<arraysize; i++) { 
       aSumm += array[i]; 
   }
     
   retval = PAPI_stop_counters(counts,1);
   if (retval != PAPI_OK) {
      if (!quiet) printf("Error! PAPI_stop_counters %d\n",retval);
      test_fail(test_string);
   }

   if (!quiet) {
      /* Printing the sum also keeps the read loop from being optimised away. */
      printf("Read test (%lf):\n",aSumm);
      printf("\tL2 D misses: %lld\n",counts[0]);
      if (doubles_per_line > 0) {
         printf("\tShould be roughly (%d/(%d/%zu)): %zu\n",
             arraysize,l1_linesize,sizeof(double),
             arraysize/doubles_per_line);
      }
   }

   free(array);

   PAPI_shutdown();

   test_pass(test_string);
   
   return 0;
}
Пример #19
0
int main (int argc, char *argv[]){

	int size = atoi (argv[1]);

	/*PAPI time measurement*/
	long_long start_us, stop_us;

	/*matrixes*/
	float **mat_a, **mat_b, **mat_c;
	mat_a = (float**) malloc (size * sizeof(float*));
	mat_b = (float**) malloc (size * sizeof(float*));
	mat_c = (float**) malloc (size * sizeof(float*));

	for (unsigned i = 0; i < size; ++i) {
		mat_a[i] = (float*) malloc (size * sizeof(float)); 
		mat_b[i] = (float*) malloc (size * sizeof(float)); 
		mat_c[i] = (float*) malloc (size * sizeof(float)); 
	}

	//Fill Matrices
	fillMatrices(mat_a, mat_b, mat_c, size );
	clearCache();

	/* PAPI VARIABLES */
	int events[NUM_EVENTS];
	long long counts[NUM_EVENTS];

	/* FLOATING POINT OPERATIONS */
	events[0]=PAPI_VEC_SP;	
	events[1]=PAPI_SP_OPS;

	/**/
	events[2]=PAPI_FP_OPS;
	events[3]=PAPI_DP_OPS;
	events[4]=PAPI_VEC_DP;

	counts[0] = 0;
	counts[1] = 0;
	counts[2] = 0;
	counts[3] = 0;
	counts[4] = 0;

	PAPI_library_init(PAPI_VER_CURRENT);

	PAPI_start_counters(events,2);
	start_us = PAPI_get_real_usec();


	matrix_mult_ikj ( mat_a, mat_b, mat_c, size);	


	stop_us = PAPI_get_real_usec();
	PAPI_stop_counters(counts,2);

	long_long duration_us = stop_us - start_us;

	FILE *file;
	file = fopen(argv[2],"a");

	fprintf(file, "%lld,", duration_us );
	printf("%lld,", duration_us );	
	fprintf(file,"%lld,", counts[0]);	
	printf("%lld,", counts[0]);	
	fprintf(file,"%lld\n", counts[1]);	
	printf("%lld\n", counts[1]);	

	fclose(file);
	return 0;
}
Пример #20
0
/**
 * GCCG solver driver that reports per-phase PAPI performance figures.
 *
 * Usage: <program> data_type(text or bin) input_file output_file
 *
 * The hardware counters listed in Events are started once and then sampled
 * after the input phase, after the computation loop and after the output
 * phase.  For every phase the ratio values[0]/values[1], the rate
 * values[2]/elapsed_usec and that rate divided by PEAKPER are computed and
 * handed to write_result_dat() together with the raw counter values and
 * elapsed times.
 *
 * Returns EXIT_SUCCESS on completion and EXIT_FAILURE on wrong arguments,
 * an unknown data_type, a failed read, a failed allocation, or a reference
 * residual below 1e-15.  Errors in the output phase are reported but are
 * not fatal.
 */
int main(int argc, char *argv[]) {

    if (argc < 4) {
        printf("Usage: %s data_type(text or bin) input_file output_file\n", argv[0]);
        return EXIT_FAILURE;
    }

    char *file_type = argv[1];
    char *file_in = argv[2];
    char *file_out = argv[3];
    char *str1 = "SU.vtk";
    char *str2 = "VAR.vtk";
    char *str3 = "CGUP.vtk";
    char *file_perf = "pstats.dat";
    /** internal cells start and end index*/
    int nintci, nintcf;
    /** external cells start and end index. The external cells are only ghost cells.
    They are accessed only through internal cells*/
    int nextci, nextcf;
    /** link cell-to-cell array. Stores topology information*/
    int **lcc;
    /** red-black colouring of the cells*/
    int *nboard;
    /** boundary coefficients for each volume cell */
    double *bs, *be, *bn, *bw, *bl, *bh, *bp, *su;
    /** outputs of vol2mesh, passed on to the VTK writers */
    int* nodeCnt;
    int*** points;
    int*** elems;
    /** Measured performance and PAPI parameters: one counter buffer per
        phase (input, computation, output) and one derived value per phase */
    long long *values_i = (long long *) calloc(4, sizeof(long long));
    long long *values_c = (long long *) calloc(4, sizeof(long long));
    long long *values_o = (long long *) calloc(4, sizeof(long long));
    double *mflops = (double *) calloc(3, sizeof(double));
    double *Lmirate = (double *) calloc(3, sizeof(double));
    double *util = (double *) calloc(3, sizeof(double));
    long long *et = (long long *) calloc(3, sizeof(long long));
    if (!values_i || !values_c || !values_o || !mflops || !Lmirate || !util || !et) {
        printf("failed to allocate performance buffers!\n");
        return EXIT_FAILURE;
    }
    long long start_cycles, start_usec, end_usec_1, end_cycles_2, end_cycles_3, end_usec_2, end_usec_3;
    /**In cluster mpp_inter L1 and L2 events can not computed at the same time,
    so set into two groups*/
    int Events[NUM_EVENTS]={PAPI_L2_TCM,PAPI_L2_TCA,PAPI_FP_INS,PAPI_TOT_CYC};
    // int Events[NUM_EVENTS]={PAPI_L1_TCM,PAPI_L1_TCA,PAPI_FP_INS,PAPI_TOT_CYC};
    /**start HW counters and execution time recorder*/
    if ( PAPI_start_counters( Events, NUM_EVENTS ) != PAPI_OK ) {
        printf("Fail to start PAPI counter\n");
    }
    start_cycles = PAPI_get_real_cyc(); // Gets the starting time in clock cycles
    start_usec = PAPI_get_real_usec(); // Gets the starting time in microseconds
    /* initialization  */
    // read-in the input file
    int f_status;
    if (strcmp(file_type,"text") == 0) {

        f_status = read_formatted(file_in, &nintci, &nintcf, &nextci, &nextcf, &lcc,
                   &bs, &be, &bn, &bw, &bl, &bh, &bp, &su, &nboard);
    } else if (strcmp(file_type,"bin") == 0) {

        f_status = read_formatted_bin(file_in, &nintci, &nintcf, &nextci,
                   &nextcf, &lcc, &bs, &be, &bn, &bw,
                   &bl, &bh, &bp, &su,&nboard);
    } else {

        printf ("Input file format is nor correct\n");
        return EXIT_FAILURE;
    }
    if (f_status != 0) {

        printf("failed to initialize data!\n");
        return EXIT_FAILURE;
    }
    // allocate arrays used in gccg
    int nomax = 3;
    /** the reference residual*/
    double resref = 0.0;
    /** the ratio between the reference and the current residual*/
    double ratio;
    /* oc and cnorm are initialised at indices 0..10 below regardless of the
       mesh size, so they need at least 11 slots even for tiny meshes. */
    const int norm_slots = 11;
    int norm_len = (nintcf + 1 > norm_slots) ? (nintcf + 1) : norm_slots;
    /** array storing residuals */
    double* resvec = (double *) calloc((nintcf + 1), sizeof(double));
    /** the variation vector -> keeps the result in the end */
    double* var = (double *) calloc((nextcf + 1), sizeof(double));
    /** the computation vectors */
    double* direc1 = (double *) calloc((nextcf + 1), sizeof(double));
    double* direc2 = (double *) calloc((nextcf + 1), sizeof(double));
    /** additional vectors */
    double* cgup = (double *) calloc((nextcf + 1), sizeof(double));
    double* oc = (double *) calloc(norm_len, sizeof(double));
    double* cnorm = (double *) calloc(norm_len, sizeof(double));
    double* adxor1 = (double *) calloc((nintcf + 1), sizeof(double));
    double* adxor2 = (double *) calloc((nintcf + 1), sizeof(double));
    double* dxor1 = (double *) calloc((nintcf + 1), sizeof(double));
    double* dxor2 = (double *) calloc((nintcf + 1), sizeof(double));
    if (!resvec || !var || !direc1 || !direc2 || !cgup || !oc || !cnorm
        || !adxor1 || !adxor2 || !dxor1 || !dxor2) {
        printf("failed to allocate solver arrays!\n");
        return EXIT_FAILURE;
    }
    /** cell index shared by all loops below */
    int nc=0;
    // initialize the reference residual
    for (nc = nintci; nc <= nintcf; nc++) {
        resvec[nc] = su[nc];
        resref = resref + resvec[nc] * resvec[nc];
    }
    resref = sqrt(resref);
    if (resref < 1.0e-15) {

        printf("i/o - error: residue sum less than 1.e-15 - %lf\n", resref);
        return EXIT_FAILURE;
    }

    // initialize the arrays
    for (nc = 0; nc < norm_slots; nc++) {
        oc[nc] = 0.0;
        cnorm[nc] = 1.0;
    }

    for (nc = nintci; nc <= nintcf; nc++) {
        cgup[nc] = 0.0;
        var[nc] = 0.0;
    }

    for (nc = nextci; nc <= nextcf; nc++) {
        var[nc] = 0.0;
        cgup[nc] = 0.0;
        direc1[nc] = 0.0;
        bs[nc] = 0.0;
        be[nc] = 0.0;
        bn[nc] = 0.0;
        bw[nc] = 0.0;
        bl[nc] = 0.0;
        bh[nc] = 0.0;
    }

    for (nc = nintci; nc <= nintcf; nc++) {
        cgup[nc] = 1.0 / bp[nc];
    }
    int iter = 1;
    /* nor cycles 1..nomax; nor1 = nor - 1 selects how many stored
       direction pairs (dxor/adxor) are projected out in phase 2. */
    int nor = 1;
    int nor1 = nor - 1;

    /* finished initalization */
    /*read PAPI HW counters and caculate performance of input phase*/
    if ( PAPI_read_counters( values_i, NUM_EVENTS ) != PAPI_OK ) {
        printf("fail to read papi counter");
    }
    Lmirate[0] = (double) values_i[0] / values_i[1];
    end_usec_1 = PAPI_get_real_usec();
    mflops[0] = (double) values_i[2] / (end_usec_1-start_usec);
    util[0] = mflops[0] / PEAKPER;

    /* start computation loop */
    while (iter < 10000) {

        /* start phase 1 */
        // update the old values of direc
        for (nc = nintci; nc <= nintcf; nc++) {
            direc1[nc] = direc1[nc] + resvec[nc] * cgup[nc];
        }

        // compute new guess (approximation) for direc
        for (nc = nintci; nc <= nintcf; nc++) {
            direc2[nc] = bp[nc] * direc1[nc] - bs[nc] * direc1[lcc[0][nc]]
                         - bw[nc] * direc1[lcc[3][nc]] - bl[nc] * direc1[lcc[4][nc]]
                         - bn[nc] * direc1[lcc[2][nc]] - be[nc] * direc1[lcc[1][nc]]
                         - bh[nc] * direc1[lcc[5][nc]];
        } /* end phase 1 */

        /*  start phase 2 */
        // execute normalization steps
        double oc1, oc2, occ;
        if (nor1 == 1) {
            oc1 = 0;
            occ = 0;
            for (nc = nintci; nc <= nintcf; nc++) {
                occ = occ + adxor1[nc] * direc2[nc];
            }
            oc1 = occ / cnorm[1];
            for (nc = nintci; nc <= nintcf; nc++) {
                direc2[nc] = direc2[nc] - oc1 * adxor1[nc];
                direc1[nc] = direc1[nc] - oc1 * dxor1[nc];
            }
        } else if (nor1 == 2) {
            oc1 = 0;
            occ = 0;
            for (nc = nintci; nc <= nintcf; nc++) {
                occ = occ + adxor1[nc] * direc2[nc];
            }
            oc1 = occ / cnorm[1];
            oc2 = 0;
            occ = 0;
            for (nc = nintci; nc <= nintcf; nc++) {
                occ = occ + adxor2[nc] * direc2[nc];
            }
            oc2 = occ / cnorm[2];
            for (nc = nintci; nc <= nintcf; nc++) {
                direc2[nc] = direc2[nc] - oc1 * adxor1[nc] - oc2 * adxor2[nc];
                direc1[nc] = direc1[nc] - oc1 * dxor1[nc] - oc2 * dxor2[nc];
            }
        }

        cnorm[nor] = 0;
        double omega = 0;

        // compute the new residual
        for (nc = nintci; nc <= nintcf; nc++) {
            cnorm[nor] = cnorm[nor] + direc2[nc] * direc2[nc];
            omega = omega + resvec[nc] * direc2[nc];
        }
        omega = omega / cnorm[nor];
        double resnew = 0.0;
        for (nc = nintci; nc <= nintcf; nc++) {
            var[nc] = var[nc] + omega * direc1[nc];
            resvec[nc] = resvec[nc] - omega * direc2[nc];
            resnew = resnew + resvec[nc] * resvec[nc];
        }
        resnew = sqrt(resnew);
        ratio = resnew / resref;

        // stop once the residual ratio has dropped to 1e-10 or below
        if (ratio <= 1.0e-10) {
            break;
        }
        iter++;

        // prepare additional arrays for the next iteration step
        if (nor == nomax) {
            nor = 1;
        } else {
            if (nor == 1) {
                for (nc = nintci; nc <= nintcf; nc++) {
                    dxor1[nc] = direc1[nc];
                    adxor1[nc] = direc2[nc];
                }
            } else if (nor == 2) {
                for (nc = nintci; nc <= nintcf; nc++) {
                    dxor2[nc] = direc1[nc];
                    adxor2[nc] = direc2[nc];
                }
            }
            nor++;
        }
        nor1 = nor - 1;

    }/* end phase 2 */

    /* finished computation loop */
    /*read PAPI HW counters and caculate performance of computation phase*/
    end_cycles_2 = PAPI_get_real_cyc(); // Gets the ending time in clock cycles
    end_usec_2 = PAPI_get_real_usec(); // Gets the ending time in microseconds
    if ( PAPI_read_counters( values_c, NUM_EVENTS ) != PAPI_OK ) {
        printf("fail to read papi counter");
    }

    Lmirate[1] = (double) values_c[0]/values_c[1];
    mflops[1] = (double) values_c[2] / ( end_usec_2-end_usec_1 );
    util[1] = mflops[1] / PEAKPER;
    /* write output file  */

    if ( write_result(file_in, file_out, nintci, nintcf, var, iter, ratio) != 0 ) {
        printf("error when trying to write to file %s\n", file_out);
    }

    //transfer volume to mesh
    /* The VTK writers consume nodeCnt/points/elems, which are only set by a
       successful conversion; skip them otherwise instead of passing
       indeterminate pointers. */
    if (vol2mesh(nintci, nintcf, lcc, &nodeCnt, &points, &elems) != 0 ) {
        printf("error when trying to converge topology to volume");
    } else {
        //write output to vtk file
        if (write_result_vtk(str1, nintci, nintcf, nodeCnt, points, elems, su) != 0) {
            printf("error when write SU to vtk file");
        }
        if (write_result_vtk(str2, nintci, nintcf, nodeCnt, points, elems, var) != 0) {
            printf("error when write VAR to vtk file");
        }
        if (write_result_vtk(str3, nintci, nintcf, nodeCnt, points, elems, cgup) != 0) {
            printf("error when write CGUP to vtk file");
        }
    }
    /*read PAPI HW counters and caculate performance of output phase*/
    if ( PAPI_stop_counters( values_o, NUM_EVENTS ) != PAPI_OK ) {
        printf("fail to stop papi counter");
    }

    Lmirate[2] = (double) values_o[0]/values_o[1];
    end_cycles_3 = PAPI_get_real_cyc(); // Gets the ending time in clock cycles
    end_usec_3 = PAPI_get_real_usec(); // Gets the ending time in microseconds
    mflops[2] = (double) (values_o[2])/(end_usec_3-end_usec_2);
    util[2] = mflops[2] / PEAKPER;
    /* The cycle timestamps are sampled but not reported anywhere. */
    (void)start_cycles;
    (void)end_cycles_2;
    (void)end_cycles_3;
    /** Write all measured performance to pstats.dat*/
    et[0] = end_usec_1-start_usec;
    et[1] = end_usec_2-end_usec_1;
    et[2] = end_usec_3-end_usec_2;
    if (write_result_dat(file_perf, values_i,values_c, values_o,Lmirate, et, mflops, util) != 0 ) {
        printf("error when write measured performance to data file");
    }
    /* Free all the dynamically allocated memory */
    free(direc2); free(direc1); free(dxor2); free(dxor1); free(adxor2); free(adxor1);
    free(cnorm); free(oc); free(var); free(cgup); free(resvec); free(su); free(bp);
    free(bh); free(bl); free(bw); free(bn); free(be); free(bs);
    free(values_i); free(values_c); free(values_o);
    free(mflops); free(Lmirate); free(util); free(et);
    printf("Simulation completed successfully!\n");
    return EXIT_SUCCESS;
}
Пример #21
0
/*
 * Validation test for the PAPI_BR_PRC preset event.
 *
 * Part 1 counts PAPI_BR_PRC over branches_testcode() num_runs times and
 * fails when display_error() reports a deviation above 1% from the
 * expected 1,000,000.
 *
 * Part 2 runs random_branches_testcode() twice per configuration: first
 * counting PAPI_BR_CN to establish the number of conditional branches,
 * then counting PAPI_BR_PRC.  The difference between the two averages must
 * lie between one quarter and three quarters of num_random_branches.
 *
 * argc/argv are not used.  Returns 0 when control reaches the end of main.
 */
int main(int argc, char **argv) {

   int retval,quiet,result;

   int num_runs=100;
   long long high=0,low=0,average=0,expected=1000000;
   double error;
   int num_random_branches=500000;

   int i;
   int events[1];
   /* Zero-initialised so a failed counter read can never expose garbage. */
   long long counts[1]={0},total=0;

   char test_string[]="Testing PAPI_BR_PRC predefined event...";

   (void)argc;
   (void)argv;

   quiet=test_quiet();

   retval = PAPI_library_init(PAPI_VER_CURRENT);
   if (retval != PAPI_VER_CURRENT) {
      if (!quiet) printf("Error: PAPI_library_init %d\n", retval);
      test_fail(test_string);
   }

   retval = PAPI_query_event(PAPI_BR_PRC);
   if (retval != PAPI_OK) {
      if (!quiet) printf("PAPI_BR_PRC not supported %d\n", retval);
      test_skip(test_string);
   }

   if (!quiet) {
      printf("\n");
      printf("Testing a simple loop with %lld branches (%d times):\n",
          expected,num_runs);
      printf("Nearly all the branches should be predicted correctly.\n");
   }

   events[0]=PAPI_BR_PRC;

   for(i=0;i<num_runs;i++) {

     retval = PAPI_start_counters(events,1);
     if (retval != PAPI_OK) {
       if (!quiet) printf("Error: PAPI_start_counters %d\n", retval);
       test_fail(test_string);
     }

     result=branches_testcode();

     retval = PAPI_stop_counters(counts,1);
     if (retval != PAPI_OK) {
       if (!quiet) printf("Error: PAPI_stop_counters %d\n", retval);
       test_fail(test_string);
     }

     if (result==CODE_UNIMPLEMENTED) {
       if (!quiet) printf("\tNo test code for this architecture\n");
       test_skip(test_string);
     }

     /* low==0 means "not set yet", so the first sample always seeds it. */
     if (counts[0]>high) high=counts[0];
     if ((low==0) || (counts[0]<low)) low=counts[0];
     total+=counts[0];
   }

   average=total/num_runs;

   error=display_error(average,high,low,expected,quiet);

   if ((error > 1.0) || (error<-1.0)) {

      if (!quiet) printf("Instruction count off by more than 1%%\n");
      test_fail(test_string);

   }
   if (!quiet) printf("\n");

   /*******************/

   high=0; low=0; total=0;

   /* First pass over the random-branch code: count all conditional
      branches to obtain the reference total. */
   events[0]=PAPI_BR_CN;

   for(i=0;i<num_runs;i++) {

     retval = PAPI_start_counters(events,1);
     if (retval != PAPI_OK) {
       if (!quiet) printf("Error: PAPI_start_counters %d\n", retval);
       test_fail(test_string);
     }

     result=random_branches_testcode(num_random_branches,1);

     retval = PAPI_stop_counters(counts,1);
     if (retval != PAPI_OK) {
       if (!quiet) printf("Error: PAPI_stop_counters %d\n", retval);
       test_fail(test_string);
     }

     if (counts[0]>high) high=counts[0];
     if ((low==0) || (counts[0]<low)) low=counts[0];
     total+=counts[0];
   }

   average=total/num_runs;

   expected=average;

   if (!quiet) {
      printf("\nTesting a function that branches based on a random number\n");
      printf("   The loop has %lld conditional branches.\n",expected);
      printf("   %d are random branches; %d of those were taken\n",num_random_branches,result);
   }

   high=0; low=0; total=0;

   /* Second pass: count the correctly predicted conditional branches. */
   events[0]=PAPI_BR_PRC;

   for(i=0;i<num_runs;i++) {

     retval = PAPI_start_counters(events,1);
     if (retval != PAPI_OK) {
       if (!quiet) printf("Error: PAPI_start_counters %d\n", retval);
       test_fail(test_string);
     }

     result=random_branches_testcode(num_random_branches,1);

     retval = PAPI_stop_counters(counts,1);
     if (retval != PAPI_OK) {
       if (!quiet) printf("Error: PAPI_stop_counters %d\n", retval);
       test_fail(test_string);
     }

     if (counts[0]>high) high=counts[0];
     if ((low==0) || (counts[0]<low)) low=counts[0];
     total+=counts[0];
   }

   average=total/num_runs;

   if (!quiet) {

      printf("\nOut of %lld branches, %lld predicted correctly\n",expected,average);
      printf("Assuming a good random number generator and no freaky luck\n");
      printf("The TOTAL - CORRECT value is %lld\n",expected-average);
      printf("This value should be roughly between %d and %d\n",
             num_random_branches/4,(num_random_branches/4)*3);
   }

   /* expected-average is the number of branches that were NOT predicted
      correctly; it is bounded on both sides below. */
   if ( (expected-average) < (num_random_branches/4)) {
     if (!quiet) printf("Correct predicts too low\n");
     test_fail(test_string);
   }

   if ( (expected-average) > (num_random_branches/4)*3) { 

     if (!quiet) printf("Correct predicts too high\n");
     test_fail(test_string);
   }
   if (!quiet) printf("\n");

   PAPI_shutdown();

   test_pass(test_string);

   return 0;
}
Пример #22
0
int main(int argc, char *argv[]) {


	float rtime1, rtime2, ptime1, ptime2, mflops;
	long long flpops;

	unsigned long int tid;
	int num_hwcntrs = 0;
	int fip = 0, retval;
	float real_time, proc_time;
	long long flpins;

	int i;
	unsigned int EventSet = PAPI_NULL; 
    int count = 0, err_count = 0;


    PAPI_event_info_t info;

    long long ( values2[2] )[2];
    long long min, max;
    int PAPI_event, mythreshold = THRESHOLD;
    char event_name1[PAPI_MAX_STR_LEN];
    const PAPI_hw_info_t *hw_info = NULL;
    int num_events, mask;
    int num_flops = NUM_FLOPS;
    long long elapsed_us, elapsed_cyc;



tests_quiet( argc, argv );  /* Set TESTS_QUIET variable */



    retval = PAPI_library_init( PAPI_VER_CURRENT );
    if ( retval != PAPI_VER_CURRENT )
      test_fail( __FILE__, __LINE__, "PAPI_library_init", retval );

  retval = PAPI_create_eventset( &EventSet );
  if ( retval != PAPI_OK )
      test_fail( __FILE__, __LINE__, "PAPI_create_eventset", retval );

	/* Get hardware info */
  hw_info = PAPI_get_hardware_info(  );
  if ( hw_info == NULL )
      test_fail( __FILE__, __LINE__, "PAPI_get_hardware_info", 2 );

  EventSet = 	add_two_nonderived_events( &num_events, &PAPI_event, &mask );

  printf("Using %#x for the overflow event\n",PAPI_event);

  if ( PAPI_event == PAPI_FP_INS ) {
      mythreshold = THRESHOLD;
  }
  else {
		#if defined(linux)
      mythreshold = ( int ) hw_info->cpu_max_mhz * 20000;
		#else
      mythreshold = THRESHOLD * 2;
		#endif
  }

  retval = PAPI_start( EventSet );
  if ( retval != PAPI_OK )
      test_fail( __FILE__, __LINE__, "PAPI_start", retval );

  do_flops( NUM_FLOPS );

	/* stop the calibration run */
  retval = PAPI_stop( EventSet, values2[0] );
  if ( retval != PAPI_OK )
      test_fail( __FILE__, __LINE__, "PAPI_stop", retval );


	/* set up overflow handler */
  retval = PAPI_overflow( EventSet, PAPI_event, mythreshold, 0, handler );
  if ( retval != PAPI_OK ) {
      test_fail( __FILE__, __LINE__, "PAPI_overflow", retval );
  }

	/* Start overflow run */
  retval = PAPI_start( EventSet );
  if ( retval != PAPI_OK ) {
      test_fail( __FILE__, __LINE__, "PAPI_start", retval );
  }

  do_flops( num_flops );

	/* stop overflow run */
  retval = PAPI_stop( EventSet, values2[1] );
  if ( retval != PAPI_OK )
      test_fail( __FILE__, __LINE__, "PAPI_stop", retval );
  retval = PAPI_overflow( EventSet, PAPI_event, 0, 0, handler );
  if ( retval != PAPI_OK )
      test_fail( __FILE__, __LINE__, "PAPI_overflow", retval );

  if ( !TESTS_QUIET ) {
      if ( ( retval =
         PAPI_event_code_to_name( PAPI_event, event_name1 ) ) != PAPI_OK )
         test_fail( __FILE__, __LINE__, "PAPI_event_code_to_name", retval );

     printf( "Test case: Overflow dispatch of 2nd event in set with 2 events.\n" );
     printf( "---------------------------------------------------------------\n" );
     printf( "Threshold for overflow is: %d\n", mythreshold );
     printf( "Using %d iterations\n", num_flops );
     printf( "-----------------------------------------------\n" );

     printf( "Test type    : %16d%16d\n", 1, 2 );
     printf( OUT_FMT, event_name1, ( values2[0] )[1], ( values2[1] )[1] );
     printf( OUT_FMT, "PAPI_TOT_CYC", ( values2[0] )[0], ( values2[1] )[0] );
     printf( "Overflows    : %16s%16d\n", "", total );
     printf( "-----------------------------------------------\n" );
 }

 retval = PAPI_cleanup_eventset( EventSet );
 if ( retval != PAPI_OK )
  test_fail( __FILE__, __LINE__, "PAPI_cleanup_eventset", retval );

retval = PAPI_destroy_eventset( &EventSet );
if ( retval != PAPI_OK )
  test_fail( __FILE__, __LINE__, "PAPI_destroy_eventset", retval );

if ( !TESTS_QUIET ) {
  printf( "Verification:\n" );
#if defined(linux) || defined(__ia64__) || defined(_POWER4)
  num_flops *= 2;
#endif
  if ( PAPI_event == PAPI_FP_INS || PAPI_event == PAPI_FP_OPS ) {
     printf( "Row 1 approximately equals %d %d\n", num_flops, num_flops );
 }
 printf( "Column 1 approximately equals column 2\n" );
 printf( "Row 3 approximately equals %u +- %u %%\n",( unsigned ) ( ( values2[0] )[1] / ( long long ) mythreshold ),( unsigned ) ( OVR_TOLERANCE * 100.0 ) );
}

min =
( long long ) ( ( ( double ) values2[0][1] * ( 1.0 - OVR_TOLERANCE ) ) /
  ( double ) mythreshold );
max =
( long long ) ( ( ( double ) values2[0][1] * ( 1.0 + OVR_TOLERANCE ) ) /
  ( double ) mythreshold );
printf( "Overflows: total(%d) > max(%lld) || total(%d) < min(%lld) \n", total,
  max, total, min );
if ( total > max || total < min )
  test_fail( __FILE__, __LINE__, "Overflows", 1 );



printf("Initial thread id is: %lu\n",tid);

	/* Initialize the PAPI library and get the number of counters available */

if ((num_hwcntrs = PAPI_num_counters()) <= 0)  
  handle_error(1);



  /*  The installation supports PAPI, but has no counters */
if ((num_hwcntrs = PAPI_num_counters()) == 0 )
    fprintf(stderr,"Info:: This machine does not provide hardware counters.");

printf("This system has %d available counters.\n", num_hwcntrs);

if (num_hwcntrs > 2)
  num_hwcntrs = 2;

	 /* Start counting events */




if (PAPI_start_counters(Events, num_hwcntrs) != PAPI_OK)
  handle_error(1);

if (argc != 8) {
  printf("\nError :: Ejecutar como : a.out archivo_BD Num_elem archivo_queries Num_queries N_THREADS numero_K Dimension_objetos\n");
  return 0;
}
TOPK = atoi(argv[6]);
DIM = atoi(argv[7]);
double **DB;
	double **Consultas; //Cola de consultas
	int N_QUERIES, N_DB;
	char str_f[256];
	double dato[DIM];
	int j;
	FILE *f_dist, *fquery;
	Elem *heap, e_temp,*answer;
	int *acum, N_THREADS;


	//N_THREADS es el nro. de threads con el que se lanzará la región paralela
	N_THREADS = atoi(argv[5]);
	//N_QUERIES es el nro. de consultas
	N_QUERIES = atoi(argv[4]);
	N_DB = atoi(argv[2]);

	printf("\nN_QUERIES = %d\nN_THREADS = %d\n", N_QUERIES, N_THREADS);
	fflush(stdout);

	acum = (int *) malloc(sizeof (int)*N_THREADS);
	for (i = 0; i < N_THREADS; i++)
		acum[i] = 0;

	sprintf(str_f, "%s", argv[1]);
	printf("\nAbriendo %s... ", argv[1]);
	fflush(stdout);
	f_dist = fopen(str_f, "r");
	printf("OK\n");
	fflush(stdout);


	Consultas = (double **) malloc(sizeof (double *)*N_QUERIES);
	for (i = 0; i < N_QUERIES; i++)
		Consultas[i] = (double *) malloc(sizeof (double)*DIM);

	DB = (double **) malloc(sizeof (double *)*N_DB);
	for (i = 0; i < N_DB; i++)
		DB[i] = (double *) malloc(sizeof (double)*DIM);

	answer = (Elem *)malloc(sizeof(Elem)*N_QUERIES*TOPK);

	printf("\nCargando DB... ");
	fflush(stdout);
	for (i = 0; i < N_DB; i++) {
		//Usar leedato_cophir() cuando se utilice la BD Cophir para no tener problemas con las ","
		//if (leedato_cophir(dato, f_dist) == ERROR || feof(f_dist))
		if (leedato(dato, f_dist) == ERROR || feof(f_dist)) {
			printf("\n\nERROR :: N_DB mal establecido\n\n");
			fflush(stdout);
			fclose(f_dist);
			break;
		}
		copiavalor(DB[i], dato);
	}
	fclose(f_dist);
	printf("OK\n");
	fflush(stdout);

	if ((fquery = fopen(argv[3], "r")) == NULL)
		printf("Error al abrir para lectura el archivo de qeuries: %s\n", argv[3]);
	else
		printf("Abriendo  para lectura %s\n", argv[3]);
	printf("\nCargando Consultas... ");
	fflush(stdout);
	for (i = 0; i < N_QUERIES; i++) {
		//Usar leedato_cophir() cuando se utilice la BD Cophir para no tener problemas con las ","
		//if (leedato_cophir(dato, fquery) == ERROR || feof(fquery))
		if (leedato(dato, fquery) == ERROR || feof(fquery)) {
			printf("\n\nERROR :: N_QUERIES mal establecido, Menos queries que las indicadas\n\n");
			fflush(stdout);
			fclose(fquery);
			break;
		}
		copiavalor(Consultas[i], dato);
	}
	fclose(fquery);
	printf("OK\n");
	fflush(stdout);

	PAPI_start_counters((int*) Events, NUM_EVENTS);
	omp_set_num_threads(N_THREADS);

	elapsed_us = PAPI_get_real_usec(  );

	elapsed_cyc = PAPI_get_real_cyc(  );

	retval =
	PAPI_thread_init( ( unsigned
		long ( * )( void ) ) ( omp_get_thread_num ) );
	if ( retval != PAPI_OK ) {
		if ( retval == PAPI_ECMP )
			test_skip( __FILE__, __LINE__, "PAPI_thread_init", retval );
		else
			test_fail( __FILE__, __LINE__, "PAPI_thread_init", retval );
	}

#pragma omp parallel shared(Consultas, DB, N_QUERIES, N_DB, N_THREADS, acum, DIM)
	{
		float real_time;
		struct timeval t1, t2;
		int i, j;
		Elem *heap, e_temp;
		double d;
		int n_elem = 0;
		int trid = omp_get_thread_num(); //ID del thread
		int procs = omp_get_num_threads(); //Nro. total de threads
		double suma = 0;

		suma = 0;
		heap = (Elem *) malloc(sizeof (Elem) * TOPK);

#pragma omp barrier

#pragma omp master
		{
			gettimeofday(&t1, 0);
		}

		//Cada hilo accede a un subconjunto de las consultas. Cada hio accede de manera circular al arreglo de consultas.
        for (i = trid; i < N_QUERIES; i += procs) {
         n_elem = 0;
         for (j = 0; j < N_DB; j++) {

            d = distancia(Consultas[i], DB[j]);
				//Si la distancia del objeto a la consulta es menor que la raíz del heap, entonces se inserta en el heap. La raíz siempre mantiene la mayor de las distancias

            if(n_elem<TOPK){
               e_temp.dist = d;
               e_temp.ind = j;
               inserta2(heap, &e_temp, &n_elem);
           }
           if (n_elem==TOPK){
               if (d < topH(heap, &n_elem)) {
                  e_temp.dist = d;
                  e_temp.ind = j;
					//Si el heap no está lleno, se inserta el elemento
                  if (n_elem < TOPK)
                     inserta2(heap, &e_temp, &n_elem);
						//Si el heap está lleno, se inserta el elemento nuevo y se saca el que era antes de mayor de distancia. popush2() hace las operaciones de sacar el elemento mayor e insertar el nuevo.
                 else
                     popush2(heap, &n_elem, &e_temp);
             }}
         }

			//En este punto del código se tienen los K elemntos más cercanos a la consulta en 'heap'. Se pueden extraer con extraer2()
         for (j = 0; j < TOPK ; j++) {
           extrae2(heap, &n_elem, &e_temp);
           answer[i*TOPK+j].ind = e_temp.ind;
           answer[i*TOPK+j].dist = e_temp.dist;
       }
			//Realizamos una operación con los resultados para que el compilador no evite hacer instrucciones que considere que el usuario no utiliza. Simplemente cada hilo suma las distancias de los elementos mas cercanos a la consulta 
   }
   Thread( 1000000 * ( tid + 1 ) );



   fflush(stdout);

#pragma omp barrier

#pragma omp master
   {   

    if ( fip > 0 ) {
		/* Setup PAPI library and begin collecting data from the counters */
       if ( fip == 1 ) {
          if ( ( retval =
             PAPI_flips( &real_time, &proc_time, &flpins,
                &mflops ) ) < PAPI_OK )
             test_fail( __FILE__, __LINE__, "PAPI_flips", retval );
     } else {
      if ( ( retval =
         PAPI_flops( &real_time, &proc_time, &flpins,
            &mflops ) ) < PAPI_OK )
         test_fail( __FILE__, __LINE__, "PAPI_flops", retval );
 }

 gettimeofday(&t2, 0);
 real_time = (t2.tv_sec - t1.tv_sec) + (float) (t2.tv_usec - t1.tv_usec) / 1000000;

 Salida_Multihilo = fopen("Salida_Multihilo.txt", "w");
 for (i = 0; i < N_QUERIES; ++i){
  fprintf(Salida_Multihilo, "Consulta id:: %d\n",i);
  for (j = 0; j < TOPK; ++j){
     fprintf(Salida_Multihilo,"ind = %d :: dist = %f\n",answer[(i*TOPK)+j].ind,answer[(i*TOPK)+j].dist);
 }
 fprintf(Salida_Multihilo, "---------------------------------\n");
}
fclose(Salida_Multihilo);

printf("\n\nK = %d", TOPK);
printf("\nReal Time = %f segundos.\n", real_time);
fflush(stdout);


if ( fip == 1 ) {
  if ( ( retval =
     PAPI_flips( &real_time, &proc_time, &flpins,
        &mflops ) ) < PAPI_OK )
     test_fail( __FILE__, __LINE__, "PAPI_flips", retval );
} else {
  if ( ( retval =
     PAPI_flops( &real_time, &proc_time, &flpins,
        &mflops ) ) < PAPI_OK )
     test_fail( __FILE__, __LINE__, "PAPI_flops", retval );
}

if ( !TESTS_QUIET ) {
  if ( fip == 1 ) {
     printf( "Real_time: %f Proc_time: %f Total flpins: ", real_time,
        proc_time );
 } else {
     printf( "Real_time: %f Proc_time: %f Total flpops: ", real_time,
        proc_time );
 }
 printf( LLDFMT, flpins );
 printf( " MFLOPS: %f\n", mflops );
}
}

}
free(heap);



	}//end pragma omp parallel

	elapsed_cyc = PAPI_get_real_cyc(  ) - elapsed_cyc;
	elapsed_us = PAPI_get_real_usec(  ) - elapsed_us;

	if ( !TESTS_QUIET ) {
		printf( "Master real usec   : \t%lld\n", elapsed_us );
		printf( "Master real cycles : \t%lld\n", elapsed_cyc );
	}

	const PAPI_hw_info_t *hwinfo = NULL;
	const PAPI_mh_tlb_info_t *mhinfo = NULL;
	const  PAPI_mh_cache_info_t *mhcacheinfo = NULL;
	const PAPI_mh_level_t *mhlevel = NULL;


	if (PAPI_library_init(PAPI_VER_CURRENT) != PAPI_VER_CURRENT)
		exit(1);
	if ((hwinfo = PAPI_get_hardware_info()) == NULL)
		exit(1);	
	if ((mhinfo = PAPI_get_hardware_info()) == NULL)
		exit(1);
	if ((mhcacheinfo = PAPI_get_hardware_info()) == NULL)
		exit(1);
	if ((mhlevel = PAPI_get_hardware_info()) == NULL)
		exit(1);

	printf("\n\nA continuación información actual del equipo\n\n");

	printf("MH Type %d - Num entries %d  - Associativity %d \n",mhinfo->type, mhinfo->num_entries, mhinfo->associativity);
	printf("Cache MH type %d size %d line size %d num_lines %d Associativity %d\n\n",mhcacheinfo->type, mhcacheinfo->size,mhcacheinfo->line_size, mhcacheinfo->num_lines, mhcacheinfo->associativity);



    retval=papi_print_header("Available PAPI preset and user defined events plus hardware information.\n",&hwinfo );


    printf("Total hardware flops = %lld\n",(float)values[1]);
    printf("L2 data cache misses is %lld\n", values[0]);






    retval = PAPI_stop_counters(values, NUM_EVENTS);
    return 0;
}
Пример #23
0
/**
 * Close the current trace bracket and record one trace event that carries up
 * to 20 numeric parameters.
 *
 * The end timestamp is taken first, the "inside bracket" flag is cleared, and
 * the event is then formatted as text:
 *   - with no BigSim log file (bgfp == NULL) and outputParameters set, a
 *     "time_in_us:{..} event:{..} step:{..} params:{..}" line is inserted
 *     into eventsPool;
 *   - under CMK_BIGSIM_CHARM with a log file open, a "TRACEBIGSIM:" line is
 *     written to that file.
 *
 * @param eventname   Label written inside "event:{ ... }". A null pointer is
 *                    written as an empty label.
 * @param stepno      Step number written inside "step:{ ... }".
 * @param num_params  How many of p1..p20 are meaningful. Values below 0 are
 *                    treated as 0 and values above 20 as 20.
 * @param p1 ... p20  Parameter values; only the first num_params are printed,
 *                    space separated, using "%f".
 *
 * All formatting is bounded: a value that would not fit in its buffer is
 * dropped or truncated instead of overrunning the stack.
 */
void endTraceBigSim_20param(char * eventname, int stepno, int num_params, double p1 , double p2 , double p3 , double p4 , double p5 , double p6 , double p7 , double p8 , double p9 , double p10 , double p11 , double p12 , double p13 , double p14 , double p15 , double p16 , double p17 , double p18 , double p19 , double p20 ) {

#if WITH_MAMBO
    end_time=end();
#else
    CkpvAccess(end_time) = CmiWallTimer();
#endif

    CkAssert(CkpvAccess(insideTraceBracket) == true);
    CkpvAccess(insideTraceBracket) = false;
#ifdef CMK_BIGSIM_CHARM
    char perfCountString[1024];
    perfCountString[0] = 0;
#endif

    enum { MAX_PARAMS = 20, PARAMS_BUF = 2048 };

    // Collect the by-value arguments so they can be formatted in one loop
    // instead of one hand-written sprintf per possible count.
    const double paramValues[MAX_PARAMS] = {
        p1,  p2,  p3,  p4,  p5,  p6,  p7,  p8,  p9,  p10,
        p11, p12, p13, p14, p15, p16, p17, p18, p19, p20
    };

    // Out-of-range counts used to leave `params` uninitialized; clamp instead.
    int paramCount = num_params;
    if (paramCount < 0) paramCount = 0;
    if (paramCount > MAX_PARAMS) paramCount = MAX_PARAMS;

    char params[PARAMS_BUF];
    params[0] = 0;
    size_t paramsLen = 0;
    for (int i = 0; i < paramCount; i++) {
        const size_t room = sizeof(params) - paramsLen;
        const int written = snprintf(params + paramsLen, room, "%s%f",
                                     (i == 0) ? "" : " ", paramValues[i]);
        if (written < 0 || (size_t)written >= room) {
            // "%f" of a huge double can be hundreds of characters long; drop
            // the value that did not fit rather than keep a partial number.
            params[paramsLen] = 0;
            break;
        }
        paramsLen += (size_t)written;
    }

    // Sized so that the wrapper text plus a full `params` always fits.
    char paramString[PARAMS_BUF + 16];
    snprintf(paramString, sizeof(paramString), "params:{ %s }", params);

    // The precision caps the copied name so the closing brace is never lost.
    char eventNameString[1024];
    snprintf(eventNameString, sizeof(eventNameString), "event:{ %.1000s }",
             eventname ? eventname : "");

#ifdef BIGSIM_PAPI
    // NOTE(review): perfCountString is only declared under CMK_BIGSIM_CHARM,
    // so this section presumably requires both macros -- confirm.
    // Stop the counters outside the assertion so the call still happens in
    // case CkAssert is compiled out.
    const int papiStatus = PAPI_stop_counters(values, NUM_PAPI_EVENTS);
    CkAssert(papiStatus == PAPI_OK);
    (void)papiStatus;

    snprintf(perfCountString, sizeof(perfCountString), " PAPI:{ ");

    for (int i = 0; i < NUM_PAPI_EVENTS; i++) {
        const size_t used = strlen(perfCountString);
        snprintf(perfCountString + used, sizeof(perfCountString) - used, " %lld ", values[i]);
    }

    printf("value=%lld\n", values[0]);

    {
        const size_t used = strlen(perfCountString);
        snprintf(perfCountString + used, sizeof(perfCountString) - used, " }");
    }
#endif

    // Large enough for the time value plus the event, step and parameter
    // strings that are concatenated into it below.
    char timeString[4096];
    timeString[0] = 0;
    char stepString[128];
    stepString[0] = 0;
    snprintf(stepString, sizeof(stepString), "step:{ %d }", stepno);

#if ! CMK_BIGSIM_CHARM
#if WITH_MAMBO
    snprintf(timeString, sizeof(timeString), "time_in_cycles:{ %llu }",  end_time-start_time);
#endif
#endif

    if (CkpvAccess(bgfp) == NULL) {
        if (CkpvAccess(outputParameters)) {
            double t = CkpvAccess(end_time)-CkpvAccess(start_time);
            if (t<0.0) {
                // Report the negative interval, then clamp it to zero.
                CmiPrintf("time: %f\n", t);
                t = 0.0;
            }
            CmiAssert(t >= 0.0);

            snprintf(timeString, sizeof(timeString), "time_in_us:{ %lf } %s %s %s\n",
                     t*1e6, eventNameString, stepString, paramString);
            CkpvAccess(eventsPool).insert(timeString);
        }
    }


#if SPLIT_APART_CYCLE_ACCURATE
    SimParameters *simParams = Node::Object()->simParameters;
    if(simParams->bgSplitNumProcs != -1 && simParams->bgSplitMyProc!=-1){
        if( ((bgTraceCounter) % simParams->bgSplitNumProcs) == simParams->bgSplitMyProc){
            // Do slow mambo simulation for this case!
            // Counter is incremented only in startTraceBigSim()
        }
    }
#endif
#ifdef CMK_BIGSIM_CHARM

    char sequenceString[128];
    sequenceString[0] = 0;

    BgMark("endTraceBigSim %f\n");
    if (CkpvAccess(bgfp) != NULL) {
        // write event ID
        int seqno = tTIMELINEREC.length()-1;
        if (seqno<0) CkAbort("Traces are not generated. Please run emulation with +bglog");
        fprintf(CkpvAccess(bgfp),"%d ",seqno);
        snprintf(sequenceString, sizeof(sequenceString), "seqno:{ %d } ",seqno);
        fprintf(CkpvAccess(bgfp), "TRACEBIGSIM: %s %s %s %s %s %s\n", eventNameString, stepString, sequenceString, timeString, perfCountString, paramString);
    }
#else
/*
//  printf("TRACEBIGSIM: %s %s %s %s %s\n", eventNameString, sequenceString, timeString, perfCountString, paramString);
  if (CkpvAccess(bgfp) != NULL) {
  fprintf(CkpvAccess(bgfp), "TRACEBIGSIM: %s %s %s %s %s\n", eventNameString, sequenceString, timeString, perfCountString, paramString);
  }
*/
#endif


}
Пример #24
0
/*
 * Core 2 counter-constraint test.
 *
 * Counts FP_COMP_OPS_EXE during a naive matrix multiply twice: once with the
 * event in the first slot of the event list (PAPI_TOT_INS second) and once
 * with it in the second slot (PAPI_TOT_INS first). Each run's count is
 * compared against the estimate from naive_matrix_multiply_estimated_flops();
 * the test fails when the count exceeds the estimate by more than 1%.
 *
 * The test is skipped on anything other than Intel family 6, model 15, 23
 * or 29.
 *
 * NOTE(review): the flow relies on test_fail()/test_skip() not returning
 * (e.g. `info` is dereferenced right after a NULL check) -- confirm.
 * NOTE(review): only a positive error is checked, so an under-count passes;
 * confirm whether that is intended.
 */
int main(int argc, char **argv) {

   int retval,quiet;
   const PAPI_hw_info_t *info;

   int events[2];
   long long counts[2] = {0, 0};
   double error;
   long long expected;

   char test_string[]="Testing core2_constraints...";

   quiet=test_quiet();

   retval = PAPI_library_init(PAPI_VER_CURRENT);
   if (retval != PAPI_VER_CURRENT) {
      if (!quiet) printf("ERROR: PAPI_library_init %d\n", retval);
      test_fail(test_string);
   }

   if ( (info=PAPI_get_hardware_info())==NULL) {
      if (!quiet) printf("cannot obtain hardware info %d\n",retval);
      test_fail(test_string);
   }

   if ((info->vendor==PAPI_VENDOR_INTEL) && (info->cpuid_family==6) &&
       ((info->cpuid_model==15) || (info->cpuid_model==23) || (info->cpuid_model==29))) {

      if (!quiet) printf("Found core2!\n");
   }
   else {
      if (!quiet) printf("Not a core2.\n");
      test_skip(test_string);
   }

   expected=naive_matrix_multiply_estimated_flops(quiet);

   /* First run: FP_COMP_OPS_EXE in slot 1, PAPI_TOT_INS in slot 2 */

   retval=PAPI_event_name_to_code("FP_COMP_OPS_EXE",&events[0]);
   if (retval!=PAPI_OK) {
      if (!quiet) printf("PAPI_event_name_to_code %d\n", retval);
      test_fail(test_string);
   }

   events[1]=PAPI_TOT_INS;

   /* A failed start/stop would leave counts[] meaningless, so fail loudly */
   retval=PAPI_start_counters(events,2);
   if (retval!=PAPI_OK) {
      if (!quiet) printf("PAPI_start_counters %d\n", retval);
      test_fail(test_string);
   }

   naive_matrix_multiply(quiet);

   retval=PAPI_stop_counters(counts,2);
   if (retval!=PAPI_OK) {
      if (!quiet) printf("PAPI_stop_counters %d\n", retval);
      test_fail(test_string);
   }

   error=(((double)counts[0]-(double)expected)/(double)expected)*100.0;
   if (!quiet) printf("   Expected: %lld  Actual: %lld   Error: %.2lf\n",
             expected, counts[0],error);

   if (error > 1.0) {
      if (!quiet) printf("FP error higher than expected\n");
      test_fail(test_string);
   }

   /* set FP_COMP_OPS_EXE to be in slot 2 */

   retval=PAPI_event_name_to_code("FP_COMP_OPS_EXE",&events[1]);
   if (retval!=PAPI_OK) {
      if (!quiet) printf("PAPI_event_name_to_code %d\n",retval);
      test_fail(test_string);
   }
   /* Slot 1 takes the other event; the same event in both slots would
      not exercise the reordered placement this run is meant to test. */
   events[0]=PAPI_TOT_INS;

   retval=PAPI_start_counters(events,2);
   if (retval!=PAPI_OK) {
      if (!quiet) printf("PAPI_start_counters %d\n", retval);
      test_fail(test_string);
   }

   naive_matrix_multiply(quiet);

   retval=PAPI_stop_counters(counts,2);
   if (retval!=PAPI_OK) {
      if (!quiet) printf("PAPI_stop_counters %d\n", retval);
      test_fail(test_string);
   }

   error=(((double)counts[1]-(double)expected)/(double)expected)*100.0;
   if (!quiet) printf("   Expected: %lld  Actual: %lld   Error: %.2lf\n",
             expected, counts[1],error);

   if (error > 1.0) {
      if (!quiet) printf("FP error higher than expected\n");
      test_fail(test_string);
   }

   PAPI_shutdown();

   test_pass(test_string);

   return 0;
}
Пример #25
0
int main(int argc, char *argv[])
{
  int size, rank, world_rank, my_group;
  int num_lsms; // number of parallel LSMS instances
  int size_lsms; // number of atoms in a lsms instance
  int num_steps; // number of energy calculations
  int initial_steps; // number of steps before sampling starts
  int stepCount=0; // count the Monte Carlo steps executed
  double max_time; // maximum walltime for this run in seconds
  bool restrict_time = false;       // was the maximum time specified?
  bool restrict_steps = false; // or the max. numer of steps?
  int align; // alignment of lsms_instances
  
  double magnetization;
  double energy_accumulator; // accumulates the enegy to calculate the mean
  int energies_accumulated;


  int new_peid,new_root;
  static int op,flag;
  double *evec,*r_values;
  evec=(double *)shmalloc(sizeof(double)*3*size_lsms);
  r_values=(double *)shmalloc(sizeof(double)*(R_VALUE_OFFSET+3*(size_lsms+1)));




  energy_accumulator=0.0;
  energies_accumulated=0;

  double walltime_0,walltime;

  double restartWriteFrequency=30.0*60.0;
  double nextWriteTime=restartWriteFrequency;

  MPI_Comm local_comm;
  int *lsms_rank0;
  MPI_Status status;

  char prefix[40];
  char i_lsms_name[64];
  char gWL_in_name[64], gWL_out_name[64];
  char mode_name[64];
  char energy_calculation_name[64];
  char stupid[37];

  char step_out_name[64];
  char wl_step_out_name[128];
  char *wl_stepf=NULL;
  bool step_out_flag=false;
  std::ofstream step_out_file;
  typedef enum {Constant, Random, WangLandau_1d, ExhaustiveIsing, WangLandau_2d} EvecGenerationMode;
  typedef enum {MagneticMoment, MagneticMomentZ, MagneticMomentX, MagneticMomentY} SecondDimension;

  EvecGenerationMode evec_generation_mode = Constant;
  SecondDimension second_dimension = MagneticMoment;
  double ev0[3];

  bool return_moments_flag=true; // true-> return all magnetic moments from lsms run at each step.
  bool generator_needs_moment=false;

  typedef enum {OneStepEnergy, MultiStepEnergy, ScfEnergy} EnergyCalculationMode;
  EnergyCalculationMode energyCalculationMode = OneStepEnergy;
  int energyIndex=1; // index for the return value to use for the MC step (0: total energy, 1: band energy)

  ev0[0]=ev0[1]=0.0; ev0[2]=1.0;
  // size has to be align + size_lsms*num_lsms
  align=1;
  num_lsms=1;
  size_lsms=-1;
  my_group=-1;
  num_steps=1;
  initial_steps=0;

  sprintf(i_lsms_name,"i_lsms");
  gWL_in_name[0]=gWL_out_name[0]=0;
  mode_name[0]=0;
  energy_calculation_name[0]=0;

  // check command line arguments
  for(int i=0; i<argc; i++)
  {
    if(!strcmp("-num_lsms",argv[i])) num_lsms=atoi(argv[++i]);
    if(!strcmp("-size_lsms",argv[i])) size_lsms=atoi(argv[++i]);
    if(!strcmp("-align",argv[i])) align=atoi(argv[++i]);
    if(!strcmp("-num_steps",argv[i])) {num_steps=atoi(argv[++i]); restrict_steps=true;}
    if(!strcmp("-initial_steps",argv[i])) initial_steps=atoi(argv[++i]); 
    if(!strcmp("-walltime",argv[i])) {max_time=60.0*atof(argv[++i]); restrict_time=true;}
    if(!strcmp("-i",argv[i])) strncpy(i_lsms_name,argv[++i],64);
    if(!strcmp("-random_dir",argv[i])) {evec_generation_mode = Random;}
    if(!strcmp("-step_out",argv[i]))
    {strncpy(step_out_name,argv[++i],64); step_out_flag=true;
      return_moments_flag=true;}
    if(!strcmp("-wl_out", argv[i])) strncpy(gWL_out_name,argv[++i],64);
    if(!strcmp("-wl_in", argv[i])) strncpy(gWL_in_name,argv[++i],64);
    if(!strcmp("-mode", argv[i])) strncpy(mode_name,argv[++i],64);
    if(!strcmp("-energy_calculation",argv[i])) strncpy(energy_calculation_name,argv[++i],64);
  }

  if(!(restrict_steps || restrict_time)) restrict_steps=true;

  if(mode_name[0]!=0)
  {
    if(!strcmp("constant",mode_name)) evec_generation_mode = Constant;
    if(!strcmp("random",mode_name)) evec_generation_mode = Random;
    if(!strcmp("1d",mode_name)) evec_generation_mode = WangLandau_1d;
    if(!strcmp("ising",mode_name)) evec_generation_mode = ExhaustiveIsing;
    if(!strcmp("2d",mode_name)) evec_generation_mode = WangLandau_2d;
    if(!strcmp("2d-m",mode_name)) {evec_generation_mode = WangLandau_2d; second_dimension=MagneticMoment;}
    if(!strcmp("2d-x",mode_name)) {evec_generation_mode = WangLandau_2d; second_dimension=MagneticMomentX;}
    if(!strcmp("2d-y",mode_name)) {evec_generation_mode = WangLandau_2d; second_dimension=MagneticMomentY;}
    if(!strcmp("2d-z",mode_name)) {evec_generation_mode = WangLandau_2d; second_dimension=MagneticMomentZ;}
  }

  if(energy_calculation_name[0]!=0)
  {
    if(energy_calculation_name[0]=='o') { energyCalculationMode = OneStepEnergy; energyIndex=1; }
    if(energy_calculation_name[0]=='m') { energyCalculationMode = MultiStepEnergy; energyIndex=1; }
    if(energy_calculation_name[0]=='s') { energyCalculationMode = ScfEnergy; energyIndex=0; }
  }

#ifdef USE_PAPI
#define NUM_PAPI_EVENTS 4
  int hw_counters = PAPI_num_counters();
  if(hw_counters>NUM_PAPI_EVENTS) hw_counters=NUM_PAPI_EVENTS;
  int papi_events[NUM_PAPI_EVENTS]; // = {PAPI_TOT_INS,PAPI_TOT_CYC,PAPI_FP_OPS,PAPI_VEC_INS};
  char *papi_event_name[] = {"PAPI_TOT_INS","PAPI_FP_OPS",
                             "RETIRED_SSE_OPERATIONS:DOUBLE_ADD_SUB_OPS:DOUBLE_MUL_OPS:DOUBLE_DIV_OPS:OP_TYPE",
                             "RETIRED_SSE_OPERATIONS:SINGLE_ADD_SUB_OPS:SINGLE_MUL_OPS:SINGLE_DIV_OPS:OP_TYPE"};
  // "RETIRED_INSTRUCTIONS",
  // "RETIRED_MMX_AND_FP_INSTRUCTIONS:PACKED_SSE_AND_SSE2",
  // "RETIRED_SSE_OPERATIONS:DOUBLE_ADD_SUB_OPS:DOUBLE_MUL_OPS:DOUBLE_DIV_OPS:1",
  // "RETIRED_SSE_OPERATIONS:SINGLE_ADD_SUB_OPS:SINGLE_MUL_OPS:SINGLE_DIV_OPS:1"
  // get events from names:
  for(int i=0; i<NUM_PAPI_EVENTS; i++)
  {
    if(PAPI_event_name_to_code(papi_event_name[i],&papi_events[i]) != PAPI_OK)
    {
      // printline("Error in obtaining PAPI event code for: "+ttos(papi_event_name[i]),
      //           std::cerr,parameters.myrankWorld);
      // printline("Skipping all following events",
      //           std::cerr,parameters.myrankWorld);
      if(hw_counters>i) hw_counters=i;
    }
  }
  long long papi_values[NUM_PAPI_EVENTS+4];
  // printline("PAPI: "+ttos(hw_counters)+" counters available",std::cout,parameters.myrankWorld);
  if(hw_counters>NUM_PAPI_EVENTS) hw_counters=NUM_PAPI_EVENTS;
  long long papi_real_cyc_0 = PAPI_get_real_cyc();
  long long papi_real_usec_0 = PAPI_get_real_usec();
  long long papi_virt_cyc_0 = PAPI_get_virt_cyc();
  long long papi_virt_usec_0 = PAPI_get_virt_usec();
  PAPI_start_counters(papi_events,hw_counters);
#endif


  lsms_rank0=(int *)malloc(sizeof(int)*(num_lsms+1));

  // initialize MPI:
  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  world_rank=rank;
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  walltime_0 = get_rtc();

#ifndef SVN_REV
#define SVN_REV "unknown"
#endif

// make sure 'return_moments_flag' is set correctly
  switch(evec_generation_mode)
  {
  case Constant : break;
  case Random : break;
  case WangLandau_1d :
    return_moments_flag = true;
    generator_needs_moment = true;
    break;
  case ExhaustiveIsing : break;
  case WangLandau_2d :
    return_moments_flag = true;
    generator_needs_moment = true;
    break;
  default: std::cout<<" ERROR: UNKNOWN EVEC GENERATION MODE\n"; exit(1);
  }

  if(rank==0)
  {
    std::cout<<"LSMS_3"<<std::endl;
    std::cout<<" SVN revision "<<SVN_REV<<std::endl<<std::endl;
#ifdef USE_PAPI
    std::cout<<" Using Papi counters"<<std::endl<<std::endl; 
#endif
    std::cout<<" Size of LSMS instances = "<<size_lsms<<" atoms\n";
    std::cout<<" Number of LSMS instances = "<<num_lsms<<std::endl;
    std::cout<<" LSMS Energy calculated using ";
    switch(energyCalculationMode)
    {
    case OneStepEnergy: std::cout<<"oneStepEnergy [frozen potential band energy]"<<std::endl; break;
    case MultiStepEnergy: std::cout<<"multiStepEnergy [frozen potential band energy with converged Fermi energy]"<<std::endl; break;
    case ScfEnergy: std::cout<<"scfEnergy [self-consistent total energy]"<<std::endl; break;
    default: std::cout<<"UNKNOWN ENERGY CALCULATION METHOD"<<std::endl; exit(1);
    }
    if(restrict_steps) std::cout<<" Number of gWL steps = "<<num_steps<<std::endl;
    if(restrict_time) std::cout<<" Maximum walltime = "<<max_time<<"s\n";
    std::cout<<" Processor alignment (process allocation quantization) = "<<align<<std::endl;
    switch(evec_generation_mode)
    {
    case Constant : std::cout<<" Constant moments direction along "
                             <<ev0[0]<<" "<<ev0[1]<<" "<<ev0[2]<<std::endl;
      break;
    case Random : std::cout<<" Random distribution of moments (no Wang-Landau)"<<std::endl;
      break;
    case WangLandau_1d : std::cout<<" Wang-Landau for one continuous variable (energy)"<<std::endl;
//      return_moments_flag = true;
//      generator_needs_moment = true;
      break;
    case ExhaustiveIsing : std::cout<<" Exhaustive Ising sampling"<<std::endl; break;
    case WangLandau_2d : std::cout<<" Wang-Landau for two continuous variable (energy, ";
      switch(second_dimension)
      {
      case MagneticMoment  : std::cout<<"magnitude of magnetization)"; break;
      case MagneticMomentX : std::cout<<"x component of magnetization)"; break;
      case MagneticMomentY : std::cout<<"y component of magnetization)"; break;
      case MagneticMomentZ : std::cout<<"z component of magnetization)"; break;
      }
      std::cout<<std::endl;
//      return_moments_flag = true;
//      generator_needs_moment = true;
      break;
    default: std::cout<<" ERROR: UNKNOWN EVEC GENERATION MODE\n"; exit(1);
    }
    if(step_out_flag) std::cout<<" Step output written to: "<<step_out_name<<std::endl;
    std::cout<<std::endl;

    if(step_out_flag && (evec_generation_mode==WangLandau_1d))
    {
      // step_out_flag=false;
      snprintf(wl_step_out_name,127,"wl1d_%s",step_out_name);
      wl_stepf=wl_step_out_name;
    }

    if(step_out_flag)
    {
      step_out_file.open(step_out_name);
      step_out_file<<"#";
      for(int i=0; i<argc; i++) step_out_file<<" "<<argv[i];
      step_out_file<<std::endl<<size_lsms<<std::endl;
    }
  }

  if(generator_needs_moment) return_moments_flag=true;

  if(num_lsms==1)
  {
    SHMEM_activeset local_comm;
    local_comm.rank=shmem_my_pe();
    local_comm.size=shmem_n_pes();
    local_comm.start_pe=0;
    local_comm.logPE_stride=0;
    LSMS lsms_calc(local_comm,i_lsms_name,"1_");
      
    if(rank==0)
    {
      std::cout<<"executing LSMS(C++) for "<<lsms_calc.numSpins()<<" atoms\n";
      std::cout<<"  LSMS version = "<<lsms_calc.version()<<std::endl;
    }

    if(energyCalculationMode==OneStepEnergy)
      std::cout<<"one step Energy = "<<lsms_calc.oneStepEnergy()<<std::endl;
    else if(energyCalculationMode==MultiStepEnergy)
      std::cout<<"multi-step Energy = "<<lsms_calc.multiStepEnergy()<<std::endl;
    else if(energyCalculationMode==ScfEnergy)
      std::cout<<"self-consistent Energy = "<<lsms_calc.scfEnergy()<<std::endl;
    else
    {
      printf("ERROR: Unknown energy calculation mode for lsms_calc in wl-lsms main!\n");
     // MPI_Abort(MPI_COMM_WORLD,5);
      exit(5);
    }
  }
  else
  {
    // build the communicators
    //int color=MPI_UNDEFINED;
    //Assuming user passes a power of two while using "-align"
    int s = align;
    int comm_size=(size-align)/num_lsms;
    int world_rank;
    for(int i=0; i<num_lsms; i++)
    {
      if((world_rank>=s) && (world_rank<s+comm_size)) 
      { 
        my_group=i; 
        //color=i; 
        new_peid=world_rank-s;
        new_root=s;
      }
      lsms_rank0[i]=s;
      s+=comm_size;
    }
    if(world_rank==0){ 
      //color=num_lsms;
      new_peid=0;
      comm_size=1;
      new_root=0;
    }

    //MPI_Comm_split(MPI_COMM_WORLD, color, 0, &local_comm);
    SHMEM_activeset local_comm;
    local_comm.rank=new_peid;
    local_comm.size=comm_size;
    local_comm.start_pe=new_root;
    local_comm.logPE_stride=0;

    std::cout<<"world_rank="<<world_rank<<" -> group="<<my_group<<std::endl;

      
    snprintf(prefix,38,"Group %4d: ",my_group);

    // now we get ready to do some calculations...

    if(my_group>=0)
    {
      double energy;
      double band_energy;
      int static i_values[10];
      double static r_values[10];
      static int op;


      //MPI_Comm_rank(local_comm, &rank);
      rank = local_comm.rank;
      snprintf(prefix,38,"%d_",my_group);
      // to use the ramdisk on jaguarpf:
      // snprintf(prefix,38,"/tmp/ompi/%d_",my_group);
      LSMS lsms_calc(local_comm,i_lsms_name,prefix);
      snprintf(prefix,38,"Group %4d: ",my_group);

      if(rank==0 && my_group==0)
      {
        std::cout<<prefix<<"executing LSMS(C++) for "<<lsms_calc.numSpins()<<" atoms\n";
        std::cout<<prefix<<"  LSMS version = "<<lsms_calc.version()<<std::endl;
      }

      // wait for commands from master
      bool finished=false;
      while(!finished)
      {
        if(rank==0)
        {
          //MPI_Recv(evec,3*size_lsms,MPI_DOUBLE,0,MPI_ANY_TAG,MPI_COMM_WORLD,&status);
          //op =status.MPI_TAG;
          if (lsms_rank0[0]==world_rank)
                shmem_barrier(0, lsms_rank0[0], 2, pSync1);

        }
        //MPI_Bcast(&op,1,MPI_INT,0,local_comm);
        shmem_broadcast32(&op, &op, 1, local_comm.start_pe, local_comm.start_pe, local_comm.logPE_stride, local_comm.size, pSync2); 

/* recognized opcodes:
   5: calculate energy

   recognized energy calculation modes:
   OneStepEnergy : calculate frozen potential band energy in one step (don't converge Ef)
   use only if the Fermi energy will not change due to MC steps!
   The only method available in LSMS_1.9
   MultiStepEnergy : calculate frozen potential band energy after converging Fermi energy
   This should be the new default method. If the Fermi energy doesn't change
   multiStepEnergy only performs one step and should be equivalent to oneStepEnergy
   The tolerance for Ef convergence can be set with LSMS::setEfTol(Real).
   The default tolerance is set in the LSMS::LSMS constructor (currently 1.0e-6).
   The maximum number of steps is read from the LSMS input file 'nscf' parameter.
   ScfEnergy : this will calculate the selfconsistent total energy.
   The maximum number of steps is read from the LSMS input file 'nscf' parameter.
   NOT IMPLEMENTED YET!!!

   10: get number of sites
*/

        if(op==5)
        {
          lsms_calc.setEvec(evec);
          if(energyCalculationMode==OneStepEnergy)
            energy=lsms_calc.oneStepEnergy(&band_energy);
          else if(energyCalculationMode==MultiStepEnergy)
            band_energy=energy=lsms_calc.multiStepEnergy();
          else if(energyCalculationMode==ScfEnergy)
            energy=lsms_calc.scfEnergy(&band_energy);
          else
          {
            printf("ERROR: Unknown energy calculation mode for lsms_calc in wl-lsms main!\n");
            //MPI_Abort(MPI_COMM_WORLD,5);
            exit(5);
          }
          r_values[0]=energy;
          r_values[1]=band_energy;
          if(return_moments_flag)
          {
            lsms_calc.getMag(&r_values[R_VALUE_OFFSET]);
          }
          if(rank==0)
          {
            if(return_moments_flag)
            {
              //MPI_Send(r_values,R_VALUE_OFFSET+3*size_lsms,MPI_DOUBLE,0,1005,MPI_COMM_WORLD);
              shmem_double_put(r_values, r_values, R_VALUE_OFFSET+3*size_lsms, 0);

            } else {
              //MPI_Send(r_values,R_VALUE_OFFSET,MPI_DOUBLE,0,1005,MPI_COMM_WORLD);
              shmem_double_put(r_values, r_values, R_VALUE_OFFSET, 0);
            }
            shmem_fence();
            shmem_int_swap(&flag, world_rank, 0);

          }
              
        } else if(op==10) {
          i_values[0]=lsms_calc.numSpins();
          //MPI_Send(i_values,10,MPI_INT,0,1010,MPI_COMM_WORLD);
          shmem_int_put(i_values, i_values, 10, 0);
        } else {
          // printf("world rank %d: recieved exit\n",world_rank); 
          finished=true;
        }
      }

      shfree(evec);
      //shfree(r_values);
    }
    else if(world_rank==0)
    {
      int running;
      double **evecs;
      //double *r_values;
      //int i_values[10];
      int *init_steps;
      int total_init_steps;
      bool accepted;
        
      char *wl_inf=NULL;
      char *wl_outf=NULL;
      if(gWL_in_name) wl_inf=gWL_in_name;
      if(gWL_out_name) wl_outf=gWL_out_name;
        
      EvecGenerator *generator;

/*
      // get number of spins from first LSMS instance
      // temp r_values:
      r_values=(double *)malloc(sizeof(double)*10);
      MPI_Send(r_values,1,MPI_DOUBLE, lsms_rank0[0], 10, MPI_COMM_WORLD);
      free(r_values);
      MPI_Recv(i_values,10,MPI_INT,lsms_rank0[0],1010,MPI_COMM_WORLD,&status);
      if(i_values[0]!=size_lsms)
      {
        printf("Size specified for Wang-Landau and in LSMS input file don't match!\n");
        size_lsms=i_values[0];
      }
*/

      evecs=(double **)shmalloc(sizeof(double *)*num_lsms);
      init_steps=(int *)shmalloc(sizeof(int)*num_lsms);
      for(int i=0; i<num_lsms; i++)
      {
        evecs[i]=(double *)shmalloc(sizeof(double)*3*size_lsms);
        init_steps[i]=initial_steps;
      }
      total_init_steps=num_lsms*initial_steps;
        

      // Initialize the correct evec generator
      switch(evec_generation_mode)
      {
      case Random :  generator = new RandomEvecGenerator(size_lsms);
        break;
      case Constant: generator = new ConstantEvecGenerator(size_lsms, ev0, num_lsms);
        break;
     //case WangLandau_1d : generator = new WL1dEvecGenerator<std::mt19937>(size_lsms, num_lsms,
     //                                                                      evecs, wl_inf, wl_outf, wl_stepf);
     case WangLandau_1d : generator = new WL1dEvecGenerator<boost::mt19937>(size_lsms, num_lsms,
                                                                           evecs, wl_inf, wl_outf, wl_stepf);
        break;
      case ExhaustiveIsing : generator = new ExhaustiveIsing1dEvecGenerator(size_lsms, num_lsms,
                                                                            evecs, wl_inf, wl_outf);
        break;
      //case WangLandau_2d : generator = new WL2dEvecGenerator<std::mt19937>(size_lsms, num_lsms,
      //                                                                     evecs, wl_inf, wl_outf, wl_stepf);
      case WangLandau_2d : generator = new WL2dEvecGenerator<boost::mt19937>(size_lsms, num_lsms,
                                                                           evecs, wl_inf, wl_outf, wl_stepf);
        break;
      default: std::cerr<<"The code should never arrive here: UNKNOWN EVEC GENERATION MODE\n";
        exit(1);
      }

      for(int i=0; i<num_lsms; i++)
      {
        generator->initializeEvec(i,evecs[i]);
      }
      std::cout<<"This is the master node\n";
      // issue initial commands to all LSMS instances
      running=0;
      bool more_work=true;
      if(total_init_steps>0)
      {
        for(int i=0; i<num_lsms; i++)
        {
          std::cout<<"starting initial calculation in group "<<i<<std::endl;
          //MPI_Send(evecs[i], 3*size_lsms, MPI_DOUBLE, lsms_rank0[i], 5, MPI_COMM_WORLD);
          shmem_double_put(evec, evecs[i], 3*size_lsms, lsms_rank0[i]);
          shmem_int_p(&op, 5, lsms_rank0[i]);
          shmem_fence();


          num_steps--; running++; stepCount++;
          if(restrict_steps) std::cout<<"      "<<num_steps<<" steps remaining\n";
        }
        shmem_barrier(0, lsms_rank0[0], 2, pSync1);
        // first deal with the initial steps:
        while(running>0)
        {
          //if(return_moments_flag)
          //  MPI_Recv(r_values,R_VALUE_OFFSET+3*size_lsms,MPI_DOUBLE,MPI_ANY_SOURCE,MPI_ANY_TAG,MPI_COMM_WORLD,&status);
          //else
          //  MPI_Recv(r_values,R_VALUE_OFFSET,MPI_DOUBLE,MPI_ANY_SOURCE,MPI_ANY_TAG,MPI_COMM_WORLD,&status);
          
          shmem_int_wait(&flag,-1);

          running--;
          // std::cout<<"received energy E_tot ="<<r_values[0]<<std::endl;
          // std::cout<<"    band energy E_band="<<r_values[1]<<std::endl;
          if(total_init_steps>0)
          {
            //int r_group=(status.MPI_SOURCE-align)/comm_size;
            int r_group=(flag-align)/comm_size;
            std::cout<<"starting additional calculation in group "<<r_group<<std::endl;

            if(init_steps[r_group]>0)
            {
              more_work = !(generator->generateUnsampledEvec(r_group,evecs[r_group],r_values[energyIndex]));
              init_steps[r_group]--; total_init_steps--;
            }
                
            //MPI_Send(evecs[r_group], 3*size_lsms, MPI_DOUBLE, lsms_rank0[r_group], 5, MPI_COMM_WORLD);
            shmem_double_put(r_values, evecs[r_group],  3*size_lsms, lsms_rank0[r_group]); //TODO check this
            shmem_fence();
                
            num_steps--; running++; stepCount++;
            if(restrict_steps && num_steps<=0) more_work=false;
            if(restrict_steps) std::cout<<"      "<<num_steps<<" steps remaining\n";
            walltime = get_rtc() - walltime_0;
            if(restrict_time && walltime>=max_time) more_work=false;
            if(restrict_time) std::cout<<"      "<<max_time-walltime<<" seconds remaining\n";
          }
              
        }
      }
      more_work=true;
      running=0;
      for(int i=0; i<num_lsms; i++)
      {
        std::cout<<"starting main calculation in group "<<i<<std::endl;
        //MPI_Send(evecs[i], 3*size_lsms, MPI_DOUBLE, lsms_rank0[i], 5, MPI_COMM_WORLD);
        shmem_double_put(evec, evecs[i], 3*size_lsms, lsms_rank0[i]);
        shmem_int_p(&op, 5, lsms_rank0[i]);
        shmem_fence();
        num_steps--; running++; stepCount++;
        if(restrict_steps) std::cout<<"      "<<num_steps<<" steps remaining\n";
      }
      shmem_barrier(0, lsms_rank0[0], 2, pSync1);
        
      generator->startSampling();
      // wait for results and issue new commands or wind down
      while(running>0)
      {
        //MPI_Recv(r_values,R_VALUE_OFFSET+3*size_lsms,MPI_DOUBLE,MPI_ANY_SOURCE,MPI_ANY_TAG,MPI_COMM_WORLD,&status);
        shmem_int_wait(&flag,-1);

        running--;
        std::cout<<"received energy E_tot ="<<r_values[0]<<std::endl;
        std::cout<<"    band energy E_band="<<r_values[1]<<std::endl;
        // printf("from status.MPI_SOURCE=%d\n",status.MPI_SOURCE);
        energy_accumulator+=r_values[0]; energies_accumulated++;
        if(more_work)
        {
          int r_group=(status.MPI_SOURCE-align)/comm_size;
          std::cout<<"starting additional calculation in group "<<r_group<<std::endl;
              
          if(generator_needs_moment)
          {
            double m0,m1,m2;
            m0=0.0; m1=0.0; m2=0.0;
            for(int i=0; i<3*size_lsms; i+=3)
            {
              m0+=r_values[R_VALUE_OFFSET+i];
              m1+=r_values[R_VALUE_OFFSET+i+1];
              m2+=r_values[R_VALUE_OFFSET+i+2];
            }
            switch(second_dimension)
            {
            case  MagneticMoment : magnetization=std::sqrt(m0*m0+m1*m1+m2*m2); break;
            case  MagneticMomentX : magnetization=m0; break;
            case  MagneticMomentY : magnetization=m1; break;
            case  MagneticMomentZ : magnetization=m2; break;
            }
            if(generator->generateEvec(r_group,evecs[r_group],r_values[energyIndex],magnetization, &accepted))
              more_work=false;
          } else {
            if(generator->generateEvec(r_group,evecs[r_group],r_values[energyIndex], &accepted)) more_work=false;
          }

          //MPI_Send(evecs[r_group], 3*size_lsms, MPI_DOUBLE, lsms_rank0[r_group], 5, MPI_COMM_WORLD);
          shmem_double_put(r_values, evecs[r_group],  3*size_lsms, lsms_rank0[r_group]); //TODO check this
          shmem_fence();

          num_steps--; running++; stepCount++;
          if(restrict_steps && num_steps<=0) more_work=false;
          if(restrict_steps) std::cout<<"      "<<num_steps<<" steps remaining\n";
          walltime = get_rtc() - walltime_0;
          if(restrict_time && walltime>=max_time) more_work=false;
          if(restrict_time) std::cout<<"      "<<max_time-walltime<<" seconds remaining\n";
        }
        else
        {
          // send an exit message to this instance of LSMS
          int r_group=(status.MPI_SOURCE-align)/comm_size;

          MPI_Send(evecs[r_group], 3*size_lsms, MPI_DOUBLE, lsms_rank0[r_group], 2, MPI_COMM_WORLD);
        }

        if(step_out_flag && accepted)
        {
          step_out_file<<"# iteration "<<energies_accumulated<<std::endl;
          step_out_file.precision(15);
          step_out_file<<energies_accumulated<<std::endl;
          step_out_file<<r_values[0]<<"  "<<r_values[1]<<std::endl;
          for(int j=0; j<3*size_lsms; j+=3)
          {
            step_out_file<<r_values[j+R_VALUE_OFFSET]<<"  "<<r_values[j+R_VALUE_OFFSET+1]
                         <<"  "<<r_values[j+R_VALUE_OFFSET+2]<<std::endl;
          }
        }
        // write restart file every restartWriteFrequency seconds
        if(walltime>nextWriteTime)
        {
          generator->writeState("WLrestart.jsn");
          nextWriteTime+=restartWriteFrequency;
        }

      }
      generator->writeState("WLrestart.jsn");
/*
  if(evec_generation_mode==WangLandau_1d)
  (static_cast<WL1dEvecGenerator<std::mt19937> *>(generator))->writeState("WLrestart.state");
  if(evec_generation_mode==ExhaustiveIsing)
  (static_cast<ExhaustiveIsing1dEvecGenerator *>(generator))->writeState("WLrestart.state");
*/
      for(int i=0; i<num_lsms; i++) free(evecs[i]);
      shfree(evecs);
      //shfree(r_values);
    }
  }

  if(world_rank==0)
  {
    if(step_out_flag)
    {
      step_out_file<<"# end\n-1\n"
                   <<energy_accumulator/double(energies_accumulated)<<std::endl;
      step_out_file.close();
    }
    std::cout<<"Finished all scheduled calculations. Freeing resources.\n";
    std::cout<<"Energy mean = "<<energy_accumulator/double(energies_accumulated)<<"Ry\n";
  }


  if(num_lsms>1)
  {
    // make sure everyone arrives here:
    MPI_Bcast(stupid,37,MPI_CHAR,0,MPI_COMM_WORLD);

    if(world_rank==0)
    {
      MPI_Comm_free(&local_comm);
    }
    else if(my_group>=0)
    {
      MPI_Comm_free(&local_comm);
    }
  }



  if(world_rank==0)
  {
    double walltime = get_rtc() - walltime_0;
    std::cout<<" WL-LSMS finished in "<<walltime<<" seconds.\n";
    std::cout<<" Monte-Carlo steps / walltime = "
             <<double(stepCount)/walltime<<"/sec\n";
  }

#ifdef USE_PAPI
  PAPI_stop_counters(papi_values,hw_counters);
  papi_values[hw_counters  ] = PAPI_get_real_cyc()-papi_real_cyc_0;
  papi_values[hw_counters+1] = PAPI_get_real_usec()-papi_real_usec_0;
  papi_values[hw_counters+2] = PAPI_get_virt_cyc()-papi_virt_cyc_0;
  papi_values[hw_counters+3] = PAPI_get_virt_usec()-papi_virt_usec_0;
  long long accumulated_counters[NUM_PAPI_EVENTS+4];
/*
  for(int i=0; i<hw_counters; i++)
  {
  printline(ttos(papi_event_name[i])+" = "+ttos(papi_values[i]),
  std::cout,parameters.myrankWorld);
  }
  printline("PAPI real cycles : "+ttos(papi_values[hw_counters]),
  std::cout,parameters.myrankWorld);
  printline("PAPI real usecs : "+ttos(papi_values[hw_counters+1]),
  std::cout,parameters.myrankWorld);
  printline("PAPI user cycles : "+ttos(papi_values[hw_counters+2]),
  std::cout,parameters.myrankWorld);
  printline("PAPI user usecs : "+ttos(papi_values[hw_counters+3]),
  std::cout,parameters.myrankWorld);
*/
  
  //MPI_Reduce(papi_values,accumulated_counters,hw_counters+4,
  //           MPI_LONG,MPI_SUM,0,MPI_COMM_WORLD);

  shmem_long_sum_to_all(accumulated_counters, papi_values, hw_counters+4,
      comm.pestart, comm.logPE_stride, comm.size, pWrk_i, pSync2);



  if(world_rank==0)
  {
    for(int i=0; i<hw_counters; i++)
    {
      std::cout<<"Accumulated: "<<(papi_event_name[i])<<" = "<<(accumulated_counters[i])<<"\n";
    }
    std::cout<<"PAPI accumulated real cycles : "<<(accumulated_counters[hw_counters])<<"\n";
    std::cout<<"PAPI accumulated user cycles : "<<(accumulated_counters[hw_counters+2])<<"\n";
    double gflops_papi = ((double)accumulated_counters[1])/
      (1000.0*(double)papi_values[hw_counters+1]);
    double gflops_hw_double = ((double)accumulated_counters[2])/
      (1000.0*(double)papi_values[hw_counters+1]);
    double gflops_hw_single = ((double)accumulated_counters[3])/
      (1000.0*(double)papi_values[hw_counters+1]);
    double gips = ((double)accumulated_counters[0])/(1000.0*(double)papi_values[hw_counters+1]);
    std::cout<<"PAPI_FP_OPS real GFLOP/s : "<<(gflops_papi)<<"\n";
    std::cout<<"PAPI hw double real GFLOP/s : "<<(gflops_hw_double)<<"\n";
    std::cout<<"PAPI hw single real GFLOP/s : "<<(gflops_hw_single)<<"\n";
    std::cout<<"PAPI real GINST/s : "<<(gips)<<"\n";
  }
#endif


  //MPI_Finalize();
  return 0;
}
Пример #26
0
/*
 * Calibration run: builds an input matrix and then starts and immediately
 * stops the measurement with no work in between, so the recorded values
 * reflect only the cost of taking the measurement.
 *
 * _p        pointer to a struct elim_params; m, n and r are read.
 * data      output array. data[0] receives the time value; data[1..] receive
 *           the cycle count (no PAPI) or the PAPI counter values.
 * data_len  in/out. Set to 2 without PAPI; with PAPI it is clamped to
 *           papi_array_len + 1.
 *
 * With PAPI, each loop_calibration[] entry is lowered to the value just
 * measured whenever that value is smaller. Always returns 0.
 */
int run_nothing(void *_p, unsigned long long *data, int *data_len) {
  struct elim_params *params = (struct elim_params *)_p;
  mzd_t *A = mzd_init(params->m, params->n);

  if (params->r == 0) {
    mzd_randomize(A);
  } else {
    /* A is the product of two randomised factors that are shaped below. */
    mzd_t *lower = mzd_init(params->m, params->m);
    mzd_t *upper = mzd_init(params->m, params->n);
    mzd_randomize(upper);
    mzd_randomize(lower);

    for (rci_t row = 0; row < params->m; ++row) {
      rci_t col;

      /* lower: clear everything right of the diagonal, force a 1 on it. */
      col = row + 1;
      while (col < params->m) {
        int const width = MIN(m4ri_radix, params->m - col);
        mzd_clear_bits(lower, row, col, width);
        col += m4ri_radix;
      }
      mzd_write_bit(lower, row, row, 1);

      /* upper: clear everything left of the diagonal. */
      col = 0;
      while (col < row && col < params->n) {
        int const width = MIN(m4ri_radix, row - col);
        mzd_clear_bits(upper, row, col, width);
        col += m4ri_radix;
      }

      if (row >= params->r) {
        /* Rows from r onwards are cleared from the diagonal to the end. */
        for (col = row; col < params->n; col += m4ri_radix) {
          int const width = MIN(m4ri_radix, params->n - col);
          mzd_clear_bits(upper, row, col, width);
        }
      } else {
        mzd_write_bit(upper, row, row, 1);
      }
    }

    mzd_mul(A, lower, upper, 0);
    mzd_free(lower);
    mzd_free(upper);
  }

#ifndef HAVE_LIBPAPI
  /* Wall time and cycle counter, started and stopped back to back. */
  *data_len = 2;
  data[0] = walltime(0);
  data[1] = cpucycles();
  data[1] = cpucycles() - data[1];
  data[0] = walltime(data[0]);
#else
  /* One slot for the time value plus one per PAPI counter. */
  *data_len = MIN(papi_array_len + 1, *data_len);
  int array_len = *data_len - 1;
  unsigned long long t0 = PAPI_get_virt_usec();
  int papi_res = PAPI_start_counters((int*)papi_events, array_len);
  if (papi_res)
    m4ri_die("");

  PAPI_stop_counters((long long*)&data[1], array_len);
  t0 = PAPI_get_virt_usec() - t0;
  data[0] = t0;

  /* Keep the smallest overhead observed so far for every slot. */
  for (int k = 0; k <= array_len; ++k) {
    if (data[k] < loop_calibration[k])
      loop_calibration[k] = data[k];
  }
#endif

  mzd_free(A);

  return (0);
}
Пример #27
0
/*
 * Worker thread body for the busy-wait barrier benchmark.
 *
 * userData points to a ThreadInfo carrying this thread's index and the shared
 * Context. The thread pins itself to the CPU whose number equals its index and
 * then runs c->repetitionCount iterations. Each iteration performs one phase
 * of the barrier, selected by c->left:
 *   - c->left == 0: set this thread's bit in c->entry and spin until all
 *     bits are set (or another thread already flipped c->left), then set
 *     c->left to 1 and reset c->exit.
 *   - c->left == 1: check that all per-thread visit counters agree, set this
 *     thread's bit in c->exit and spin likewise, then set c->left to 0,
 *     reset c->entry and increment this thread's visit counter.
 *
 * PAPI counters are recorded only by the thread whose index equals
 * threadToBeRecorded (currently -1, i.e. no thread).
 *
 * NOTE(review): the accesses to c->entry, c->exit and c->left below are plain
 * reads and writes; whether Context declares these fields volatile or atomic
 * is not visible from here -- confirm.
 *
 * Returns NULL.
 */
void* Thread(void *userData) {

    ThreadInfo *info = (ThreadInfo*) userData;
    Context *c = info->c;

    int index = info->index;
    int threadCount = c->threadCount;
    int64_t repetitionCount = c->repetitionCount;

    /* The barrier masks are 64 bits wide, one bit per thread. */
    assert(index >= 0 && index < 64);
    assert(threadCount <= 64);

    /* Shift a 64-bit one: shifting the int constant 0x1 is undefined (or
     * sign-extends) once the bit position reaches 31. */
    uint64_t me = (uint64_t)1 << index;
    uint64_t full = 0x0000000000000000;

    uint64_t copy; //thread local copy of the entry/exit barrier

    for (int i = 0; i < threadCount; ++i) {
        full |= (uint64_t)1 << i;
    }

    // set thread affinity
    cpu_set_t cpuset;
    CPU_ZERO(&cpuset);
    CPU_SET(index, &cpuset);
    /* The call is kept outside assert() so it still happens when NDEBUG
     * compiles the assertion away. */
    int affinityRet = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
    if (affinityRet != 0) {
        printf("thread %i: pthread_setaffinity_np %i\n", index, affinityRet);
        assert(0);
    }

    int threadToBeRecorded = -1;
    /* Raw PAPI event codes.
     * NOTE(review): presumably PAPI_TOT_CYC, PAPI_L1_DCM and PAPI_L2_DCM --
     * confirm against papi.h. */
    int papiEvents[3] = {0x8000003b, 0x80000000, 0x80000002};
    long long papiStart[3] = {0, 0, 0};
    long long papiEnd[3] = {0, 0, 0};

    if (index == threadToBeRecorded) {
        int ret = PAPI_start_counters(papiEvents, 3);
        if (ret != 0) {
            printf("thread %i: PAPI_start_counters %i\n", index, ret);
            assert(0);
        }
        ret = PAPI_read_counters(papiStart, 3);
        if (ret != 0) {
            printf("thread %i: PAPI_read_counters %i\n", index, ret);
            assert(0);
        }
    }

    for (int64_t repetition = 0; repetition < repetitionCount; repetition++) {

        if (c->left == 0) { /* *** entry phase *******************************/

            /* run to wall and wait busily */
            do {
                copy = c->entry;
                if ((copy & me) == 0) {
                    /* our bit is missing (never set, or overwritten by
                     * another thread's store): publish it again */
                    copy |= me;
                    c->entry = copy;
                }
            } while (copy != full && c->left == 0);

            c->left = 1;

            c->exit = 0x0000000000000000;

        } else if (c->left == 1) { /* *** exit phase *************************/

            /* all threads must have completed the same number of visits */
            for (int i = 0; i < threadCount - 1; ++i) {
                if (c->successfulBarrierVisitsCount[i] != c->successfulBarrierVisitsCount[i+1]) {
                    printf("thread %i and %i are not equal at %lli %lli\n", i, i+1,
                            (long long)c->successfulBarrierVisitsCount[i],
                            (long long)c->successfulBarrierVisitsCount[i+1]);
                    ++c->outOfSyncCount;
                    assert(0);
                }
            }

            /* wait busily until everyone has left the barrier */
            do {
                copy = c->exit;
                if ((copy & me) == 0) {
                    copy |= me;
                    c->exit = copy;
                }
            } while (copy != full && c->left == 1);

            c->left = 0;

            c->entry = 0x0000000000000000;

            ++(c->successfulBarrierVisitsCount[index]);

        }
    }

    if (index == threadToBeRecorded) {
        int ret = PAPI_stop_counters(papiEnd, 3);
        if (ret != 0) {
            printf("%i: PAPI_stop_counters %i\n", index, ret);
            assert(0);
        }
        printf("thread %i: papi counter 0: %lli - %lli = %lli\n", index, papiEnd[0], papiStart[0], papiEnd[0] - papiStart[0]);
        printf("thread %i: papi counter 1: %lli - %lli = %lli\n", index, papiEnd[1], papiStart[1], papiEnd[1] - papiStart[1]);
        printf("thread %i: papi counter 2: %lli - %lli = %lli\n", index, papiEnd[2], papiStart[2], papiEnd[2] - papiStart[2]);
        printf("\n");
    }

    return NULL;
}
Пример #28
0
/*
 * Benchmark one elimination routine on a freshly generated matrix.
 *
 * _p        pointer to a struct elim_params. m, n, r and algorithm are read;
 *           r is overwritten with the value returned by the chosen routine.
 *           algorithm must be "m4ri", "ple" or "mmpf", otherwise m4ri_die()
 *           is called.
 * data      output array. data[0] receives the time value; data[1..] receive
 *           the cycle count (no PAPI) or the PAPI counter values.
 * data_len  in/out. Set to 2 without PAPI; with PAPI it is clamped to
 *           papi_array_len + 1.
 *
 * When p->r is zero the input is a random m x n matrix. Otherwise it is the
 * product of two randomised factors shaped below (presumably so that the
 * product has rank r -- confirm). With PAPI, loop_calibration[] is subtracted
 * from every entry of data. Returns 0.
 */
int run(void *_p, unsigned long long *data, int *data_len) {
  struct elim_params *p = (struct elim_params *)_p;
#ifndef HAVE_LIBPAPI
  *data_len = 2;
#else
  *data_len = MIN(papi_array_len + 1, *data_len);
#endif

  mzd_t *A = mzd_init(p->m, p->n);

  if(p->r != 0) {
    mzd_t *L, *U;
    L = mzd_init(p->m, p->m);
    U = mzd_init(p->m, p->n);
    mzd_randomize(U);
    mzd_randomize(L);
    for (rci_t i = 0; i < p->m; ++i) {

      /* L: clear everything right of the diagonal, force a 1 on it. */
      for (rci_t j = i + 1; j < p->m; j+=m4ri_radix) {
        int const length = MIN(m4ri_radix, p->m - j);
        mzd_clear_bits(L, i, j, length);
      }
      mzd_write_bit(L,i,i, 1);

      /* U: clear everything left of the diagonal. */
      for (rci_t j = 0; j < i && j < p->n; j+=m4ri_radix) {
        int const length = MIN(m4ri_radix, i - j);
        mzd_clear_bits(U, i, j, length);
      }
      if(i < p->r) {
        mzd_write_bit(U, i, i, 1);
      } else {
        /* Rows from r onwards are cleared from the diagonal to the end.
         * The chunk length is measured from the current column j, not from
         * the row index i; using i would run past the end of the row on
         * every chunk after the first. */
        for (rci_t j = i; j < p->n; j+=m4ri_radix) {
          int const length = MIN(m4ri_radix, p->n - j);
          mzd_clear_bits(U, i, j, length);
        }
      }
    }
    mzd_mul(A,L,U,0);
    mzd_free(L);
    mzd_free(U);
  } else {
    mzd_randomize(A);
  }

  mzp_t *P = mzp_init(A->nrows);
  mzp_t *Q = mzp_init(A->ncols);

#ifndef HAVE_LIBPAPI
  data[0] = walltime(0);
  data[1] = cpucycles();
#else
  int array_len = *data_len - 1;
  unsigned long long t0 = PAPI_get_virt_usec();
  int papi_res = PAPI_start_counters((int*)papi_events, array_len);
  if (papi_res)
    m4ri_die("");
#endif
  if(strcmp(p->algorithm, "m4ri") == 0)
    p->r = mzd_echelonize_m4ri(A, 0, 0);
  else if(strcmp(p->algorithm, "ple") == 0)
    p->r = mzd_ple(A, P, Q, 0);
  else if(strcmp(p->algorithm, "mmpf") == 0)
    p->r = _mzd_ple_russian(A, P, Q, 0);
  else
    m4ri_die("unknown algorithm %s",p->algorithm);
#ifndef HAVE_LIBPAPI
  data[1] = cpucycles() - data[1];
  data[0] = walltime(data[0]);
#else
  PAPI_stop_counters((long long*)&data[1], array_len);
  t0 = PAPI_get_virt_usec() - t0;
  data[0] = t0;
  /* Remove the measurement overhead recorded in loop_calibration[]. */
  for (int nv = 0; nv <= array_len; ++nv) {
    data[nv] -= loop_calibration[nv];
  }
#endif
  /* Release the permutations in both build variants, after the measurement
   * has been stopped so the frees are not part of the measured region. */
  mzp_free(P);
  mzp_free(Q);
  mzd_free(A);
  return 0;
}
Пример #29
0
/* Integer hit rate in percent. Returns 0 when no accesses were counted, so
 * the caller never divides by zero. */
static long long hit_percentage(long long hits, long long accesses)
{
    if (accesses == 0)
        return 0;
    return 100 * hits / accesses;
}

/*
 * Measures data-cache behaviour of bucle() and buclePermutado() with the PAPI
 * high-level counter API. Each routine is run TEST_NUM times; the per-run
 * counter values are printed, followed by the averages over all runs and the
 * L1/L2 hit percentages.
 *
 * Exits with status 1 if PAPI cannot be initialised, if a requested counter
 * is unavailable, or if starting/stopping the counters fails.
 *
 * NOTE(review): events[] lists five events while the loops run to
 * EVENT_COUNT, which is defined outside this block -- presumably 5; confirm.
 */
int main()
{
    int retval;
    int i, j;
    long long totales[EVENT_COUNT], totalesPerm[EVENT_COUNT];

    int events[] = {PAPI_L1_DCM, PAPI_L1_DCH, PAPI_L1_DCA, PAPI_L2_DCH, PAPI_L2_DCA};
    long long values[EVENT_COUNT];

    // Initialise the PAPI library
    retval = PAPI_library_init(PAPI_VER_CURRENT);

    if (retval != PAPI_VER_CURRENT) {
        fprintf(stderr, "PAPI library init error!\n");
        exit(1);
    }

    // Check that every requested counter is available
    for (i = 0; i < EVENT_COUNT; i++)
    {
        if (PAPI_OK != PAPI_query_event(events[i]))
        {
            // This is a failure: report it on stderr and exit non-zero
            fprintf(stderr, "Cannot count counter %d\n", i);
            exit(1);
        }
    }

    // Zero the accumulated totals for both experiments
    for (i = 0; i < EVENT_COUNT; i++)
    {
        totales[i] = 0;
        totalesPerm[i] = 0;
    }

    //iniciarMatrizB();

    printf("\n --------  Prueba con bucle original ---------\n\n");

    for (i = 0; i < TEST_NUM; i++)
    {
        // Start counting events
        if (PAPI_start_counters(events, EVENT_COUNT) != PAPI_OK)
        {
            fprintf(stderr, "ERROR Starting counters!\n");
            exit(1);
        }

        bucle();

        // Stop counting and collect the counter values
        if (PAPI_stop_counters(values, EVENT_COUNT) != PAPI_OK)
        {
            fprintf(stderr, "ERROR Reading counters!\n");
            exit(1);
        }

        for (j = 0; j < EVENT_COUNT; j++)
        {
            totales[j] += values[j];
        }

        printf("Prueba %d:\n\tL1 -> Accesos: %lld  Aciertos: %lld  Fallos: %lld\n", i, values[2], values[1], values[0]);
        printf("\tL2 -> Accesos: %lld  Aciertos: %lld\n",  values[4], values[3]);
    }

    // Compute the averages over all runs
    for (i = 0; i < EVENT_COUNT; i++)
    {
        totales[i] = totales[i]/TEST_NUM;
    }

    printf("\nValores medios:\n");
    printf("\tCaché L1:\n\t\tAccesos: %lld  \n\t\tAciertos: %lld  \n\t\tFallos: %lld \n\t\tPorcentaje de acierto: %lld\n",
           totales[2], totales[1], totales[0], hit_percentage(totales[1], totales[2]));
    printf("\tCaché L2:\n\t\tAccesos: %lld  \n\t\tAciertos: %lld  \n\t\tPorcentaje de acierto: %lld\n",
           totales[4], totales[3], hit_percentage(totales[3], totales[4]));

    printf("\n --------  Prueba con bucle permutado --------- \n\n");

    for (i = 0; i < TEST_NUM; i++)
    {
        // Start counting events
        if (PAPI_start_counters(events, EVENT_COUNT) != PAPI_OK)
        {
            fprintf(stderr, "ERROR Starting counters!\n");
            exit(1);
        }

        buclePermutado();

        // Stop counting and collect the counter values
        if (PAPI_stop_counters(values, EVENT_COUNT) != PAPI_OK)
        {
            fprintf(stderr, "ERROR Reading counters!\n");
            exit(1);
        }

        for (j = 0; j < EVENT_COUNT; j++)
        {
            totalesPerm[j] += values[j];
        }

        printf("Prueba %d:\n\tL1 -> Accesos: %lld  Aciertos: %lld  Fallos: %lld\n", i, values[2], values[1], values[0]);
        printf("\tL2 -> Accesos: %lld  Aciertos: %lld\n",  values[4], values[3]);
    }

    // Compute the averages over all runs
    for (i = 0; i < EVENT_COUNT; i++)
    {
        totalesPerm[i] = totalesPerm[i]/TEST_NUM;
    }

    printf("\nValores medios:\n");
    printf("\tCaché L1:\n\t\tAccesos: %lld  \n\t\tAciertos: %lld  \n\t\tFallos: %lld \n\t\tPorcentaje de acierto: %lld\n",
           totalesPerm[2], totalesPerm[1], totalesPerm[0], hit_percentage(totalesPerm[1], totalesPerm[2]));
    printf("\tCaché L2:\n\t\tAccesos: %lld  \n\t\tAciertos: %lld  \n\t\tPorcentaje de acierto: %lld\n",
           totalesPerm[4], totalesPerm[3], hit_percentage(totalesPerm[3], totalesPerm[4]));

    return 0;

}
Пример #30
0
/*
 * Synthetic compute / memory / MPI workload driver.
 *
 * Command-line options (all parsed before MPI_Init is called):
 *   -v             set DOVERBOSE
 *   -n <count>     number of multiply iterations in the compute phase (nflop)
 *   -N <count>     repetition count for the memory and MPI phases (nrep)
 *   -d <count>     number of doubles per message (ndata and ndata_max)
 *   -m <MiB>       size of the memory-touch buffer in MiB (nmem, default 1)
 *   -s <seconds>   time to sleep after the compute phase (nsleep)
 *   -spray         pick a random message length in [1, ndata_max] per repetition
 *   -c             set CORE: run an unbounded out-of-range access loop
 *   -r             set REGION: bracket phases with MPI_Pcontrol (IPM builds only)
 *   -stair         set STAIR_RANK: scale the compute phase by rank/size
 *   -stair_region  set STAIR_REGION (not consulted inside this function)
 *   -nompi         clear DOMPI: skip MPI_Init and every MPI call
 *
 * When HPM is defined, PAPI counters are started on entry and printed on exit.
 *
 * Returns 0 on normal completion. Exits with status 1 on a malformed command
 * line or a failed allocation, and with the PAPI return code on PAPI errors.
 */
int main(int argc, char **argv) {

  /* Names of the IPM regions that wrap the repeated collective sequence. */
  static const char *const region_names[] = {
    "region_b", "region_c", "region_d", "region_e",
    "region_f", "region_g", "region_h", "region_i"
  };
  const int     nregions = (int)(sizeof region_names / sizeof region_names[0]);

  /* size/rank default to a single-process layout so that -nompi runs never
     read them uninitialized. */
  int           size = 1, rank = 0, left, right, you, ndata = 127, ndata_max = 127, seed;
  int           rv;
  int           reg;
  long long int i, j, k;
  unsigned long long int nflop = 0, nmem = 1, nsleep = 0, nrep = 1, myflops;
  char          *env_ptr;
  double        *sbuf = NULL, *rbuf = NULL, *x = NULL;
  MPI_Status    *s = NULL;
  MPI_Request   *r = NULL;
  time_t        ts;


#ifdef HPM

  if((rv = PAPI_library_init(PAPI_VER_CURRENT)) != PAPI_VER_CURRENT )
  {
    fprintf(stderr, "Error: %d %s\n",rv, errstring);
    exit(1);
  }

  if ((num_hwcntrs = PAPI_num_counters()) < PAPI_OK)
  {
    printf("There are no counters available. \n");
    exit(1);
  }

  if ( (rv = PAPI_start_counters(events, 2)) != PAPI_OK) {
    fprintf(stdout, "ERROR PAPI_start_counters rv=%d\n", rv);
    exit(rv);
  }

#endif
  seed = time(&ts);

  flags |= DOMPI;
  while(--argc && argv++) {
    const char *opt = *argv;
    int takes_value = !strcmp("-n",opt) || !strcmp("-N",opt) ||
                      !strcmp("-d",opt) || !strcmp("-m",opt) ||
                      !strcmp("-s",opt);

    /* argc counts the current option plus everything after it, so a value
       is only available when at least two entries remain. Without this
       check a trailing "-n" would hand a NULL pointer to atol(). */
    if(takes_value && argc < 2) {
      fprintf(stderr, "ERROR option %s requires a value\n", opt);
      exit(1);
    }

    if(!strcmp("-v",opt)) {
      flags |= DOVERBOSE;
    } else if(!strcmp("-n",opt)) {
      --argc; argv++;
      nflop = atol(*argv);
    } else if(!strcmp("-N",opt)) {
      --argc; argv++;
      nrep = atol(*argv);
    } else if(!strcmp("-d",opt)) {
      --argc; argv++;
      ndata_max = ndata = atol(*argv);
    } else if(!strcmp("-m",opt)) {
      --argc; argv++;
      nmem = atol(*argv);
    } else if(!strcmp("-s",opt)) {
      --argc; argv++;
      nsleep = atol(*argv);
    } else if(!strcmp("-spray",opt)) {
      flags |= DOSPRAY;
    } else if(!strcmp("-c",opt)) {
      flags |= CORE;
    } else if(!strcmp("-r",opt)) {
      flags |= REGION;
    } else if(!strcmp("-stair",opt)) {
      flags |= STAIR_RANK;
    } else if(!strcmp("-stair_region",opt)) {
      flags |= STAIR_REGION;
    } else if(!strcmp("-nompi",opt)) {
      flags &= ~DOMPI;
    }
  }

  if(flags & DOMPI) {
    MPI_Init(&argc,&argv);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  }

  /* Memory phase: allocate nmem MiB of doubles and sweep it nrep times. */
  if(nmem) {
    nmem = (nmem*1024*1024/sizeof(double));
    x = (double *)malloc((size_t)(nmem*sizeof(double)));
    if(!x) {
      fprintf(stderr, "ERROR allocating %llu bytes\n",
              (unsigned long long)(nmem*sizeof(double)));
      exit(1);
    }
    for(j=0;j<nrep;j++) {
      for(i=0;i<nmem;i++) {
        x[i] = i;
      }
      for(i=0;i<nmem;i++) {
        x[i] = i*x[i];
      }
      /* A square is never negative; the test only keeps the stores above
         observable. */
      if(x[nmem-1]*x[nmem-1] < 0) {
        printf("trickster\n");
      }
    }
    /* NOTE(review): the release is disabled in the original, so the buffer
       stays allocated for the rest of the run -- presumably on purpose to
       keep the memory footprint visible; confirm before enabling. */
    if(0) free((char *)x);
  }

#ifdef IPM
  if(flags & REGION && rank > -1 ) MPI_Pcontrol(1,"region_zzzzzzzzzzzZz");
#endif
  /* Compute phase: repeated multiplies over a 10-element scratch array. */
  if(nflop) {
    x = (double *)malloc((size_t)(10*sizeof(double)));
    if(!x) {
      fprintf(stderr, "ERROR allocating compute scratch buffer\n");
      exit(1);
    }
    j = k = 0;
    for(i=0;i<10;i++) {
      x[i] = 1.0;
    }
    if(flags & STAIR_RANK) {
      myflops = (rank*nflop)/size;
    } else {
      myflops = nflop;
    }
    /* Iterate to myflops (not nflop) so that -stair actually changes the
       amount of work done per rank. */
    for(i=0;i<myflops;i++) {
      x[j] = x[j]*x[k];
      j = ((i%9)?(j+1):(0));
      k = ((i%8)?(k+1):(0));
    }
    free((char *)x);
  }

  if(nsleep) {
    sleep(nsleep);
  }
#ifdef IPM
  if(flags & REGION && rank > -1 ) MPI_Pcontrol(-1,"region_zzzzzzzzzzzZz");
#endif

  if(nmem<nflop) nmem=nflop;

  if(nflop>1) printf("FLOPS = %llu BYTES = %llu\n", nflop, nmem);

  fflush(stdout);

  /* NOTE(review): unbounded loop over out-of-range indices (and, after the
     compute phase, over already-freed memory). This looks like a deliberate
     way to crash the process when -c is given; left as in the original. */
  if(flags & CORE) {
    for(i=0;;i++) {
      x[i] = x[i*i-1000];
    }
  }

  env_ptr = getenv("IPM_SOCKET");
  if(env_ptr) {
    printf("IPM: %d IPM_SOCKET in app %s\n", rank, env_ptr);
  }

  if(flags & DOMPI) {
    s = (MPI_Status *)malloc((size_t)(sizeof(MPI_Status)*2*size));
    r = (MPI_Request *)malloc((size_t)(sizeof(MPI_Request)*2*size));

    sbuf = (double *)malloc((size_t)(ndata_max*sizeof(double)));
    rbuf = (double *)malloc((size_t)(ndata_max*sizeof(double)));
    if(!s || !r || !sbuf || !rbuf) {
      fprintf(stderr, "ERROR allocating MPI buffers\n");
      exit(1);
    }
    for(i=0;i<ndata_max;i++) { sbuf[i] = rbuf[i] = i; }

    /* Every rank seeds with rank 0's value so that -spray picks the same
       message length on all ranks. */
    MPI_Bcast(&seed,1,MPI_INT,0,MPI_COMM_WORLD);
    srand48(seed);

    for(i=0;i<nrep;i++) {
      MPI_Bcast(sbuf,ndata_max,MPI_DOUBLE,0,MPI_COMM_WORLD);
    }

    if(size>1) {
      if(!rank) {left=size-1;} else { left = rank-1;}
      if(rank == size-1) { right=0;} else {right=rank+1;}
      /* Partner half a ring away. Written as a modular shift so the mapping
         is a permutation for odd sizes too (each rank is the target of
         exactly one sender); for even sizes it equals the original
         rank +/- size/2 pairing. */
      you = (rank + size/2) % size;

      for(i=0;i<nrep;i++) {
        if(flags & DOSPRAY) {
          ndata = (long int)(drand48()*ndata_max)+1;
        }
        MPI_Sendrecv(sbuf,ndata,MPI_DOUBLE,right,1,rbuf,ndata,MPI_DOUBLE,left,1,MPI_COMM_WORLD,s);
        MPI_Sendrecv(sbuf,ndata,MPI_DOUBLE,left,1,rbuf,ndata,MPI_DOUBLE,right,1,MPI_COMM_WORLD,s);
#ifdef IPM
        if(flags & REGION) MPI_Pcontrol(1,"region_a");
#endif
        MPI_Barrier(MPI_COMM_WORLD);
        MPI_Sendrecv(sbuf,ndata,MPI_DOUBLE,left,1,rbuf,ndata,MPI_DOUBLE,right,1,MPI_COMM_WORLD,s);
        MPI_Reduce(sbuf,rbuf,ndata,MPI_DOUBLE, MPI_SUM, 0,  MPI_COMM_WORLD);
        MPI_Reduce(sbuf,rbuf,ndata,MPI_DOUBLE, MPI_SUM, 1,  MPI_COMM_WORLD);

        MPI_Isend(sbuf,ndata,MPI_DOUBLE,you,0,MPI_COMM_WORLD, r);
        MPI_Recv(rbuf,ndata,MPI_DOUBLE,MPI_ANY_SOURCE,0,MPI_COMM_WORLD, s);
        MPI_Wait(r,s);

        MPI_Irecv(rbuf,ndata,MPI_DOUBLE,MPI_ANY_SOURCE,0,MPI_COMM_WORLD,r);
        MPI_Send(sbuf,ndata,MPI_DOUBLE,you,0,MPI_COMM_WORLD);
        MPI_Wait(r,s);

        /* One-double exchange with every rank, including this one. */
        for(j=0;j<size;j++) {
          MPI_Isend(sbuf+j%ndata_max,1,MPI_DOUBLE,j,4,MPI_COMM_WORLD, r+j);
          MPI_Irecv(rbuf+j%ndata_max,1,MPI_DOUBLE,j,4,MPI_COMM_WORLD,r+size+j);
        }
        MPI_Waitall(2*size,r,s);

#ifdef IPM
        if(flags & REGION) MPI_Pcontrol(-1,"region_a");
#endif

        /* The same collective sequence is issued once per named region
           (region_b .. region_i). */
        for(reg=0;reg<nregions;reg++) {
#ifdef IPM
          if(flags & REGION) MPI_Pcontrol(1,region_names[reg]);
#endif
          MPI_Allreduce(sbuf,rbuf,ndata,MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
          MPI_Reduce(sbuf,rbuf,ndata,MPI_DOUBLE, MPI_SUM, 0,  MPI_COMM_WORLD);
          MPI_Reduce(sbuf,rbuf,ndata,MPI_DOUBLE, MPI_SUM, 1,  MPI_COMM_WORLD);
          MPI_Reduce(sbuf,rbuf,ndata-1,MPI_DOUBLE, MPI_SUM, 0,  MPI_COMM_WORLD);
          MPI_Reduce(sbuf,rbuf,ndata-1,MPI_DOUBLE, MPI_SUM, 1,  MPI_COMM_WORLD);
#ifdef IPM
          if(flags & REGION) MPI_Pcontrol(-1,region_names[reg]);
#endif
        }
      }
    }

    MPI_Barrier(MPI_COMM_WORLD);

    MPI_Finalize();

    free(sbuf);
    free(rbuf);
    free(r);
    free(s);
  }

#ifdef HPM
  if ((rv=PAPI_stop_counters(values, 2)) != PAPI_OK) {
    fprintf(stdout, "ERROR PAPI_stop_counters rv=%d\n", rv);
    exit(rv);
  }
  printf("PAPI: total instruction/cycles  %lld/%lld %.3e \n", values[0], values[1], values[0]/(values[1]*1.0) );
#endif

  return 0;
}