/*
 * Thread entry point for the overflow test.
 *
 * Builds an event set via add_two_nonderived_events(), arms overflow
 * notification on the selected event with `handler`, runs do_stuff() while
 * counting, then disarms overflow, tears the event set down and (unless
 * TESTS_QUIET is set) prints the two counter values and the elapsed wall
 * clock time in microseconds and cycles.
 *
 * arg: pointer to an int; that value divided by mythreshold is recorded in
 *      expected[] at the index of this thread's event set.
 *
 * Does not return normally: the thread ends through pthread_exit(NULL).
 */
void *Thread(void *arg)
{
   const int num_tests = 1;
   int rc;
   int event_set = PAPI_NULL;
   int event_mask, overflow_event;
   int event_count;
   long long **counts;
   long long wall_us, wall_cyc;
   char overflow_name[PAPI_MAX_STR_LEN];

   /* The helper picks PAPI_TOT_CYC plus one of PAPI_FP_INS, PAPI_FP_OPS or
      PAPI_TOT_INS, whichever the platform offers. */
   event_set = add_two_nonderived_events(&event_count, &overflow_event, hw_info, &event_mask);

   /* Bookkeeping shared with the rest of the file, keyed by event set. */
   expected[event_set] = *(int *)arg / mythreshold;
   myid[event_set] = PAPI_thread_id();

   counts = allocate_test_space(num_tests, event_count);

   wall_us = PAPI_get_real_usec();
   wall_cyc = PAPI_get_real_cyc();

   /* Arm overflow notification before the counters start. */
   rc = PAPI_overflow(event_set, overflow_event, mythreshold, 0, handler);
   if (rc != PAPI_OK)
      test_fail(__FILE__, __LINE__, "PAPI_overflow", rc);

   rc = PAPI_start(event_set);
   if (rc != PAPI_OK)
      test_fail(__FILE__, __LINE__, "PAPI_start", rc);

   do_stuff();

   rc = PAPI_stop(event_set, counts[0]);
   if (rc != PAPI_OK)
      test_fail(__FILE__, __LINE__, "PAPI_stop", rc);

   wall_us = PAPI_get_real_usec() - wall_us;
   wall_cyc = PAPI_get_real_cyc() - wall_cyc;

   /* A zero threshold with a NULL handler switches overflow back off. */
   rc = PAPI_overflow(event_set, overflow_event, 0, 0, NULL);
   if (rc != PAPI_OK)
      test_fail(__FILE__, __LINE__, "PAPI_overflow", rc);

   remove_test_events(&event_set, event_mask);

   rc = PAPI_event_code_to_name(overflow_event, overflow_name);
   if (rc != PAPI_OK)
      test_fail(__FILE__, __LINE__, "PAPI_event_code_to_name", rc);

   if (!TESTS_QUIET) {
      const long long *row = counts[0];

      printf("Thread 0x%x %s : \t%lld\n", (int) pthread_self(), overflow_name, row[0]);
      printf("Thread 0x%x PAPI_TOT_CYC: \t%lld\n", (int) pthread_self(), row[1]);
      printf("Thread 0x%x Real usec   : \t%lld\n", (int) pthread_self(), wall_us);
      printf("Thread 0x%x Real cycles : \t%lld\n", (int) pthread_self(), wall_cyc);
   }

   free_test_space(counts, num_tests);
   pthread_exit(NULL);
   return (NULL);
}
/*
 * Sanity check of the PAPI real-time clocks.
 *
 * Samples PAPI_get_real_usec() and PAPI_get_real_cyc() around a 10 second
 * sleep, prints the deltas and the MHz figure they imply, and prints a NOTE
 * whenever a delta falls outside a 9..11 second window (cycles are compared
 * against the MHz value reported in the hardware info).
 *
 * Out-of-range results are only reported; the test is still passed to
 * test_pass().
 */
int main(int argc, char **argv)
{
   int rc;
   long long wall_us, wall_cyc;
   const PAPI_hw_info_t *hw_info;
   float cyc_f, us_f;

   tests_quiet(argc, argv);     /* Set TESTS_QUIET variable */

   rc = PAPI_library_init(PAPI_VER_CURRENT);
   if (rc != PAPI_VER_CURRENT)
      test_fail(__FILE__, __LINE__, "PAPI_library_init", rc);

   hw_info = PAPI_get_hardware_info();
   if (hw_info == NULL)
      test_fail(__FILE__, __LINE__, "PAPI_get_hardware_info", 2);

   wall_us = PAPI_get_real_usec();
   wall_cyc = PAPI_get_real_cyc();

   printf("Testing real time clock. (CLOCK %d MHz, CPU %f MHz)\n",hw_info->clock_mhz,hw_info->mhz);
   printf("Sleeping for 10 seconds.\n");

   sleep(10);

   wall_us = PAPI_get_real_usec() - wall_us;
   wall_cyc = PAPI_get_real_cyc() - wall_cyc;

   cyc_f = (float) wall_cyc;
   us_f = (float) wall_us;

   printf("%lld us. %lld cyc.\n", wall_us, wall_cyc);
   printf("%f Computed MHz.\n", cyc_f / us_f);

   /*
    * Elapsed microseconds and elapsed cycles are less independent than they
    * look. On Pentium III and 4, for example, cycles are measured and
    * microseconds are derived from cycles and MHz (read from /proc/cpuinfo
    * on Linux), so any MHz error shows up in the microseconds. On
    * UltraSPARC it is the other way round: microseconds come from
    * gethrtime() and cycles are derived from them using a MHz value scanned
    * from system info, so a gethrtime() error affects both numbers and a
    * MHz error further affects cycles.
    *
    * Without error bars on those system values no exact tolerance can be
    * given, although at least one Pentium 4 (torc17@utk) showed errors on
    * the order of one part per thousand. Newer multicore Intel processors
    * appear to break the link between the /proc/cpuinfo clock rate and the
    * computed one. To accommodate that, results outside the window are
    * reported below instead of failing the test.
    */
   if (wall_us < 9000000)
      printf("NOTE: Elapsed real time less than 9 seconds!\n");
   if (wall_us > 11000000)
      printf("NOTE: Elapsed real time greater than 11 seconds!\n");
   if (cyc_f < 9.0 * hw_info->mhz * 1000000.0)
      printf("NOTE: Elapsed real cycles less than 9*MHz*1000000.0!\n");
   if (cyc_f > 11.0 * hw_info->mhz * 1000000.0)
      printf("NOTE: Elapsed real cycles greater than 11*MHz*1000000.0!\n");

   test_pass(__FILE__, NULL, 0);
   exit(1);
}
Beispiel #3
0
 /*
  * Minimal PAPI timing demo: measures the wall-clock cycles and microseconds
  * spent touching the first and last element of a 100-entry array.
  *
  * Exits with status 1 if the PAPI library cannot be initialised, otherwise
  * prints both deltas and returns 0.
  */
int main(void)
{
	long_long start_cycles, end_cycles, start_usec, end_usec;
	int tabla[100];

	if (PAPI_library_init(PAPI_VER_CURRENT) != PAPI_VER_CURRENT)
		exit(1);

	/* Gets the starting time in clock cycles */
	start_cycles = PAPI_get_real_cyc();

	/* Gets the starting time in microseconds */
	start_usec = PAPI_get_real_usec();

	/* Touch both ends of the array. The last valid index is 99; the
	   original wrote to tabla[100], which is one past the end and is
	   undefined behaviour. */
	tabla[0] = 1;
	tabla[99] = 1;

	/* Gets the ending time in clock cycles */
	end_cycles = PAPI_get_real_cyc();

	/* Gets the ending time in microseconds */
	end_usec = PAPI_get_real_usec();

	printf("Wall clock cycles: %lld\n", end_cycles - start_cycles);
	printf("Wall clock time in microseconds: %lld\n", end_usec - start_usec);

	return 0;
}
/*
 * Driver for the compiler-parallelised thread test.
 *
 * Initialises PAPI, registers a platform-specific thread-id function with
 * PAPI_thread_init(), then calls Thread(i, 10000000 * i) for i = 1 and 2 and
 * reports the elapsed wall clock time of the whole run in microseconds and
 * cycles (unless TESTS_QUIET is set).
 *
 * Only AIX, SGI/MIPS and Sun/SPARC are handled; any other platform hits the
 * #error below and fails to compile.
 */
int main(int argc, char **argv)
{
   int i, retval;
   long long elapsed_us, elapsed_cyc;

   tests_quiet(argc, argv);     /* Set TESTS_QUIET variable */

   retval = PAPI_library_init(PAPI_VER_CURRENT);
   if (retval != PAPI_VER_CURRENT)
      test_fail(__FILE__, __LINE__, "PAPI_library_init", retval);

   /* hw_info is not declared in this function; presumably a file-scope
      variable -- confirm against the rest of the file. */
   hw_info = PAPI_get_hardware_info();
   if (hw_info == NULL)
     test_fail(__FILE__, __LINE__, "PAPI_get_hardware_info", 2);

   /* Start of the timed region: it includes thread initialisation. */
   elapsed_us = PAPI_get_real_usec();

   elapsed_cyc = PAPI_get_real_cyc();

   /* Each branch registers that platform's thread-id function and ends with
      the vendor pragma(s) that must directly precede the for loop after
      #endif. Keep the pragmas as the last lines of their branch.
      On AIX, PAPI_ESBSTR from PAPI_thread_init skips the test instead of
      failing it. */
#if defined(_AIX)
   retval = PAPI_thread_init((unsigned long (*)(void)) (pthread_self));
   if (retval != PAPI_OK) {
      if (retval == PAPI_ESBSTR)
         test_skip(__FILE__, __LINE__, "PAPI_thread_init", retval);
      else
         test_fail(__FILE__, __LINE__, "PAPI_thread_init", retval);
   }
#pragma ibm parallel_loop
#elif defined(sgi) && defined(mips)
   retval = PAPI_thread_init((unsigned long (*)(void)) (mp_my_threadnum));
   if (retval != PAPI_OK) {
      test_fail(__FILE__, __LINE__, "PAPI_thread_init", retval);
   }
#pragma parallel
#pragma local(i)
#pragma pfor
#elif defined(sun) && defined(sparc)
   retval = PAPI_thread_init((unsigned long (*)(void)) (thr_self));
   if (retval != PAPI_OK) {
      test_fail(__FILE__, __LINE__, "PAPI_thread_init", retval);
   }
#pragma MP taskloop private(i)
#else
#error "Architecture not included in this test file yet."
#endif
   for (i = 1; i < 3; i++)
      Thread(i, 10000000 * i);

   /* End of the timed region; note cycles are sampled before microseconds
      here, the reverse of the start. */
   elapsed_cyc = PAPI_get_real_cyc() - elapsed_cyc;

   elapsed_us = PAPI_get_real_usec() - elapsed_us;

   if (!TESTS_QUIET) {
      printf("Master real usec   : \t%lld\n", elapsed_us);
      printf("Master real cycles : \t%lld\n", elapsed_cyc);
   }
   test_pass(__FILE__, NULL, 0);
   /* NOTE(review): presumably test_pass() does not return; if it does, the
      process exits with status 1 despite passing -- confirm. */
   exit(1);
}
Beispiel #5
0
/*
 * Worker run once per parallel iteration.
 *
 * Counts two events (chosen by add_two_events()) around do_flops( n ),
 * also timing that call with the PAPI real-time clocks, and prints the
 * results unless TESTS_QUIET is set. Unregisters the calling thread from
 * PAPI before returning.
 *
 * t: identifier printed in front of every output line.
 * n: work amount handed to do_flops().
 */
void
Thread( int t, int n )
{
	const int num_tests = 1;
	int rc;
	int event_set = PAPI_NULL;
	int event_code, event_mask;
	int event_count;
	long long **counts;
	long long wall_us, wall_cyc;
	char event_label[PAPI_MAX_STR_LEN];

	/* The helper adds PAPI_TOT_CYC and one of PAPI_FP_INS, PAPI_FP_OPS or
	   PAPI_TOT_INS, depending on what the platform provides. */
	event_set = add_two_events( &event_count, &event_code, &event_mask );

	rc = PAPI_event_code_to_name( event_code, event_label );
	if ( rc != PAPI_OK ) {
		test_fail( __FILE__, __LINE__, "PAPI_event_code_to_name", rc );
	}

	counts = allocate_test_space( num_tests, event_count );

	rc = PAPI_start( event_set );
	if ( rc != PAPI_OK ) {
		test_fail( __FILE__, __LINE__, "PAPI_start", rc );
	}

	/* The wall clock window wraps only the workload itself. */
	wall_us = PAPI_get_real_usec(  );
	wall_cyc = PAPI_get_real_cyc(  );

	do_flops( n );

	wall_us = PAPI_get_real_usec(  ) - wall_us;
	wall_cyc = PAPI_get_real_cyc(  ) - wall_cyc;

	rc = PAPI_stop( event_set, counts[0] );
	if ( rc != PAPI_OK ) {
		test_fail( __FILE__, __LINE__, "PAPI_stop", rc );
	}

	remove_test_events( &event_set, event_mask );

	/* Counter values have to be printed before their storage is freed. */
	if ( !TESTS_QUIET ) {
		const long long *row = counts[0];

		printf( "Thread %#x %-12s : \t%lld\n", t, event_label, row[1] );
		printf( "Thread %#x PAPI_TOT_CYC : \t%lld\n", t, row[0] );
	}

	free_test_space( counts, num_tests );

	if ( !TESTS_QUIET ) {
		printf( "Thread %#x Real usec    : \t%lld\n", t, wall_us );
		printf( "Thread %#x Real cycles  : \t%lld\n", t, wall_cyc );
	}

	PAPI_unregister_thread(  );
}
Beispiel #6
0
/*
 * OpenMP worker: counts two events (chosen by add_two_events()) around
 * do_flops( n ), times the counted region with the PAPI real-time clocks,
 * and prints the results unless TESTS_QUIET is set. The "started" and
 * "finished" lines are printed unconditionally. Unregisters the calling
 * thread from PAPI before returning.
 *
 * n: work amount handed to do_flops().
 *
 * Any PAPI call that does not return PAPI_OK is reported through
 * test_fail().
 */
void Thread( int n ){

	int retval, num_tests = 1;
	int EventSet1 = PAPI_NULL;
	int PAPI_event, mask1;
	int num_events1;
	long long **values;
	long long elapsed_us, elapsed_cyc;
	char event_name[PAPI_MAX_STR_LEN];

	printf( "Thread %#x started\n", omp_get_thread_num(  ) );
	num_events1 = 2;

	EventSet1 = add_two_events( &num_events1, &PAPI_event, &mask1 );

	retval = PAPI_event_code_to_name( PAPI_event, event_name );
	if ( retval != PAPI_OK )
		test_fail( __FILE__, __LINE__, "PAPI_event_code_to_name", retval );

	values = allocate_test_space( num_tests, num_events1 );

	elapsed_us = PAPI_get_real_usec(  );

	elapsed_cyc = PAPI_get_real_cyc(  );

	/* The start/stop results used to be stored and then ignored, so a
	   failed start silently produced meaningless counter values. */
	retval = PAPI_start( EventSet1 );
	if ( retval != PAPI_OK )
		test_fail( __FILE__, __LINE__, "PAPI_start", retval );

	do_flops( n );

	retval = PAPI_stop( EventSet1, values[0] );
	if ( retval != PAPI_OK )
		test_fail( __FILE__, __LINE__, "PAPI_stop", retval );

	elapsed_us = PAPI_get_real_usec(  ) - elapsed_us;

	elapsed_cyc = PAPI_get_real_cyc(  ) - elapsed_cyc;

	remove_test_events( &EventSet1, mask1 );

	if ( !TESTS_QUIET ) {
		printf( "Thread %#x %-12s : \t%lld\n", omp_get_thread_num(  ),
			event_name, values[0][1] );
		printf( "Thread %#x PAPI_TOT_CYC: \t%lld\n", omp_get_thread_num(  ),
			values[0][0] );
		printf( "Thread %#x Real usec   : \t%lld\n", omp_get_thread_num(  ),
			elapsed_us );
		printf( "Thread %#x Real cycles : \t%lld\n", omp_get_thread_num(  ),
			elapsed_cyc );
	}

	free_test_space( values, num_tests );

	PAPI_unregister_thread(  );
	printf( "Thread %#x finished\n", omp_get_thread_num(  ) );
}
Beispiel #7
0
/*
 * Worker for the shmem test: counts the events selected by mask 0x5 via
 * add_test_events() around do_flops( n ), times the counted region with the
 * PAPI real-time clocks and prints the first two counter values plus the
 * elapsed microseconds and cycles.
 *
 * n: work amount handed to do_flops().
 *
 * Exits the process with status 1 if PAPI_start() or PAPI_stop() does not
 * return PAPI_OK.
 */
void
Thread( int n )
{
	int retval, num_tests = 1;
	int EventSet1 = PAPI_NULL;
	int mask1 = 0x5;
	int num_events1;
	long long **values;
	long long elapsed_us, elapsed_cyc;
	unsigned long self;

	EventSet1 = add_test_events( &num_events1, &mask1 );

	values = allocate_test_space( num_tests, num_events1 );

	elapsed_us = PAPI_get_real_usec(  );

	elapsed_cyc = PAPI_get_real_cyc(  );

	/* The previous test was `retval >= PAPI_OK`, which bailed out on
	   success and carried on after an error. */
	retval = PAPI_start( EventSet1 );
	if ( retval != PAPI_OK )
		exit( 1 );

	do_flops( n );

	retval = PAPI_stop( EventSet1, values[0] );
	if ( retval != PAPI_OK )
		exit( 1 );

	elapsed_us = PAPI_get_real_usec(  ) - elapsed_us;

	elapsed_cyc = PAPI_get_real_cyc(  ) - elapsed_cyc;

	remove_test_events( &EventSet1, mask1 );

	/* pthread_t is an opaque type; convert it explicitly so the argument
	   matches the printf conversion. */
	self = ( unsigned long ) pthread_self(  );

	printf( "Thread %#lx PAPI_FP_INS : \t%lld\n", self, ( values[0] )[0] );
	printf( "Thread %#lx PAPI_TOT_CYC: \t%lld\n", self, ( values[0] )[1] );
	printf( "Thread %#lx Real usec   : \t%lld\n", self, elapsed_us );
	printf( "Thread %#lx Real cycles : \t%lld\n", self, elapsed_cyc );

	free_test_space( values, num_tests );
}
Beispiel #8
0
/*
 * Driver for the shmem test: starts two processing elements, runs
 * Thread() with a work amount of 1000000 * (PE number + 1), and prints the
 * elapsed wall clock time of the whole run in microseconds and cycles.
 *
 * NOTE(review): no PAPI_library_init() call is visible before the timer
 * calls here -- confirm initialisation happens elsewhere.
 */
int
main( void )
{
	long long elapsed_us, elapsed_cyc;

	elapsed_us = PAPI_get_real_usec(  );

	elapsed_cyc = PAPI_get_real_cyc(  );

	start_pes( 2 );
	Thread( 1000000 * ( _my_pe(  ) + 1 ) );

	/* Cycles are sampled before microseconds here, the reverse of the
	   start. */
	elapsed_cyc = PAPI_get_real_cyc(  ) - elapsed_cyc;

	elapsed_us = PAPI_get_real_usec(  ) - elapsed_us;

	printf( "Master real usec   : \t%lld\n", elapsed_us );
	printf( "Master real cycles : \t%lld\n", elapsed_cyc );

	exit( 0 );
}
Beispiel #9
0
/*
 * Records the "end" sample of every measurement that is compiled in.
 *
 * tid: index into thr_vars[] (presumably the thread id -- confirm).
 * aid: index into the per-thread _tmp_* arrays (presumably identifies the
 *      measured region/activity -- confirm).
 *
 * Each MEASURE_* macro enables one section. Counter reads that do not
 * return PAPI_OK are reported through papi_fail().
 */
void PAPI_HW_COUNTER_off(int tid, int aid)
{
  int retval;

  /* Wall clock end stamp, in cycles. */
  #ifdef MEASURE_TIME
    thr_vars[tid]._tmp_time[aid].end=PAPI_get_real_cyc();
  #endif

  /* Refresh thr_vars[tid].values from the hardware counter event set.
     The MEASURE_CPI .. MEASURE_DCACHEMISS sections below copy from that
     array; NOTE(review): they read stale or unset data unless
     MEASURE_HW_COUNTER is defined as well -- confirm the build guarantees
     this. */
  #ifdef MEASURE_HW_COUNTER
    retval=PAPI_read(thr_vars[tid].EventSet, thr_vars[tid].values);
    if (retval != PAPI_OK) {
       papi_fail(__FILE__, __LINE__, "PAPI_read()", retval);
    }
  #endif
 
  /* Each copy below picks its value through the matching papi_idx_* slot. */
  #ifdef MEASURE_CPI
    thr_vars[tid]._tmp_inst[aid].end=thr_vars[tid].values[thr_vars[tid].papi_idx_inst];
    thr_vars[tid]._tmp_cyc[aid].end=thr_vars[tid].values[thr_vars[tid].papi_idx_cyc];
  #endif
 
  #ifdef MEASURE_MEMACC
    thr_vars[tid]._tmp_load[aid].end=thr_vars[tid].values[thr_vars[tid].papi_idx_load];
    thr_vars[tid]._tmp_store[aid].end=thr_vars[tid].values[thr_vars[tid].papi_idx_store];
  #endif

  #ifdef MEASURE_LLCMISS
    thr_vars[tid]._tmp_llcmiss[aid].end=thr_vars[tid].values[thr_vars[tid].papi_idx_llcmiss];
  #endif

  #ifdef MEASURE_ICACHEMISS
    thr_vars[tid]._tmp_icachemiss[aid].end=thr_vars[tid].values[thr_vars[tid].papi_idx_icachemiss];
  #endif

  #ifdef MEASURE_DCACHEMISS
    thr_vars[tid]._tmp_l1dcm[aid].end=thr_vars[tid].values[thr_vars[tid].papi_idx_l1dcm];
    thr_vars[tid]._tmp_l1dca[aid].end=thr_vars[tid].values[thr_vars[tid].papi_idx_l1dca];
  #endif

  /* Energy uses its own event set and value array. Note the index order:
     _tmp_energies is addressed [event][aid], unlike the arrays above. */
  #ifdef MEASURE_ENERGY
    retval=PAPI_read(thr_vars[tid].EnergyEventSet, thr_vars[tid].energy_values);
    if (retval != PAPI_OK) {
       papi_fail(__FILE__, __LINE__, "PAPI_read()", retval);
    }
    int i;
    for(i=0;i<thr_vars[tid].num_energy_events;i++){
      thr_vars[tid]._tmp_energies[i][aid].end=thr_vars[tid].energy_values[i];
    }
  #endif
}
Beispiel #10
0
/*
 * One-time capture of the reference timestamps for every timer backend
 * that is compiled in (USING_PAPI, USING_CLOCK_GETTIME,
 * USING_GETTIMEOFDAY). Calls after the first return immediately.
 *
 * NOTE(review): the one-shot flag is a plain static int, so concurrent
 * first calls are not guarded here -- confirm callers are single-threaded
 * at this point.
 */
static void init_timer() {
    static int initialized = 0;

    if (initialized) {
        return;
    }

#ifdef USING_PAPI
    /* NOTE(review): the return value of PAPI_library_init() is not
       checked -- confirm whether a failure should be reported. */
    PAPI_library_init(PAPI_VER_CURRENT);

    init_usec = PAPI_get_real_usec();
    init_cycles = PAPI_get_real_cyc();
#endif

#ifdef USING_CLOCK_GETTIME
    clock_getres(clockid, &ts_res);
    clock_gettime(clockid, &ts_init);
#endif

#ifdef USING_GETTIMEOFDAY
    gettimeofday(&tv_init, NULL);
#endif

    initialized = 1;
}
Beispiel #11
0
int main(int argc, char *argv[])
{
  int size, rank, world_rank, my_group;
  int num_lsms; // number of parallel LSMS instances
  int size_lsms; // number of atoms in a lsms instance
  int num_steps; // number of energy calculations
  int initial_steps; // number of steps before sampling starts
  int stepCount=0; // count the Monte Carlo steps executed
  double max_time; // maximum walltime for this run in seconds
  bool restrict_time = false;       // was the maximum time specified?
  bool restrict_steps = false; // or the max. numer of steps?
  int align; // alignment of lsms_instances
  
  double magnetization;
  double energy_accumulator; // accumulates the enegy to calculate the mean
  int energies_accumulated;


  int new_peid,new_root;
  static int op,flag;
  double *evec,*r_values;
  evec=(double *)shmalloc(sizeof(double)*3*size_lsms);
  r_values=(double *)shmalloc(sizeof(double)*(R_VALUE_OFFSET+3*(size_lsms+1)));




  energy_accumulator=0.0;
  energies_accumulated=0;

  double walltime_0,walltime;

  double restartWriteFrequency=30.0*60.0;
  double nextWriteTime=restartWriteFrequency;

  MPI_Comm local_comm;
  int *lsms_rank0;
  MPI_Status status;

  char prefix[40];
  char i_lsms_name[64];
  char gWL_in_name[64], gWL_out_name[64];
  char mode_name[64];
  char energy_calculation_name[64];
  char stupid[37];

  char step_out_name[64];
  char wl_step_out_name[128];
  char *wl_stepf=NULL;
  bool step_out_flag=false;
  std::ofstream step_out_file;
  typedef enum {Constant, Random, WangLandau_1d, ExhaustiveIsing, WangLandau_2d} EvecGenerationMode;
  typedef enum {MagneticMoment, MagneticMomentZ, MagneticMomentX, MagneticMomentY} SecondDimension;

  EvecGenerationMode evec_generation_mode = Constant;
  SecondDimension second_dimension = MagneticMoment;
  double ev0[3];

  bool return_moments_flag=true; // true-> return all magnetic moments from lsms run at each step.
  bool generator_needs_moment=false;

  typedef enum {OneStepEnergy, MultiStepEnergy, ScfEnergy} EnergyCalculationMode;
  EnergyCalculationMode energyCalculationMode = OneStepEnergy;
  int energyIndex=1; // index for the return value to use for the MC step (0: total energy, 1: band energy)

  ev0[0]=ev0[1]=0.0; ev0[2]=1.0;
  // size has to be align + size_lsms*num_lsms
  align=1;
  num_lsms=1;
  size_lsms=-1;
  my_group=-1;
  num_steps=1;
  initial_steps=0;

  sprintf(i_lsms_name,"i_lsms");
  gWL_in_name[0]=gWL_out_name[0]=0;
  mode_name[0]=0;
  energy_calculation_name[0]=0;

  // check command line arguments
  for(int i=0; i<argc; i++)
  {
    if(!strcmp("-num_lsms",argv[i])) num_lsms=atoi(argv[++i]);
    if(!strcmp("-size_lsms",argv[i])) size_lsms=atoi(argv[++i]);
    if(!strcmp("-align",argv[i])) align=atoi(argv[++i]);
    if(!strcmp("-num_steps",argv[i])) {num_steps=atoi(argv[++i]); restrict_steps=true;}
    if(!strcmp("-initial_steps",argv[i])) initial_steps=atoi(argv[++i]); 
    if(!strcmp("-walltime",argv[i])) {max_time=60.0*atof(argv[++i]); restrict_time=true;}
    if(!strcmp("-i",argv[i])) strncpy(i_lsms_name,argv[++i],64);
    if(!strcmp("-random_dir",argv[i])) {evec_generation_mode = Random;}
    if(!strcmp("-step_out",argv[i]))
    {strncpy(step_out_name,argv[++i],64); step_out_flag=true;
      return_moments_flag=true;}
    if(!strcmp("-wl_out", argv[i])) strncpy(gWL_out_name,argv[++i],64);
    if(!strcmp("-wl_in", argv[i])) strncpy(gWL_in_name,argv[++i],64);
    if(!strcmp("-mode", argv[i])) strncpy(mode_name,argv[++i],64);
    if(!strcmp("-energy_calculation",argv[i])) strncpy(energy_calculation_name,argv[++i],64);
  }

  if(!(restrict_steps || restrict_time)) restrict_steps=true;

  if(mode_name[0]!=0)
  {
    if(!strcmp("constant",mode_name)) evec_generation_mode = Constant;
    if(!strcmp("random",mode_name)) evec_generation_mode = Random;
    if(!strcmp("1d",mode_name)) evec_generation_mode = WangLandau_1d;
    if(!strcmp("ising",mode_name)) evec_generation_mode = ExhaustiveIsing;
    if(!strcmp("2d",mode_name)) evec_generation_mode = WangLandau_2d;
    if(!strcmp("2d-m",mode_name)) {evec_generation_mode = WangLandau_2d; second_dimension=MagneticMoment;}
    if(!strcmp("2d-x",mode_name)) {evec_generation_mode = WangLandau_2d; second_dimension=MagneticMomentX;}
    if(!strcmp("2d-y",mode_name)) {evec_generation_mode = WangLandau_2d; second_dimension=MagneticMomentY;}
    if(!strcmp("2d-z",mode_name)) {evec_generation_mode = WangLandau_2d; second_dimension=MagneticMomentZ;}
  }

  if(energy_calculation_name[0]!=0)
  {
    if(energy_calculation_name[0]=='o') { energyCalculationMode = OneStepEnergy; energyIndex=1; }
    if(energy_calculation_name[0]=='m') { energyCalculationMode = MultiStepEnergy; energyIndex=1; }
    if(energy_calculation_name[0]=='s') { energyCalculationMode = ScfEnergy; energyIndex=0; }
  }

#ifdef USE_PAPI
#define NUM_PAPI_EVENTS 4
  int hw_counters = PAPI_num_counters();
  if(hw_counters>NUM_PAPI_EVENTS) hw_counters=NUM_PAPI_EVENTS;
  int papi_events[NUM_PAPI_EVENTS]; // = {PAPI_TOT_INS,PAPI_TOT_CYC,PAPI_FP_OPS,PAPI_VEC_INS};
  char *papi_event_name[] = {"PAPI_TOT_INS","PAPI_FP_OPS",
                             "RETIRED_SSE_OPERATIONS:DOUBLE_ADD_SUB_OPS:DOUBLE_MUL_OPS:DOUBLE_DIV_OPS:OP_TYPE",
                             "RETIRED_SSE_OPERATIONS:SINGLE_ADD_SUB_OPS:SINGLE_MUL_OPS:SINGLE_DIV_OPS:OP_TYPE"};
  // "RETIRED_INSTRUCTIONS",
  // "RETIRED_MMX_AND_FP_INSTRUCTIONS:PACKED_SSE_AND_SSE2",
  // "RETIRED_SSE_OPERATIONS:DOUBLE_ADD_SUB_OPS:DOUBLE_MUL_OPS:DOUBLE_DIV_OPS:1",
  // "RETIRED_SSE_OPERATIONS:SINGLE_ADD_SUB_OPS:SINGLE_MUL_OPS:SINGLE_DIV_OPS:1"
  // get events from names:
  for(int i=0; i<NUM_PAPI_EVENTS; i++)
  {
    if(PAPI_event_name_to_code(papi_event_name[i],&papi_events[i]) != PAPI_OK)
    {
      // printline("Error in obtaining PAPI event code for: "+ttos(papi_event_name[i]),
      //           std::cerr,parameters.myrankWorld);
      // printline("Skipping all following events",
      //           std::cerr,parameters.myrankWorld);
      if(hw_counters>i) hw_counters=i;
    }
  }
  long long papi_values[NUM_PAPI_EVENTS+4];
  // printline("PAPI: "+ttos(hw_counters)+" counters available",std::cout,parameters.myrankWorld);
  if(hw_counters>NUM_PAPI_EVENTS) hw_counters=NUM_PAPI_EVENTS;
  long long papi_real_cyc_0 = PAPI_get_real_cyc();
  long long papi_real_usec_0 = PAPI_get_real_usec();
  long long papi_virt_cyc_0 = PAPI_get_virt_cyc();
  long long papi_virt_usec_0 = PAPI_get_virt_usec();
  PAPI_start_counters(papi_events,hw_counters);
#endif


  lsms_rank0=(int *)malloc(sizeof(int)*(num_lsms+1));

  // initialize MPI:
  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  world_rank=rank;
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  walltime_0 = get_rtc();

#ifndef SVN_REV
#define SVN_REV "unknown"
#endif

// make sure 'return_moments_flag' is set correctly
  switch(evec_generation_mode)
  {
  case Constant : break;
  case Random : break;
  case WangLandau_1d :
    return_moments_flag = true;
    generator_needs_moment = true;
    break;
  case ExhaustiveIsing : break;
  case WangLandau_2d :
    return_moments_flag = true;
    generator_needs_moment = true;
    break;
  default: std::cout<<" ERROR: UNKNOWN EVEC GENERATION MODE\n"; exit(1);
  }

  if(rank==0)
  {
    std::cout<<"LSMS_3"<<std::endl;
    std::cout<<" SVN revision "<<SVN_REV<<std::endl<<std::endl;
#ifdef USE_PAPI
    std::cout<<" Using Papi counters"<<std::endl<<std::endl; 
#endif
    std::cout<<" Size of LSMS instances = "<<size_lsms<<" atoms\n";
    std::cout<<" Number of LSMS instances = "<<num_lsms<<std::endl;
    std::cout<<" LSMS Energy calculated using ";
    switch(energyCalculationMode)
    {
    case OneStepEnergy: std::cout<<"oneStepEnergy [frozen potential band energy]"<<std::endl; break;
    case MultiStepEnergy: std::cout<<"multiStepEnergy [frozen potential band energy with converged Fermi energy]"<<std::endl; break;
    case ScfEnergy: std::cout<<"scfEnergy [self-consistent total energy]"<<std::endl; break;
    default: std::cout<<"UNKNOWN ENERGY CALCULATION METHOD"<<std::endl; exit(1);
    }
    if(restrict_steps) std::cout<<" Number of gWL steps = "<<num_steps<<std::endl;
    if(restrict_time) std::cout<<" Maximum walltime = "<<max_time<<"s\n";
    std::cout<<" Processor alignment (process allocation quantization) = "<<align<<std::endl;
    switch(evec_generation_mode)
    {
    case Constant : std::cout<<" Constant moments direction along "
                             <<ev0[0]<<" "<<ev0[1]<<" "<<ev0[2]<<std::endl;
      break;
    case Random : std::cout<<" Random distribution of moments (no Wang-Landau)"<<std::endl;
      break;
    case WangLandau_1d : std::cout<<" Wang-Landau for one continuous variable (energy)"<<std::endl;
//      return_moments_flag = true;
//      generator_needs_moment = true;
      break;
    case ExhaustiveIsing : std::cout<<" Exhaustive Ising sampling"<<std::endl; break;
    case WangLandau_2d : std::cout<<" Wang-Landau for two continuous variable (energy, ";
      switch(second_dimension)
      {
      case MagneticMoment  : std::cout<<"magnitude of magnetization)"; break;
      case MagneticMomentX : std::cout<<"x component of magnetization)"; break;
      case MagneticMomentY : std::cout<<"y component of magnetization)"; break;
      case MagneticMomentZ : std::cout<<"z component of magnetization)"; break;
      }
      std::cout<<std::endl;
//      return_moments_flag = true;
//      generator_needs_moment = true;
      break;
    default: std::cout<<" ERROR: UNKNOWN EVEC GENERATION MODE\n"; exit(1);
    }
    if(step_out_flag) std::cout<<" Step output written to: "<<step_out_name<<std::endl;
    std::cout<<std::endl;

    if(step_out_flag && (evec_generation_mode==WangLandau_1d))
    {
      // step_out_flag=false;
      snprintf(wl_step_out_name,127,"wl1d_%s",step_out_name);
      wl_stepf=wl_step_out_name;
    }

    if(step_out_flag)
    {
      step_out_file.open(step_out_name);
      step_out_file<<"#";
      for(int i=0; i<argc; i++) step_out_file<<" "<<argv[i];
      step_out_file<<std::endl<<size_lsms<<std::endl;
    }
  }

  if(generator_needs_moment) return_moments_flag=true;

  if(num_lsms==1)
  {
    SHMEM_activeset local_comm;
    local_comm.rank=shmem_my_pe();
    local_comm.size=shmem_n_pes();
    local_comm.start_pe=0;
    local_comm.logPE_stride=0;
    LSMS lsms_calc(local_comm,i_lsms_name,"1_");
      
    if(rank==0)
    {
      std::cout<<"executing LSMS(C++) for "<<lsms_calc.numSpins()<<" atoms\n";
      std::cout<<"  LSMS version = "<<lsms_calc.version()<<std::endl;
    }

    if(energyCalculationMode==OneStepEnergy)
      std::cout<<"one step Energy = "<<lsms_calc.oneStepEnergy()<<std::endl;
    else if(energyCalculationMode==MultiStepEnergy)
      std::cout<<"multi-step Energy = "<<lsms_calc.multiStepEnergy()<<std::endl;
    else if(energyCalculationMode==ScfEnergy)
      std::cout<<"self-consistent Energy = "<<lsms_calc.scfEnergy()<<std::endl;
    else
    {
      printf("ERROR: Unknown energy calculation mode for lsms_calc in wl-lsms main!\n");
     // MPI_Abort(MPI_COMM_WORLD,5);
      exit(5);
    }
  }
  else
  {
    // build the communicators
    //int color=MPI_UNDEFINED;
    //Assuming user passes a power of two while using "-align"
    int s = align;
    int comm_size=(size-align)/num_lsms;
    int world_rank;
    for(int i=0; i<num_lsms; i++)
    {
      if((world_rank>=s) && (world_rank<s+comm_size)) 
      { 
        my_group=i; 
        //color=i; 
        new_peid=world_rank-s;
        new_root=s;
      }
      lsms_rank0[i]=s;
      s+=comm_size;
    }
    if(world_rank==0){ 
      //color=num_lsms;
      new_peid=0;
      comm_size=1;
      new_root=0;
    }

    //MPI_Comm_split(MPI_COMM_WORLD, color, 0, &local_comm);
    SHMEM_activeset local_comm;
    local_comm.rank=new_peid;
    local_comm.size=comm_size;
    local_comm.start_pe=new_root;
    local_comm.logPE_stride=0;

    std::cout<<"world_rank="<<world_rank<<" -> group="<<my_group<<std::endl;

      
    snprintf(prefix,38,"Group %4d: ",my_group);

    // now we get ready to do some calculations...

    if(my_group>=0)
    {
      double energy;
      double band_energy;
      int static i_values[10];
      double static r_values[10];
      static int op;


      //MPI_Comm_rank(local_comm, &rank);
      rank = local_comm.rank;
      snprintf(prefix,38,"%d_",my_group);
      // to use the ramdisk on jaguarpf:
      // snprintf(prefix,38,"/tmp/ompi/%d_",my_group);
      LSMS lsms_calc(local_comm,i_lsms_name,prefix);
      snprintf(prefix,38,"Group %4d: ",my_group);

      if(rank==0 && my_group==0)
      {
        std::cout<<prefix<<"executing LSMS(C++) for "<<lsms_calc.numSpins()<<" atoms\n";
        std::cout<<prefix<<"  LSMS version = "<<lsms_calc.version()<<std::endl;
      }

      // wait for commands from master
      bool finished=false;
      while(!finished)
      {
        if(rank==0)
        {
          //MPI_Recv(evec,3*size_lsms,MPI_DOUBLE,0,MPI_ANY_TAG,MPI_COMM_WORLD,&status);
          //op =status.MPI_TAG;
          if (lsms_rank0[0]==world_rank)
                shmem_barrier(0, lsms_rank0[0], 2, pSync1);

        }
        //MPI_Bcast(&op,1,MPI_INT,0,local_comm);
        shmem_broadcast32(&op, &op, 1, local_comm.start_pe, local_comm.start_pe, local_comm.logPE_stride, local_comm.size, pSync2); 

/* recognized opcodes:
   5: calculate energy

   recognized energy calculation modes:
   OneStepEnergy : calclulate frozen potential band energy in one step (don't converge Ef)
   use only if the Fermi energy will not change due to MC steps!
   The only method available in LSMS_1.9
   MultiStepEnergy : calculate frozen potential band energy after converging Fermi energy
   This should be the new default method. If the Fermi energy doesn't change
   multiStepEnergy only performs one step and should be equivalent to oneStepEnergy
   The tolerance for Ef convergence can be set with LSMS::setEfTol(Real).
   The default tolerance is set in the LSMS::LSMS constructor (currently 1.0e-6).
   The maximum number of steps is read from the LSMS input file 'nscf' parameter.
   ScfEnergy : this will calculate the selfconsistent total energy.
   The maximum number of steps is read from the LSMS input file 'nscf' parameter.
   NOT IMPLEMENTED YET!!!

   10: get number of sites
*/

        if(op==5)
        {
          lsms_calc.setEvec(evec);
          if(energyCalculationMode==OneStepEnergy)
            energy=lsms_calc.oneStepEnergy(&band_energy);
          else if(energyCalculationMode==MultiStepEnergy)
            band_energy=energy=lsms_calc.multiStepEnergy();
          else if(energyCalculationMode==ScfEnergy)
            energy=lsms_calc.scfEnergy(&band_energy);
          else
          {
            printf("ERROR: Unknown energy calculation mode for lsms_calc in wl-lsms main!\n");
            //MPI_Abort(MPI_COMM_WORLD,5);
            exit(5);
          }
          r_values[0]=energy;
          r_values[1]=band_energy;
          if(return_moments_flag)
          {
            lsms_calc.getMag(&r_values[R_VALUE_OFFSET]);
          }
          if(rank==0)
          {
            if(return_moments_flag)
            {
              //MPI_Send(r_values,R_VALUE_OFFSET+3*size_lsms,MPI_DOUBLE,0,1005,MPI_COMM_WORLD);
              shmem_double_put(r_values, r_values, R_VALUE_OFFSET+3*size_lsms, 0);

            } else {
              //MPI_Send(r_values,R_VALUE_OFFSET,MPI_DOUBLE,0,1005,MPI_COMM_WORLD);
              shmem_double_put(r_values, r_values, R_VALUE_OFFSET, 0);
            }
            shmem_fence();
            shmem_int_swap(&flag, world_rank, 0);

          }
              
        } else if(op==10) {
          i_values[0]=lsms_calc.numSpins();
          //MPI_Send(i_values,10,MPI_INT,0,1010,MPI_COMM_WORLD);
          shmem_int_put(i_values, i_values, 10, 0);
        } else {
          // printf("world rank %d: recieved exit\n",world_rank); 
          finished=true;
        }
      }

      shfree(evec);
      //shfree(r_values);
    }
    else if(world_rank==0)
    {
      int running;
      double **evecs;
      //double *r_values;
      //int i_values[10];
      int *init_steps;
      int total_init_steps;
      bool accepted;
        
      char *wl_inf=NULL;
      char *wl_outf=NULL;
      if(gWL_in_name) wl_inf=gWL_in_name;
      if(gWL_out_name) wl_outf=gWL_out_name;
        
      EvecGenerator *generator;

/*
      // get number of spins from first LSMS instance
      // temp r_values:
      r_values=(double *)malloc(sizeof(double)*10);
      MPI_Send(r_values,1,MPI_DOUBLE, lsms_rank0[0], 10, MPI_COMM_WORLD);
      free(r_values);
      MPI_Recv(i_values,10,MPI_INT,lsms_rank0[0],1010,MPI_COMM_WORLD,&status);
      if(i_values[0]!=size_lsms)
      {
        printf("Size specified for Wang-Landau and in LSMS input file don't match!\n");
        size_lsms=i_values[0];
      }
*/

      evecs=(double **)shmalloc(sizeof(double *)*num_lsms);
      init_steps=(int *)shmalloc(sizeof(int)*num_lsms);
      for(int i=0; i<num_lsms; i++)
      {
        evecs[i]=(double *)shmalloc(sizeof(double)*3*size_lsms);
        init_steps[i]=initial_steps;
      }
      total_init_steps=num_lsms*initial_steps;
        

      // Initialize the correct evec generator
      switch(evec_generation_mode)
      {
      case Random :  generator = new RandomEvecGenerator(size_lsms);
        break;
      case Constant: generator = new ConstantEvecGenerator(size_lsms, ev0, num_lsms);
        break;
     //case WangLandau_1d : generator = new WL1dEvecGenerator<std::mt19937>(size_lsms, num_lsms,
     //                                                                      evecs, wl_inf, wl_outf, wl_stepf);
     case WangLandau_1d : generator = new WL1dEvecGenerator<boost::mt19937>(size_lsms, num_lsms,
                                                                           evecs, wl_inf, wl_outf, wl_stepf);
        break;
      case ExhaustiveIsing : generator = new ExhaustiveIsing1dEvecGenerator(size_lsms, num_lsms,
                                                                            evecs, wl_inf, wl_outf);
        break;
      //case WangLandau_2d : generator = new WL2dEvecGenerator<std::mt19937>(size_lsms, num_lsms,
      //                                                                     evecs, wl_inf, wl_outf, wl_stepf);
      case WangLandau_2d : generator = new WL2dEvecGenerator<boost::mt19937>(size_lsms, num_lsms,
                                                                           evecs, wl_inf, wl_outf, wl_stepf);
        break;
      default: std::cerr<<"The code should never arrive here: UNKNOWN EVEC GENERATION MODE\n";
        exit(1);
      }

      for(int i=0; i<num_lsms; i++)
      {
        generator->initializeEvec(i,evecs[i]);
      }
      std::cout<<"This is the master node\n";
      // issue initial commands to all LSMS instances
      running=0;
      bool more_work=true;
      if(total_init_steps>0)
      {
        for(int i=0; i<num_lsms; i++)
        {
          std::cout<<"starting initial calculation in group "<<i<<std::endl;
          //MPI_Send(evecs[i], 3*size_lsms, MPI_DOUBLE, lsms_rank0[i], 5, MPI_COMM_WORLD);
          shmem_double_put(evec, evecs[i], 3*size_lsms, lsms_rank0[i]);
          shmem_int_p(&op, 5, lsms_rank0[i]);
          shmem_fence();


          num_steps--; running++; stepCount++;
          if(restrict_steps) std::cout<<"      "<<num_steps<<" steps remaining\n";
        }
        shmem_barrier(0, lsms_rank0[0], 2, pSync1);
        // first deal with the initial steps:
        while(running>0)
        {
          //if(return_moments_flag)
          //  MPI_Recv(r_values,R_VALUE_OFFSET+3*size_lsms,MPI_DOUBLE,MPI_ANY_SOURCE,MPI_ANY_TAG,MPI_COMM_WORLD,&status);
          //else
          //  MPI_Recv(r_values,R_VALUE_OFFSET,MPI_DOUBLE,MPI_ANY_SOURCE,MPI_ANY_TAG,MPI_COMM_WORLD,&status);
          
          shmem_int_wait(&flag,-1);

          running--;
          // std::cout<<"received energy E_tot ="<<r_values[0]<<std::endl;
          // std::cout<<"    band energy E_band="<<r_values[1]<<std::endl;
          if(total_init_steps>0)
          {
            //int r_group=(status.MPI_SOURCE-align)/comm_size;
            int r_group=(flag-align)/comm_size;
            std::cout<<"starting additional calculation in group "<<r_group<<std::endl;

            if(init_steps[r_group]>0)
            {
              more_work = !(generator->generateUnsampledEvec(r_group,evecs[r_group],r_values[energyIndex]));
              init_steps[r_group]--; total_init_steps--;
            }
                
            //MPI_Send(evecs[r_group], 3*size_lsms, MPI_DOUBLE, lsms_rank0[r_group], 5, MPI_COMM_WORLD);
            shmem_double_put(r_values, evecs[r_group],  3*size_lsms, lsms_rank0[r_group]); //TODO check this
            shmem_fence();
                
            num_steps--; running++; stepCount++;
            if(restrict_steps && num_steps<=0) more_work=false;
            if(restrict_steps) std::cout<<"      "<<num_steps<<" steps remaining\n";
            walltime = get_rtc() - walltime_0;
            if(restrict_time && walltime>=max_time) more_work=false;
            if(restrict_time) std::cout<<"      "<<max_time-walltime<<" seconds remaining\n";
          }
              
        }
      }
      more_work=true;
      running=0;
      for(int i=0; i<num_lsms; i++)
      {
        std::cout<<"starting main calculation in group "<<i<<std::endl;
        //MPI_Send(evecs[i], 3*size_lsms, MPI_DOUBLE, lsms_rank0[i], 5, MPI_COMM_WORLD);
        shmem_double_put(evec, evecs[i], 3*size_lsms, lsms_rank0[i]);
        shmem_int_p(&op, 5, lsms_rank0[i]);
        shmem_fence();
        num_steps--; running++; stepCount++;
        if(restrict_steps) std::cout<<"      "<<num_steps<<" steps remaining\n";
      }
      shmem_barrier(0, lsms_rank0[0], 2, pSync1);
        
      generator->startSampling();
      // wait for results and issue new commands or wind down
      while(running>0)
      {
        //MPI_Recv(r_values,R_VALUE_OFFSET+3*size_lsms,MPI_DOUBLE,MPI_ANY_SOURCE,MPI_ANY_TAG,MPI_COMM_WORLD,&status);
        shmem_int_wait(&flag,-1);

        running--;
        std::cout<<"received energy E_tot ="<<r_values[0]<<std::endl;
        std::cout<<"    band energy E_band="<<r_values[1]<<std::endl;
        // printf("from status.MPI_SOURCE=%d\n",status.MPI_SOURCE);
        energy_accumulator+=r_values[0]; energies_accumulated++;
        if(more_work)
        {
          int r_group=(status.MPI_SOURCE-align)/comm_size;
          std::cout<<"starting additional calculation in group "<<r_group<<std::endl;
              
          if(generator_needs_moment)
          {
            double m0,m1,m2;
            m0=0.0; m1=0.0; m2=0.0;
            for(int i=0; i<3*size_lsms; i+=3)
            {
              m0+=r_values[R_VALUE_OFFSET+i];
              m1+=r_values[R_VALUE_OFFSET+i+1];
              m2+=r_values[R_VALUE_OFFSET+i+2];
            }
            switch(second_dimension)
            {
            case  MagneticMoment : magnetization=std::sqrt(m0*m0+m1*m1+m2*m2); break;
            case  MagneticMomentX : magnetization=m0; break;
            case  MagneticMomentY : magnetization=m1; break;
            case  MagneticMomentZ : magnetization=m2; break;
            }
            if(generator->generateEvec(r_group,evecs[r_group],r_values[energyIndex],magnetization, &accepted))
              more_work=false;
          } else {
            if(generator->generateEvec(r_group,evecs[r_group],r_values[energyIndex], &accepted)) more_work=false;
          }

          //MPI_Send(evecs[r_group], 3*size_lsms, MPI_DOUBLE, lsms_rank0[r_group], 5, MPI_COMM_WORLD);
          shmem_double_put(r_values, evecs[r_group],  3*size_lsms, lsms_rank0[r_group]); //TODO check this
          shmem_fence();

          num_steps--; running++; stepCount++;
          if(restrict_steps && num_steps<=0) more_work=false;
          if(restrict_steps) std::cout<<"      "<<num_steps<<" steps remaining\n";
          walltime = get_rtc() - walltime_0;
          if(restrict_time && walltime>=max_time) more_work=false;
          if(restrict_time) std::cout<<"      "<<max_time-walltime<<" seconds remaining\n";
        }
        else
        {
          // send an exit message to this instance of LSMS
          int r_group=(status.MPI_SOURCE-align)/comm_size;

          MPI_Send(evecs[r_group], 3*size_lsms, MPI_DOUBLE, lsms_rank0[r_group], 2, MPI_COMM_WORLD);
        }

        if(step_out_flag && accepted)
        {
          step_out_file<<"# iteration "<<energies_accumulated<<std::endl;
          step_out_file.precision(15);
          step_out_file<<energies_accumulated<<std::endl;
          step_out_file<<r_values[0]<<"  "<<r_values[1]<<std::endl;
          for(int j=0; j<3*size_lsms; j+=3)
          {
            step_out_file<<r_values[j+R_VALUE_OFFSET]<<"  "<<r_values[j+R_VALUE_OFFSET+1]
                         <<"  "<<r_values[j+R_VALUE_OFFSET+2]<<std::endl;
          }
        }
        // write restart file every restartWriteFrequency seconds
        if(walltime>nextWriteTime)
        {
          generator->writeState("WLrestart.jsn");
          nextWriteTime+=restartWriteFrequency;
        }

      }
      generator->writeState("WLrestart.jsn");
/*
  if(evec_generation_mode==WangLandau_1d)
  (static_cast<WL1dEvecGenerator<std::mt19937> *>(generator))->writeState("WLrestart.state");
  if(evec_generation_mode==ExhaustiveIsing)
  (static_cast<ExhaustiveIsing1dEvecGenerator *>(generator))->writeState("WLrestart.state");
*/
      for(int i=0; i<num_lsms; i++) free(evecs[i]);
      shfree(evecs);
      //shfree(r_values);
    }
  }

  if(world_rank==0)
  {
    if(step_out_flag)
    {
      step_out_file<<"# end\n-1\n"
                   <<energy_accumulator/double(energies_accumulated)<<std::endl;
      step_out_file.close();
    }
    std::cout<<"Finished all scheduled calculations. Freeing resources.\n";
    std::cout<<"Energy mean = "<<energy_accumulator/double(energies_accumulated)<<"Ry\n";
  }


  if(num_lsms>1)
  {
    // make sure averyone arrives here:
    MPI_Bcast(stupid,37,MPI_CHAR,0,MPI_COMM_WORLD);

    if(world_rank==0)
    {
      MPI_Comm_free(&local_comm);
    }
    else if(my_group>=0)
    {
      MPI_Comm_free(&local_comm);
    }
  }



  if(world_rank==0)
  {
    double walltime = get_rtc() - walltime_0;
    std::cout<<" WL-LSMS finished in "<<walltime<<" seconds.\n";
    std::cout<<" Monte-Carlo steps / walltime = "
             <<double(stepCount)/walltime<<"/sec\n";
  }

#ifdef USE_PAPI
  PAPI_stop_counters(papi_values,hw_counters);
  papi_values[hw_counters  ] = PAPI_get_real_cyc()-papi_real_cyc_0;
  papi_values[hw_counters+1] = PAPI_get_real_usec()-papi_real_usec_0;
  papi_values[hw_counters+2] = PAPI_get_virt_cyc()-papi_virt_cyc_0;
  papi_values[hw_counters+3] = PAPI_get_virt_usec()-papi_virt_usec_0;
  long long accumulated_counters[NUM_PAPI_EVENTS+4];
/*
  for(int i=0; i<hw_counters; i++)
  {
  printline(ttos(papi_event_name[i])+" = "+ttos(papi_values[i]),
  std::cout,parameters.myrankWorld);
  }
  printline("PAPI real cycles : "+ttos(papi_values[hw_counters]),
  std::cout,parameters.myrankWorld);
  printline("PAPI real usecs : "+ttos(papi_values[hw_counters+1]),
  std::cout,parameters.myrankWorld);
  printline("PAPI user cycles : "+ttos(papi_values[hw_counters+2]),
  std::cout,parameters.myrankWorld);
  printline("PAPI user usecs : "+ttos(papi_values[hw_counters+3]),
  std::cout,parameters.myrankWorld);
*/
  
  //MPI_Reduce(papi_values,accumulated_counters,hw_counters+4,
  //           MPI_LONG,MPI_SUM,0,MPI_COMM_WORLD);

  shmem_long_sum_to_all(accumulated_counters, papi_values, hw_counters+4,
      comm.pestart, comm.logPE_stride, comm.size, pWrk_i, pSync2);



  if(world_rank==0)
  {
    for(int i=0; i<hw_counters; i++)
    {
      std::cout<<"Accumulated: "<<(papi_event_name[i])<<" = "<<(accumulated_counters[i])<<"\n";
    }
    std::cout<<"PAPI accumulated real cycles : "<<(accumulated_counters[hw_counters])<<"\n";
    std::cout<<"PAPI accumulated user cycles : "<<(accumulated_counters[hw_counters+2])<<"\n";
    double gflops_papi = ((double)accumulated_counters[1])/
      (1000.0*(double)papi_values[hw_counters+1]);
    double gflops_hw_double = ((double)accumulated_counters[2])/
      (1000.0*(double)papi_values[hw_counters+1]);
    double gflops_hw_single = ((double)accumulated_counters[3])/
      (1000.0*(double)papi_values[hw_counters+1]);
    double gips = ((double)accumulated_counters[0])/(1000.0*(double)papi_values[hw_counters+1]);
    std::cout<<"PAPI_FP_OPS real GFLOP/s : "<<(gflops_papi)<<"\n";
    std::cout<<"PAPI hw double real GFLOP/s : "<<(gflops_hw_double)<<"\n";
    std::cout<<"PAPI hw single real GFLOP/s : "<<(gflops_hw_single)<<"\n";
    std::cout<<"PAPI real GINST/s : "<<(gips)<<"\n";
  }
#endif


  //MPI_Finalize();
  return 0;
}
Beispiel #12
0
/*
 * Driver for the multi-threaded test.
 *
 * Initialises PAPI and its thread support, stores the bounds of the
 * executable's text segment in my_start, my_end and length, then runs
 * NUM_THREADS instances of Thread().  Unless TESTS_QUIET is set, the real
 * time and real cycles that elapsed while the threads were created and
 * joined are printed.
 */
int
main( int argc, char **argv )
{
	pthread_t workers[NUM_THREADS];
	int workloads[NUM_THREADS];
	int t, create_rc, retval;
	pthread_attr_t thread_attr;
	long long wall_us, wall_cyc;
	const PAPI_exe_info_t *exe_info = NULL;

	/* Set TESTS_QUIET variable */
	tests_quiet( argc, argv );

	retval = PAPI_library_init( PAPI_VER_CURRENT );
	if ( retval != PAPI_VER_CURRENT )
		test_fail( __FILE__, __LINE__, "PAPI_library_init", retval );

	/* pthread_self is handed to PAPI as the thread-id callback */
	retval = PAPI_thread_init( ( unsigned long ( * )( void ) ) ( pthread_self ) );
	if ( retval != PAPI_OK ) {
		if ( retval != PAPI_ECMP )
			test_fail( __FILE__, __LINE__, "PAPI_thread_init", retval );
		else
			test_skip( __FILE__, __LINE__, "PAPI_thread_init", retval );
	}

	exe_info = PAPI_get_executable_info(  );
	if ( exe_info == NULL ) {
		retval = 1;
		test_fail( __FILE__, __LINE__, "PAPI_get_executable_info", retval );
	}

	/* Remember where the program text starts and ends, and its size */
	my_start = exe_info->address_info.text_start;
	my_end = exe_info->address_info.text_end;
	length = ( unsigned int ) ( my_end - my_start );

	/* Starting timestamps; turned into elapsed values after the joins */
	wall_us = PAPI_get_real_usec(  );
	wall_cyc = PAPI_get_real_cyc(  );

	pthread_attr_init( &thread_attr );
#ifdef PTHREAD_CREATE_UNDETACHED
	pthread_attr_setdetachstate( &thread_attr, PTHREAD_CREATE_UNDETACHED );
#endif
#ifdef PTHREAD_SCOPE_SYSTEM
	retval = pthread_attr_setscope( &thread_attr, PTHREAD_SCOPE_SYSTEM );
	if ( retval != 0 )
		test_skip( __FILE__, __LINE__, "pthread_attr_setscope", retval );
#endif

	/* Thread t receives FLOPS * (t + 1) as its argument */
	t = 0;
	while ( t < NUM_THREADS ) {
		workloads[t] = FLOPS * ( t + 1 );
		create_rc = pthread_create( &workers[t], &thread_attr, Thread,
									( void * ) &workloads[t] );
		if ( create_rc != 0 )
			return ( FAILURE );
		t++;
	}

	t = 0;
	while ( t < NUM_THREADS ) {
		pthread_join( workers[t], NULL );
		t++;
	}

	pthread_attr_destroy( &thread_attr );

	wall_cyc = PAPI_get_real_cyc(  ) - wall_cyc;
	wall_us = PAPI_get_real_usec(  ) - wall_us;

	if ( !TESTS_QUIET ) {
		printf( "Master real usec   : \t%lld\n", wall_us );
		printf( "Master real cycles : \t%lld\n", wall_cyc );
	}

	test_pass( __FILE__, NULL, 0 );
	pthread_exit( NULL );
	exit( 1 );
}
Beispiel #13
0
/*
 * Test case: multiple third-party attach, start and stop.
 *
 * Forks two children that run wait_for_attach_and_loop(1) and
 * wait_for_attach_and_loop(2), attaches one event set to each child,
 * counts while the children run, and finally checks that the second
 * child's counts are roughly twice those of the first.
 *
 * When the component reports attach_must_ptrace, the children are driven
 * with ptrace(): each is attached, continued up to a SIGSTOP before the
 * counters are started, continued up to the next SIGSTOP while counting,
 * and released after the counters have been stopped.
 *
 * Returns 0 after test_pass(); ptrace()/waitpid() failures end the
 * process with status 1 after perror().
 */
int
main( int argc, char **argv )
{
	int status, retval, num_tests = 2, tmp;
	int EventSet1 = PAPI_NULL, EventSet2 = PAPI_NULL;
	int PAPI_event, PAPI_event2, mask1, mask2;
	int num_events1, num_events2;
	long long **values;
	long long elapsed_us, elapsed_cyc, elapsed_virt_us, elapsed_virt_cyc;
	char event_name[PAPI_MAX_STR_LEN], add_event_str[PAPI_MAX_STR_LEN];
	const PAPI_component_info_t *cmpinfo;
	pid_t pid, pid2;
	double ratio1,ratio2;

	/* Set TESTS_QUIET variable */
	tests_quiet( argc, argv );

	/* Initialize the library */
	retval = PAPI_library_init( PAPI_VER_CURRENT );
	if ( retval != PAPI_VER_CURRENT ) {
	   test_fail_exit( __FILE__, __LINE__, "PAPI_library_init", retval );
	}

	/* get the component info and check if we support attach */
	if ( ( cmpinfo = PAPI_get_component_info( 0 ) ) == NULL ) {
	   test_fail_exit( __FILE__, __LINE__, "PAPI_get_component_info", 0 );
	}

	if ( cmpinfo->attach == 0 ) {
	   test_skip( __FILE__, __LINE__, 
		      "Platform does not support attaching", 0 );
	}

	/* fork off first child */
	pid = fork(  );
	if ( pid < 0 ) {
	   test_fail_exit( __FILE__, __LINE__, "fork()", PAPI_ESYS );
	}
	if ( pid == 0 ) {
	   exit( wait_for_attach_and_loop( 1 ) );
	}

	/* fork off second child, does twice as much */
	pid2 = fork(  );
	if ( pid2 < 0 ) {
	   test_fail_exit( __FILE__, __LINE__, "fork()", PAPI_ESYS );
	}
	if ( pid2 == 0 ) {
	   exit( wait_for_attach_and_loop( 2 ) );
	}

	/* add PAPI_TOT_CYC and one of the events in 
           PAPI_FP_INS, PAPI_FP_OPS or PAPI_TOT_INS, 
           depending on the availability of the event 
           on the platform                            */
	EventSet1 = add_two_events( &num_events1, &PAPI_event, &mask1 );
	EventSet2 = add_two_events( &num_events2, &PAPI_event2, &mask2 );

	/* ptrace-attach to both children; each must report a stop */
	if ( cmpinfo->attach_must_ptrace ) {
	   if ( ptrace( PTRACE_ATTACH, pid, NULL, NULL ) == -1 ) {
	      perror( "ptrace(PTRACE_ATTACH)" );
	      return 1 ;
	   }
	   if ( waitpid( pid, &status, 0 ) == -1 ) {
	      perror( "waitpid()" );
	      exit( 1 );
	   }
	   if ( WIFSTOPPED( status ) == 0 ) {
	      test_fail( __FILE__, __LINE__,
			"Child process didnt return true to WIFSTOPPED", 0 );
	   }
	   
	   if ( ptrace( PTRACE_ATTACH, pid2, NULL, NULL ) == -1 ) {
	      perror( "ptrace(PTRACE_ATTACH)" );
	      return 1;
	   }
	   if ( waitpid( pid2, &status, 0 ) == -1 ) {
	      perror( "waitpid()" );
	      exit( 1 );
	   }
	   if ( WIFSTOPPED( status ) == 0 ) {
 	      test_fail( __FILE__, __LINE__,
			"Child process didnt return true to WIFSTOPPED", 0 );
	   }
	}

	retval = PAPI_attach( EventSet1, ( unsigned long ) pid );
	if ( retval != PAPI_OK ) {
	   test_fail( __FILE__, __LINE__, "PAPI_attach", retval ); 
	}

	retval = PAPI_attach( EventSet2, ( unsigned long ) pid2 );
	if ( retval != PAPI_OK ) {
	   test_fail( __FILE__, __LINE__, "PAPI_attach", retval ); 
	}

	retval = PAPI_event_code_to_name( PAPI_event, event_name );
	if ( retval != PAPI_OK ) {
	   test_fail( __FILE__, __LINE__, "PAPI_event_code_to_name", retval );
	}

	/* One row per child, sized by num_events1.  Assumes num_events1 is  */
	/* at least num_events2 (both sets come from add_two_events) --      */
	/* TODO confirm.                                                     */
	values = allocate_test_space( num_tests, num_events1 );

	/* Gather before values */
	elapsed_us = PAPI_get_real_usec(  );
	elapsed_cyc = PAPI_get_real_cyc(  );
	elapsed_virt_us = PAPI_get_virt_usec(  );
	elapsed_virt_cyc = PAPI_get_virt_cyc(  );

	/* Continue each child and wait for its SIGSTOP. */
	if ( cmpinfo->attach_must_ptrace ) {
	   if ( ptrace( PTRACE_CONT, pid, NULL, NULL ) == -1 ) {
	      perror( "ptrace(PTRACE_CONT)" );
	      return 1;
	   }
	   if ( waitpid( pid, &status, 0 ) == -1 ) {
	      perror( "waitpid()" );
	      exit( 1 );
	   }
	   if ( WIFSTOPPED( status ) == 0 ) {
	      test_fail( __FILE__, __LINE__,
			"Child process didn't return true to WIFSTOPPED", 0 );
	   }
	   if ( WSTOPSIG( status ) != SIGSTOP ) {
	      test_fail( __FILE__, __LINE__,
			"Child process didn't stop on SIGSTOP", 0 );
	   }

	   if ( ptrace( PTRACE_CONT, pid2, NULL, NULL ) == -1 ) {
	      perror( "ptrace(PTRACE_CONT)" );
	      return 1;
	   }
	   if ( waitpid( pid2, &status, 0 ) == -1 ) {
	      perror( "waitpid()" );
	      exit( 1 );
	   }
	   if ( WIFSTOPPED( status ) == 0 ) {
	      test_fail( __FILE__, __LINE__,
			"Child process didn't return true to WIFSTOPPED", 0 );
	   }
	   if ( WSTOPSIG( status ) != SIGSTOP ) {
	      test_fail( __FILE__, __LINE__,
			"Child process didn't stop on SIGSTOP", 0 );
	   }
	}

	/* start first child */
	retval = PAPI_start( EventSet1 );
	if ( retval != PAPI_OK ) {
		test_fail( __FILE__, __LINE__, "PAPI_start", retval );
	}

	/* start second child */
	retval = PAPI_start( EventSet2 );
	if ( retval != PAPI_OK ) {
		test_fail( __FILE__, __LINE__, "PAPI_start", retval );
	}

	/* Continue each child while counting and wait for its next SIGSTOP. */
	if ( cmpinfo->attach_must_ptrace ) {
	   if ( ptrace( PTRACE_CONT, pid, NULL, NULL ) == -1 ) {
	      /* the failing request here is PTRACE_CONT, so label it as such */
	      perror( "ptrace(PTRACE_CONT)" );
	      return 1;
	   }
	   if ( waitpid( pid, &status, 0 ) == -1 ) {
	      perror( "waitpid()" );
	      exit( 1 );
	   }
	   if ( WIFSTOPPED( status ) == 0 ) {
	      test_fail( __FILE__, __LINE__,
			"Child process didn't return true to WIFSTOPPED", 0 );
	   }
	   if ( WSTOPSIG( status ) != SIGSTOP ) {
	      test_fail( __FILE__, __LINE__,
			"Child process didn't stop on SIGSTOP", 0 );
	   }

	   if ( ptrace( PTRACE_CONT, pid2, NULL, NULL ) == -1 ) {
	       perror( "ptrace(PTRACE_CONT)" );
	       return 1;
	   }
	   if ( waitpid( pid2, &status, 0 ) == -1 ) {
	      perror( "waitpid()" );
	      exit( 1 );
	   }
	   if ( WIFSTOPPED( status ) == 0 ) {
	      test_fail( __FILE__, __LINE__,
			"Child process didn't return true to WIFSTOPPED", 0 );
	   }
	   if ( WSTOPSIG( status ) != SIGSTOP ) {
	      test_fail( __FILE__, __LINE__,
			"Child process didn't stop on SIGSTOP", 0 );
	   }
	}

	elapsed_virt_us = PAPI_get_virt_usec(  ) - elapsed_virt_us;
	elapsed_virt_cyc = PAPI_get_virt_cyc(  ) - elapsed_virt_cyc;
	elapsed_us = PAPI_get_real_usec(  ) - elapsed_us;
	elapsed_cyc = PAPI_get_real_cyc(  ) - elapsed_cyc;

	/* stop first child; an error here is only reported, not fatal */
	retval = PAPI_stop( EventSet1, values[0] );
	if ( retval != PAPI_OK ) {
	   printf( "Warning: PAPI_stop returned error %d, probably ok.\n",
				retval );
	}

	/* stop second child; an error here is only reported, not fatal */
	retval = PAPI_stop( EventSet2, values[1] );
	if ( retval != PAPI_OK ) {
	   printf( "Warning: PAPI_stop returned error %d, probably ok.\n",
				retval );
	}

	remove_test_events( &EventSet1, mask1 );
	remove_test_events( &EventSet2, mask2 );

	/* Let both children run on so that they can exit */
	if ( cmpinfo->attach_must_ptrace ) {
	   if ( ptrace( PTRACE_CONT, pid, NULL, NULL ) == -1 ) {
	      perror( "ptrace(PTRACE_CONT)" );
	      return 1;
	   }
	   if ( ptrace( PTRACE_CONT, pid2, NULL, NULL ) == -1 ) {
	      perror( "ptrace(PTRACE_CONT)" );
	      return 1;
	   }
	}

	if ( waitpid( pid, &status, 0 ) == -1 ) {
	   perror( "waitpid()" );
	   exit( 1 );
	}
	if ( WIFEXITED( status ) == 0 ) {
	   test_fail( __FILE__, __LINE__,
		     "Child process didn't return true to WIFEXITED", 0 );
	}

	if ( waitpid( pid2, &status, 0 ) == -1 ) {
	   perror( "waitpid()" );
	   exit( 1 );
	}
	if ( WIFEXITED( status ) == 0 ) {
		test_fail( __FILE__, __LINE__,
			  "Child process didn't return true to WIFEXITED", 0 );
	}

	printf( "Test case: multiple 3rd party attach start, stop.\n" );
	printf( "-----------------------------------------------\n" );
	tmp = PAPI_get_opt( PAPI_DEFDOM, NULL );
	printf( "Default domain is: %d (%s)\n", tmp, 
		stringify_all_domains( tmp ) );
	tmp = PAPI_get_opt( PAPI_DEFGRN, NULL );
	printf( "Default granularity is: %d (%s)\n", tmp,
			stringify_granularity( tmp ) );
	printf( "Using %d iterations of c += a*b\n", NUM_FLOPS );
	printf( "-------------------------------------------------------------------------\n" );

	/* Row labels are built with snprintf: the PID prefix plus an event  */
	/* name of up to PAPI_MAX_STR_LEN could overrun add_event_str.       */
	snprintf( add_event_str, sizeof( add_event_str ),
		  "(PID %jd) %-12s : \t", ( intmax_t ) pid, event_name );
	printf( TAB1, add_event_str, values[0][1] );
	snprintf( add_event_str, sizeof( add_event_str ),
		  "(PID %jd) PAPI_TOT_CYC : \t", ( intmax_t ) pid );
	printf( TAB1, add_event_str, values[0][0] );
	snprintf( add_event_str, sizeof( add_event_str ),
		  "(PID %jd) %-12s : \t", ( intmax_t ) pid2, event_name );
	printf( TAB1, add_event_str,values[1][1] );
	snprintf( add_event_str, sizeof( add_event_str ),
		  "(PID %jd) PAPI_TOT_CYC : \t", ( intmax_t ) pid2 );
	printf( TAB1, add_event_str, values[1][0] );
	printf( TAB1, "Real usec    : \t", elapsed_us );
	printf( TAB1, "Real cycles  : \t", elapsed_cyc );
	printf( TAB1, "Virt usec    : \t", elapsed_virt_us );
	printf( TAB1, "Virt cycles  : \t", elapsed_virt_cyc );

	printf
		( "-------------------------------------------------------------------------\n" );

	/* pid_t is printed through intmax_t, as in the row labels above */
	printf("Verification: pid %jd results should be twice pid %jd\n",
	       ( intmax_t ) pid2, ( intmax_t ) pid );

	/* Second child over first child, per counter */
	ratio1=(double)values[1][0]/(double)values[0][0];
	ratio2=(double)values[1][1]/(double)values[0][1];

	printf("\t%lld/%lld = %lf\n",values[1][0],values[0][0],ratio1);
	

	if ((ratio1 >2.15 ) || (ratio1 < 1.85)) {
	  printf("Ratio out of range, should be ~2.0 not %lf\n",ratio1);
	  test_fail( __FILE__, __LINE__,
		    "Error: Counter ratio not two", 0 );
	}

	printf("\t%lld/%lld = %lf\n",values[1][1],values[0][1],ratio2);

	/* The second counter is checked against a wider tolerance band */
	if ((ratio2 >2.75 ) || (ratio2 < 1.25)) {
	  printf("Ratio out of range, should be ~2.0, not %lf\n",ratio2);
	  test_fail( __FILE__, __LINE__,
		    "Known issue: Counter ratio not two", 0 );
	}

	test_pass( __FILE__, values, num_tests );
	return 0;
}
Beispiel #14
0
/*
 * Test case: third-party attach, start and stop.
 *
 * Forks a child that runs wait_for_attach_and_loop() before PAPI is
 * initialised in the parent.  The parent then creates an event set bound
 * to component 0, attaches it to the child, adds PAPI_TOT_CYC and (when
 * available) PAPI_FP_INS, waits for the child to report a status change,
 * starts counting, continues the child with ptrace() and waits until it
 * has exited before stopping the counters and printing the results.
 *
 * No numeric verification is done; the test passes once the child has
 * exited with status 0 and the PAPI calls have succeeded.
 */
int
main( int argc, char **argv )
{
    int status, retval, num_tests = 1, tmp;
    int EventSet1 = PAPI_NULL;
    long long **values;
    long long elapsed_us, elapsed_cyc, elapsed_virt_us, elapsed_virt_cyc;
    const PAPI_hw_info_t *hw_info;
    const PAPI_component_info_t *cmpinfo;
    pid_t pid, child;

    /* Fork before doing anything with the PMU */

    setbuf(stdout,NULL);
    pid = fork(  );
    if ( pid < 0 )
        test_fail( __FILE__, __LINE__, "fork()", PAPI_ESYS );
    if ( pid == 0 )
        exit( wait_for_attach_and_loop(  ) );

    tests_quiet( argc, argv );	/* Set TESTS_QUIET variable */


    /* Master only process below here */

    retval = PAPI_library_init( PAPI_VER_CURRENT );
    if ( retval != PAPI_VER_CURRENT )
        test_fail_exit( __FILE__, __LINE__, "PAPI_library_init", retval );

    if ( ( cmpinfo = PAPI_get_component_info( 0 ) ) == NULL )
        test_fail_exit( __FILE__, __LINE__, "PAPI_get_component_info", 0 );

    if ( cmpinfo->attach == 0 )
        test_skip( __FILE__, __LINE__, "Platform does not support attaching",
                   0 );

    hw_info = PAPI_get_hardware_info(  );
    if ( hw_info == NULL )
        test_fail_exit( __FILE__, __LINE__, "PAPI_get_hardware_info", 0 );

    /* Create an empty event set; the events are added after the attach */
    retval = PAPI_create_eventset(&EventSet1);
    if ( retval != PAPI_OK )
        test_fail_exit( __FILE__, __LINE__, "PAPI_create_eventset", retval );

    /* Force addition of component */

    retval = PAPI_assign_eventset_component( EventSet1, 0 );
    if ( retval != PAPI_OK )
        test_fail( __FILE__, __LINE__, "PAPI_assign_eventset_component",
                   retval );

    /* The following call causes this test to fail for perf_events */

    retval = PAPI_attach( EventSet1, ( unsigned long ) pid );
    if ( retval != PAPI_OK )
        test_fail_exit( __FILE__, __LINE__, "PAPI_attach", retval );

    retval = PAPI_add_event(EventSet1, PAPI_TOT_CYC);
    if ( retval != PAPI_OK )
        test_fail_exit( __FILE__, __LINE__, "PAPI_add_event", retval );

    /* A missing PAPI_FP_INS event is only a warning; other errors are fatal */
    retval = PAPI_add_event(EventSet1, PAPI_FP_INS);
    if ( retval == PAPI_ENOEVNT ) {
        test_warn( __FILE__, __LINE__, "PAPI_FP_INS", retval);
    } else if ( retval != PAPI_OK ) {
        test_fail_exit( __FILE__, __LINE__, "PAPI_add_event", retval );
    }

    values = allocate_test_space( 1, 2);

    elapsed_us = PAPI_get_real_usec(  );

    elapsed_cyc = PAPI_get_real_cyc(  );

    elapsed_virt_us = PAPI_get_virt_usec(  );

    elapsed_virt_cyc = PAPI_get_virt_cyc(  );

    printf("must_ptrace is %d\n",cmpinfo->attach_must_ptrace);

    /* Wait for the child's first status change.  A failed wait() leaves */
    /* status unset, so stop here rather than inspect it.                */
    child = wait( &status );
    if ( child == -1 )
        test_fail_exit( __FILE__, __LINE__, "wait()", PAPI_ESYS );
    printf( "Debugger exited wait() with %d\n",child );
    if (WIFSTOPPED( status ))
    {
        printf( "Child has stopped due to signal %d (%s)\n",
                WSTOPSIG( status ), strsignal(WSTOPSIG( status )) );
    }
    if (WIFSIGNALED( status ))
    {
        printf( "Child %ld received signal %d (%s)\n",
                (long)child,
                WTERMSIG(status) , strsignal(WTERMSIG( status )) );
    }
    printf("After %d\n",retval);

    retval = PAPI_start( EventSet1 );
    if ( retval != PAPI_OK )
        test_fail_exit( __FILE__, __LINE__, "PAPI_start", retval );

    printf("Continuing\n");
    if ( ptrace( PTRACE_CONT, pid, NULL, NULL ) == -1 ) {
        perror( "ptrace(PTRACE_CONT)" );
        return 1;
    }


    /* Report every status change until the child has exited.  Without  */
    /* the -1 check a failing wait() would leave status unchanged and   */
    /* this loop could never terminate.                                 */
    do {
        child = wait( &status );
        if ( child == -1 )
            test_fail_exit( __FILE__, __LINE__, "wait()", PAPI_ESYS );
        printf( "Debugger exited wait() with %d\n", child);
        if (WIFSTOPPED( status ))
        {
            printf( "Child has stopped due to signal %d (%s)\n",
                    WSTOPSIG( status ), strsignal(WSTOPSIG( status )) );
        }
        if (WIFSIGNALED( status ))
        {
            printf( "Child %ld received signal %d (%s)\n",
                    (long)child,
                    WTERMSIG(status) , strsignal(WTERMSIG( status )) );
        }
    } while (!WIFEXITED( status ));

    printf("Child exited with value %d\n",WEXITSTATUS(status));
    if (WEXITSTATUS(status) != 0)
        test_fail_exit( __FILE__, __LINE__, "Exit status of child to attach to", PAPI_EMISC);

    retval = PAPI_stop( EventSet1, values[0] );
    if ( retval != PAPI_OK )
        test_fail_exit( __FILE__, __LINE__, "PAPI_stop", retval );

    elapsed_virt_us = PAPI_get_virt_usec(  ) - elapsed_virt_us;

    elapsed_virt_cyc = PAPI_get_virt_cyc(  ) - elapsed_virt_cyc;

    elapsed_us = PAPI_get_real_usec(  ) - elapsed_us;

    elapsed_cyc = PAPI_get_real_cyc(  ) - elapsed_cyc;

    retval = PAPI_cleanup_eventset(EventSet1);
    if (retval != PAPI_OK)
        test_fail_exit( __FILE__, __LINE__, "PAPI_cleanup_eventset", retval );

    retval = PAPI_destroy_eventset(&EventSet1);
    if (retval != PAPI_OK)
        test_fail_exit( __FILE__, __LINE__, "PAPI_destroy_eventset", retval );

    printf( "Test case: 3rd party attach start, stop.\n" );
    printf( "-----------------------------------------------\n" );
    tmp = PAPI_get_opt( PAPI_DEFDOM, NULL );
    printf( "Default domain is: %d (%s)\n", tmp, stringify_all_domains( tmp ) );
    tmp = PAPI_get_opt( PAPI_DEFGRN, NULL );
    printf( "Default granularity is: %d (%s)\n", tmp,
            stringify_granularity( tmp ) );
    printf( "Using %d iterations of c += a*b\n", NUM_FLOPS );
    printf
    ( "-------------------------------------------------------------------------\n" );

    printf( "Test type    : \t           1\n" );

    /* values[0] holds the counts in the order the events were added */
    printf( TAB1, "PAPI_TOT_CYC : \t", ( values[0] )[0] );
    printf( TAB1, "PAPI_FP_INS  : \t", ( values[0] )[1] );
    printf( TAB1, "Real usec    : \t", elapsed_us );
    printf( TAB1, "Real cycles  : \t", elapsed_cyc );
    printf( TAB1, "Virt usec    : \t", elapsed_virt_us );
    printf( TAB1, "Virt cycles  : \t", elapsed_virt_cyc );

    printf
    ( "-------------------------------------------------------------------------\n" );

    printf( "Verification: none\n" );

    test_pass( __FILE__, values, num_tests );
    exit( 1 );
}
Beispiel #15
0
/*
 * Set up and start the PAPI measurements for the thread slot `tid`.
 *
 * Steps, in order:
 *   1. reserve one slot index in thr_vars[tid] for every counter enabled
 *      through the MEASURE_* compile-time switches;
 *   2. allocate the event-code scratch array and the per-thread value buffer;
 *   3. (ARMv7-A builds only) pin the process to core 0;
 *   4. open the output file OUTFILEID<tid>OUTFILEEXT for writing;
 *   5. estimate the clock rate by counting real cycles across sleep(1);
 *   6. create the event set, translate the configured event names to codes,
 *      optionally enumerate and add every event of the "rapl" component,
 *      and start counting.
 *
 * tid: index into the global thr_vars[] array; it is not range-checked here.
 *
 * PAPI failures are reported through papi_fail(); allocation failures and a
 * failure to open the output file terminate the process.
 *
 * NOTE(review): num_events and num_energy_events are only incremented here,
 * never reset, so this presumably has to run once per tid -- confirm against
 * the callers. The capacity of event_names[]/units[]/data_type[] is not
 * visible from this function either.
 */
void PAPI_HW_COUNTER_open(int tid){
    int *Events = NULL;     /* event codes, in the same order as the reserved slots */
    int EventCode;
    int event_ctr = 0;      /* next free position in Events */
    int retval;
  #ifdef MEASURE_TIME
  #endif

    /* Reserve one value slot per enabled counter. */
  #ifdef MEASURE_CPI
    thr_vars[tid].papi_idx_inst = thr_vars[tid].num_events++;
    thr_vars[tid].papi_idx_cyc = thr_vars[tid].num_events++;
  #endif

  #ifdef MEASURE_MEMACC
    thr_vars[tid].papi_idx_load = thr_vars[tid].num_events++;
    thr_vars[tid].papi_idx_store = thr_vars[tid].num_events++;
  #endif

  #ifdef MEASURE_LLCMISS
    thr_vars[tid].papi_idx_llcmiss = thr_vars[tid].num_events++;
  #endif

  #ifdef MEASURE_ICACHEMISS
    thr_vars[tid].papi_idx_icachemiss = thr_vars[tid].num_events++;
  #endif

  #ifdef MEASURE_DCACHEMISS
    thr_vars[tid].papi_idx_l1dcm = thr_vars[tid].num_events++;
    thr_vars[tid].papi_idx_l1dca = thr_vars[tid].num_events++;
  #endif

  #ifdef MEASURE_ENERGY
  #endif

    /*
     * malloc(0) may legitimately return NULL, so a NULL result only counts
     * as a failure when at least one event was reserved.
     */
    Events = malloc(sizeof(int) * thr_vars[tid].num_events);
    thr_vars[tid].values = malloc(sizeof(long long) * thr_vars[tid].num_events);
    if (thr_vars[tid].num_events > 0 &&
        (Events == NULL || thr_vars[tid].values == NULL)) {
        fprintf(stderr, "ERROR: Failed to allocate memory for Events.\n");
        exit(EXIT_FAILURE);
    }

  #ifdef __ARM_ARCH_7A__
    // pin processor only on arm arch.
    pid_t pid = getpid();
    int core = 0;
    printf("Pinning thread %d to cores %d..%d\n", (int)pid, 0, 0);
    printf("Observe in terminal via \"ps -p <PID> -L -o pid,tid,psr\"\n");
    pin_cpu(pid, core);
    printf("Pinned to core %d\n", core);
  #endif

    // Open the per-thread output file; the name is sized exactly so that
    // no tid value is truncated.
    int name_len = snprintf(NULL, 0, "%s%d%s", OUTFILEID, tid, OUTFILEEXT);
    if (name_len < 0) {
        fprintf(stderr, "ERROR: Failed to format the output file name.\n");
        exit(EXIT_FAILURE);
    }
    char *filename_w_id = malloc((size_t)name_len + 1);
    if (filename_w_id == NULL) {
        fprintf(stderr, "ERROR: Failed to allocate memory for the output file name.\n");
        exit(EXIT_FAILURE);
    }
    snprintf(filename_w_id, (size_t)name_len + 1, "%s%d%s", OUTFILEID, tid, OUTFILEEXT);

    thr_vars[tid].f=fopen(filename_w_id, "w");
    if (thr_vars[tid].f == NULL){
        printf("failed to open file %s.\n", filename_w_id);
        free(filename_w_id);
        exit(1);
    }
    free(filename_w_id);    /* only needed for fopen() and its error message */

    // Measure clock frequency: real cycles elapsed during a one second sleep.
    long long elapsed_cyc;
    elapsed_cyc = PAPI_get_real_cyc();
    sleep(1);
    elapsed_cyc = PAPI_get_real_cyc()-elapsed_cyc;
    thr_vars[tid].PAPI_CLOCK_RATE = elapsed_cyc;
    printf("Measured clock frequency: %lld Hz\n",thr_vars[tid].PAPI_CLOCK_RATE);

    // Set EventSet
    thr_vars[tid].EventSet = PAPI_NULL;/*EventSet*/
    retval=PAPI_create_eventset(&(thr_vars[tid].EventSet));
    if (retval != PAPI_OK){
        papi_fail(__FILE__, __LINE__, "PAPI_create_eventset()", retval);
    }


  #ifdef MEASURE_TIME
  #endif

    /* Translate the configured event names; order must match the slots above. */
  #ifdef MEASURE_CPI
    retval = PAPI_event_name_to_code( PAPI_INST , &EventCode );
    if (retval != PAPI_OK ) {
        papi_fail(__FILE__, __LINE__,
                "PAPI_event_name_to_code, inst", retval);
    }
    Events[event_ctr++] = EventCode;

    retval = PAPI_event_name_to_code( PAPI_CYC , &EventCode );
    if (retval != PAPI_OK ) {
        papi_fail(__FILE__, __LINE__,
                "PAPI_event_name_to_code, cyc", retval);
    }
    Events[event_ctr++] = EventCode;
  #endif

  #ifdef MEASURE_MEMACC
    retval = PAPI_event_name_to_code( PAPI_MEM_LOAD , &EventCode );
    if (retval != PAPI_OK ) {
        papi_fail(__FILE__, __LINE__,
                "PAPI_event_name_to_code, loads", retval);
    }
    Events[event_ctr++] = EventCode;

    retval=PAPI_event_name_to_code( PAPI_MEM_STORE , &EventCode );
    if (retval != PAPI_OK ) {
        papi_fail(__FILE__, __LINE__,
                "PAPI_event_name_to_code, stores", retval);
    }
    Events[event_ctr++] = EventCode;
  #endif

  #ifdef MEASURE_LLCMISS
    retval = PAPI_event_name_to_code( PAPI_LLC_MISS , &EventCode );
    if (retval != PAPI_OK ) {
        papi_fail(__FILE__, __LINE__,
                "PAPI_event_name_to_code, llc miss", retval);
    }
    Events[event_ctr++] = EventCode;
  #endif

  #ifdef MEASURE_ICACHEMISS
    retval = PAPI_event_name_to_code( PAPI_IC_MISS , &EventCode );
    if (retval != PAPI_OK ) {
        papi_fail(__FILE__, __LINE__,
                "PAPI_event_name_to_code, icache miss", retval);
    }
    Events[event_ctr++] = EventCode;
  #endif

  #ifdef MEASURE_DCACHEMISS
    retval = PAPI_event_name_to_code( PAPI_L1_DC_MISS , &EventCode );
    if (retval != PAPI_OK ) {
        papi_fail(__FILE__, __LINE__,
                "PAPI_event_name_to_code, Level 1 data cache misses", retval);
    }
    Events[event_ctr++] = EventCode;

    retval = PAPI_event_name_to_code( PAPI_L1_DC_ACCESS, &EventCode );
    if (retval != PAPI_OK ) {
        papi_fail(__FILE__, __LINE__,
                "PAPI_event_name_to_code, Level 1 data cache accesses", retval);
    }
    Events[event_ctr++] = EventCode;
  #endif

  #ifdef MEASURE_ENERGY
    printf("Probing all RAPL events\n");

    thr_vars[tid].numcmp = PAPI_num_components();

    /* Look for the component whose name contains "rapl". */
    for(thr_vars[tid].cid=0; thr_vars[tid].cid<thr_vars[tid].numcmp; thr_vars[tid].cid++) {
        if ( (thr_vars[tid].cmpinfo = PAPI_get_component_info(thr_vars[tid].cid)) == NULL) {
            papi_fail(__FILE__, __LINE__,"PAPI_get_component_info failed\n", 0);
        }
        if (strstr(thr_vars[tid].cmpinfo->name,"rapl")) {
            thr_vars[tid].rapl_cid=thr_vars[tid].cid;
            printf("Found rapl component at cid %d.\n",thr_vars[tid].rapl_cid);
            if (thr_vars[tid].cmpinfo->disabled) {
                printf("RAPL component disabled: %s\n",
                        thr_vars[tid].cmpinfo->disabled_reason);
                exit(EXIT_FAILURE);
            }
            break;
        }
    }

    if (thr_vars[tid].cid==thr_vars[tid].numcmp) {
        // Component not found:
        papi_fail(__FILE__,__LINE__,"No rapl component found\n",0);
    }

    /* Start from PAPI_NULL, exactly as done for the main event set above. */
    thr_vars[tid].EnergyEventSet = PAPI_NULL;
    retval = PAPI_create_eventset( &(thr_vars[tid].EnergyEventSet) );
    if (retval != PAPI_OK){
        papi_fail(__FILE__,__LINE__, "PAPI_create_eventset()", retval);
    }

    // Add all events of the rapl component, recording name, units and
    // data type of each one.
    int r;
    thr_vars[tid].code = PAPI_NATIVE_MASK;
    r = PAPI_enum_cmp_event( &(thr_vars[tid].code), PAPI_ENUM_FIRST, thr_vars[tid].rapl_cid );
    while ( r == PAPI_OK ) {
        retval = PAPI_event_code_to_name( thr_vars[tid].code, thr_vars[tid].event_names[thr_vars[tid].num_energy_events] );
        if ( retval != PAPI_OK ) {
            printf("Error translating %#x\n",thr_vars[tid].code);
            papi_fail(__FILE__, __LINE__,
                      "PAPI_event_code_to_name", retval );
        }

        printf("Found event: %s\n", thr_vars[tid].event_names[thr_vars[tid].num_energy_events]);

        retval = PAPI_get_event_info(thr_vars[tid].code,&(thr_vars[tid].evinfo));
        if (retval != PAPI_OK) {
            papi_fail(__FILE__, __LINE__,
                      "Error getting event info\n",retval);
        }

        /* strncpy does not terminate on truncation, so terminate explicitly. */
        strncpy(thr_vars[tid].units[thr_vars[tid].num_energy_events],thr_vars[tid].evinfo.units,PAPI_MIN_STR_LEN);
        thr_vars[tid].units[thr_vars[tid].num_energy_events][PAPI_MIN_STR_LEN-1] = '\0';
        thr_vars[tid].data_type[thr_vars[tid].num_energy_events] = thr_vars[tid].evinfo.data_type;

        retval = PAPI_add_event(thr_vars[tid].EnergyEventSet, thr_vars[tid].code);
        if (retval != PAPI_OK ) {
            papi_fail( __FILE__, __LINE__, "PAPI_add_event()", retval);
        }

        r = PAPI_enum_cmp_event( &(thr_vars[tid].code), PAPI_ENUM_EVENTS, thr_vars[tid].rapl_cid );
        thr_vars[tid].num_energy_events++;
    }

    thr_vars[tid].energy_values = malloc(sizeof(long long)*thr_vars[tid].num_energy_events);
    if (thr_vars[tid].num_energy_events > 0 && thr_vars[tid].energy_values == NULL){
        fprintf(stderr, "ERROR: Failed to allocate memory for energy values.\n");
        exit(EXIT_FAILURE);
    }

  #endif

  #ifdef MEASURE_HW_COUNTER
    int k;
    for(k = 0; k < thr_vars[tid].num_events; k++){
        retval = PAPI_add_event(thr_vars[tid].EventSet, Events[k]);
        if (retval != PAPI_OK ) {
           printf("At event %d:\n",k);
           papi_fail( __FILE__, __LINE__, "PAPI_add_event()", retval);
        }
    }

    retval=PAPI_start(thr_vars[tid].EventSet);
    if (retval != PAPI_OK){
        papi_fail(__FILE__, __LINE__, "PAPI_start()", retval);
    }
  #endif

  #ifdef MEASURE_ENERGY
    retval=PAPI_start(thr_vars[tid].EnergyEventSet);
    if (retval != PAPI_OK){
        papi_fail(__FILE__, __LINE__, "PAPI_start() on energy", retval);
    }
  #endif

    /* The codes have been handed to PAPI; the scratch array is no longer needed. */
    free(Events);
}
Beispiel #16
0
int main(int argc, char *argv[]) {


	float rtime1, rtime2, ptime1, ptime2, mflops;
	long long flpops;

	unsigned long int tid;
	int num_hwcntrs = 0;
	int fip = 0, retval;
	float real_time, proc_time;
	long long flpins;

	int i;
	unsigned int EventSet = PAPI_NULL; 
    int count = 0, err_count = 0;


    PAPI_event_info_t info;

    long long ( values2[2] )[2];
    long long min, max;
    int PAPI_event, mythreshold = THRESHOLD;
    char event_name1[PAPI_MAX_STR_LEN];
    const PAPI_hw_info_t *hw_info = NULL;
    int num_events, mask;
    int num_flops = NUM_FLOPS;
    long long elapsed_us, elapsed_cyc;



tests_quiet( argc, argv );  /* Set TESTS_QUIET variable */



    retval = PAPI_library_init( PAPI_VER_CURRENT );
    if ( retval != PAPI_VER_CURRENT )
      test_fail( __FILE__, __LINE__, "PAPI_library_init", retval );

  retval = PAPI_create_eventset( &EventSet );
  if ( retval != PAPI_OK )
      test_fail( __FILE__, __LINE__, "PAPI_create_eventset", retval );

	/* Get hardware info */
  hw_info = PAPI_get_hardware_info(  );
  if ( hw_info == NULL )
      test_fail( __FILE__, __LINE__, "PAPI_get_hardware_info", 2 );

  EventSet = 	add_two_nonderived_events( &num_events, &PAPI_event, &mask );

  printf("Using %#x for the overflow event\n",PAPI_event);

  if ( PAPI_event == PAPI_FP_INS ) {
      mythreshold = THRESHOLD;
  }
  else {
		#if defined(linux)
      mythreshold = ( int ) hw_info->cpu_max_mhz * 20000;
		#else
      mythreshold = THRESHOLD * 2;
		#endif
  }

  retval = PAPI_start( EventSet );
  if ( retval != PAPI_OK )
      test_fail( __FILE__, __LINE__, "PAPI_start", retval );

  do_flops( NUM_FLOPS );

	/* stop the calibration run */
  retval = PAPI_stop( EventSet, values2[0] );
  if ( retval != PAPI_OK )
      test_fail( __FILE__, __LINE__, "PAPI_stop", retval );


	/* set up overflow handler */
  retval = PAPI_overflow( EventSet, PAPI_event, mythreshold, 0, handler );
  if ( retval != PAPI_OK ) {
      test_fail( __FILE__, __LINE__, "PAPI_overflow", retval );
  }

	/* Start overflow run */
  retval = PAPI_start( EventSet );
  if ( retval != PAPI_OK ) {
      test_fail( __FILE__, __LINE__, "PAPI_start", retval );
  }

  do_flops( num_flops );

	/* stop overflow run */
  retval = PAPI_stop( EventSet, values2[1] );
  if ( retval != PAPI_OK )
      test_fail( __FILE__, __LINE__, "PAPI_stop", retval );
  retval = PAPI_overflow( EventSet, PAPI_event, 0, 0, handler );
  if ( retval != PAPI_OK )
      test_fail( __FILE__, __LINE__, "PAPI_overflow", retval );

  if ( !TESTS_QUIET ) {
      if ( ( retval =
         PAPI_event_code_to_name( PAPI_event, event_name1 ) ) != PAPI_OK )
         test_fail( __FILE__, __LINE__, "PAPI_event_code_to_name", retval );

     printf( "Test case: Overflow dispatch of 2nd event in set with 2 events.\n" );
     printf( "---------------------------------------------------------------\n" );
     printf( "Threshold for overflow is: %d\n", mythreshold );
     printf( "Using %d iterations\n", num_flops );
     printf( "-----------------------------------------------\n" );

     printf( "Test type    : %16d%16d\n", 1, 2 );
     printf( OUT_FMT, event_name1, ( values2[0] )[1], ( values2[1] )[1] );
     printf( OUT_FMT, "PAPI_TOT_CYC", ( values2[0] )[0], ( values2[1] )[0] );
     printf( "Overflows    : %16s%16d\n", "", total );
     printf( "-----------------------------------------------\n" );
 }

 retval = PAPI_cleanup_eventset( EventSet );
 if ( retval != PAPI_OK )
  test_fail( __FILE__, __LINE__, "PAPI_cleanup_eventset", retval );

retval = PAPI_destroy_eventset( &EventSet );
if ( retval != PAPI_OK )
  test_fail( __FILE__, __LINE__, "PAPI_destroy_eventset", retval );

if ( !TESTS_QUIET ) {
  printf( "Verification:\n" );
#if defined(linux) || defined(__ia64__) || defined(_POWER4)
  num_flops *= 2;
#endif
  if ( PAPI_event == PAPI_FP_INS || PAPI_event == PAPI_FP_OPS ) {
     printf( "Row 1 approximately equals %d %d\n", num_flops, num_flops );
 }
 printf( "Column 1 approximately equals column 2\n" );
 printf( "Row 3 approximately equals %u +- %u %%\n",( unsigned ) ( ( values2[0] )[1] / ( long long ) mythreshold ),( unsigned ) ( OVR_TOLERANCE * 100.0 ) );
}

min =
( long long ) ( ( ( double ) values2[0][1] * ( 1.0 - OVR_TOLERANCE ) ) /
  ( double ) mythreshold );
max =
( long long ) ( ( ( double ) values2[0][1] * ( 1.0 + OVR_TOLERANCE ) ) /
  ( double ) mythreshold );
printf( "Overflows: total(%d) > max(%lld) || total(%d) < min(%lld) \n", total,
  max, total, min );
if ( total > max || total < min )
  test_fail( __FILE__, __LINE__, "Overflows", 1 );



printf("Initial thread id is: %lu\n",tid);

	/* Initialize the PAPI library and get the number of counters available */

if ((num_hwcntrs = PAPI_num_counters()) <= 0)  
  handle_error(1);



  /*  The installation supports PAPI, but has no counters */
if ((num_hwcntrs = PAPI_num_counters()) == 0 )
    fprintf(stderr,"Info:: This machine does not provide hardware counters.");

printf("This system has %d available counters.\n", num_hwcntrs);

if (num_hwcntrs > 2)
  num_hwcntrs = 2;

	 /* Start counting events */




if (PAPI_start_counters(Events, num_hwcntrs) != PAPI_OK)
  handle_error(1);

if (argc != 8) {
  printf("\nError :: Ejecutar como : a.out archivo_BD Num_elem archivo_queries Num_queries N_THREADS numero_K Dimension_objetos\n");
  return 0;
}
TOPK = atoi(argv[6]);
DIM = atoi(argv[7]);
double **DB;
	double **Consultas; //Cola de consultas
	int N_QUERIES, N_DB;
	char str_f[256];
	double dato[DIM];
	int j;
	FILE *f_dist, *fquery;
	Elem *heap, e_temp,*answer;
	int *acum, N_THREADS;


	//N_THREADS es el nro. de threads con el que se lanzará la región paralela
	N_THREADS = atoi(argv[5]);
	//N_QUERIES es el nro. de consultas
	N_QUERIES = atoi(argv[4]);
	N_DB = atoi(argv[2]);

	printf("\nN_QUERIES = %d\nN_THREADS = %d\n", N_QUERIES, N_THREADS);
	fflush(stdout);

	acum = (int *) malloc(sizeof (int)*N_THREADS);
	for (i = 0; i < N_THREADS; i++)
		acum[i] = 0;

	sprintf(str_f, "%s", argv[1]);
	printf("\nAbriendo %s... ", argv[1]);
	fflush(stdout);
	f_dist = fopen(str_f, "r");
	printf("OK\n");
	fflush(stdout);


	Consultas = (double **) malloc(sizeof (double *)*N_QUERIES);
	for (i = 0; i < N_QUERIES; i++)
		Consultas[i] = (double *) malloc(sizeof (double)*DIM);

	DB = (double **) malloc(sizeof (double *)*N_DB);
	for (i = 0; i < N_DB; i++)
		DB[i] = (double *) malloc(sizeof (double)*DIM);

	answer = (Elem *)malloc(sizeof(Elem)*N_QUERIES*TOPK);

	printf("\nCargando DB... ");
	fflush(stdout);
	for (i = 0; i < N_DB; i++) {
		//Usar leedato_cophir() cuando se utilice la BD Cophir para no tener problemas con las ","
		//if (leedato_cophir(dato, f_dist) == ERROR || feof(f_dist))
		if (leedato(dato, f_dist) == ERROR || feof(f_dist)) {
			printf("\n\nERROR :: N_DB mal establecido\n\n");
			fflush(stdout);
			fclose(f_dist);
			break;
		}
		copiavalor(DB[i], dato);
	}
	fclose(f_dist);
	printf("OK\n");
	fflush(stdout);

	if ((fquery = fopen(argv[3], "r")) == NULL)
		printf("Error al abrir para lectura el archivo de qeuries: %s\n", argv[3]);
	else
		printf("Abriendo  para lectura %s\n", argv[3]);
	printf("\nCargando Consultas... ");
	fflush(stdout);
	for (i = 0; i < N_QUERIES; i++) {
		//Usar leedato_cophir() cuando se utilice la BD Cophir para no tener problemas con las ","
		//if (leedato_cophir(dato, fquery) == ERROR || feof(fquery))
		if (leedato(dato, fquery) == ERROR || feof(fquery)) {
			printf("\n\nERROR :: N_QUERIES mal establecido, Menos queries que las indicadas\n\n");
			fflush(stdout);
			fclose(fquery);
			break;
		}
		copiavalor(Consultas[i], dato);
	}
	fclose(fquery);
	printf("OK\n");
	fflush(stdout);

	PAPI_start_counters((int*) Events, NUM_EVENTS);
	omp_set_num_threads(N_THREADS);

	elapsed_us = PAPI_get_real_usec(  );

	elapsed_cyc = PAPI_get_real_cyc(  );

	retval =
	PAPI_thread_init( ( unsigned
		long ( * )( void ) ) ( omp_get_thread_num ) );
	if ( retval != PAPI_OK ) {
		if ( retval == PAPI_ECMP )
			test_skip( __FILE__, __LINE__, "PAPI_thread_init", retval );
		else
			test_fail( __FILE__, __LINE__, "PAPI_thread_init", retval );
	}

#pragma omp parallel shared(Consultas, DB, N_QUERIES, N_DB, N_THREADS, acum, DIM)
	{
		float real_time;
		struct timeval t1, t2;
		int i, j;
		Elem *heap, e_temp;
		double d;
		int n_elem = 0;
		int trid = omp_get_thread_num(); //ID del thread
		int procs = omp_get_num_threads(); //Nro. total de threads
		double suma = 0;

		suma = 0;
		heap = (Elem *) malloc(sizeof (Elem) * TOPK);

#pragma omp barrier

#pragma omp master
		{
			gettimeofday(&t1, 0);
		}

		//Cada hilo accede a un subconjunto de las consultas. Cada hio accede de manera circular al arreglo de consultas.
        for (i = trid; i < N_QUERIES; i += procs) {
         n_elem = 0;
         for (j = 0; j < N_DB; j++) {

            d = distancia(Consultas[i], DB[j]);
				//Si la distancia del objeto a la consulta es menor que la raíz del heap, entonces se inserta en el heap. La raíz siempre mantiene la mayor de las distancias

            if(n_elem<TOPK){
               e_temp.dist = d;
               e_temp.ind = j;
               inserta2(heap, &e_temp, &n_elem);
           }
           if (n_elem==TOPK){
               if (d < topH(heap, &n_elem)) {
                  e_temp.dist = d;
                  e_temp.ind = j;
					//Si el heap no está lleno, se inserta el elemento
                  if (n_elem < TOPK)
                     inserta2(heap, &e_temp, &n_elem);
						//Si el heap está lleno, se inserta el elemento nuevo y se saca el que era antes de mayor de distancia. popush2() hace las operaciones de sacar el elemento mayor e insertar el nuevo.
                 else
                     popush2(heap, &n_elem, &e_temp);
             }}
         }

			//En este punto del código se tienen los K elemntos más cercanos a la consulta en 'heap'. Se pueden extraer con extraer2()
         for (j = 0; j < TOPK ; j++) {
           extrae2(heap, &n_elem, &e_temp);
           answer[i*TOPK+j].ind = e_temp.ind;
           answer[i*TOPK+j].dist = e_temp.dist;
       }
			//Realizamos una operación con los resultados para que el compilador no evite hacer instrucciones que considere que el usuario no utiliza. Simplemente cada hilo suma las distancias de los elementos mas cercanos a la consulta 
   }
   Thread( 1000000 * ( tid + 1 ) );



   fflush(stdout);

#pragma omp barrier

#pragma omp master
   {   

    if ( fip > 0 ) {
		/* Setup PAPI library and begin collecting data from the counters */
       if ( fip == 1 ) {
          if ( ( retval =
             PAPI_flips( &real_time, &proc_time, &flpins,
                &mflops ) ) < PAPI_OK )
             test_fail( __FILE__, __LINE__, "PAPI_flips", retval );
     } else {
      if ( ( retval =
         PAPI_flops( &real_time, &proc_time, &flpins,
            &mflops ) ) < PAPI_OK )
         test_fail( __FILE__, __LINE__, "PAPI_flops", retval );
 }

 gettimeofday(&t2, 0);
 real_time = (t2.tv_sec - t1.tv_sec) + (float) (t2.tv_usec - t1.tv_usec) / 1000000;

 Salida_Multihilo = fopen("Salida_Multihilo.txt", "w");
 for (i = 0; i < N_QUERIES; ++i){
  fprintf(Salida_Multihilo, "Consulta id:: %d\n",i);
  for (j = 0; j < TOPK; ++j){
     fprintf(Salida_Multihilo,"ind = %d :: dist = %f\n",answer[(i*TOPK)+j].ind,answer[(i*TOPK)+j].dist);
 }
 fprintf(Salida_Multihilo, "---------------------------------\n");
}
fclose(Salida_Multihilo);

printf("\n\nK = %d", TOPK);
printf("\nReal Time = %f segundos.\n", real_time);
fflush(stdout);


if ( fip == 1 ) {
  if ( ( retval =
     PAPI_flips( &real_time, &proc_time, &flpins,
        &mflops ) ) < PAPI_OK )
     test_fail( __FILE__, __LINE__, "PAPI_flips", retval );
} else {
  if ( ( retval =
     PAPI_flops( &real_time, &proc_time, &flpins,
        &mflops ) ) < PAPI_OK )
     test_fail( __FILE__, __LINE__, "PAPI_flops", retval );
}

if ( !TESTS_QUIET ) {
  if ( fip == 1 ) {
     printf( "Real_time: %f Proc_time: %f Total flpins: ", real_time,
        proc_time );
 } else {
     printf( "Real_time: %f Proc_time: %f Total flpops: ", real_time,
        proc_time );
 }
 printf( LLDFMT, flpins );
 printf( " MFLOPS: %f\n", mflops );
}
}

}
free(heap);



	}//end pragma omp parallel

	elapsed_cyc = PAPI_get_real_cyc(  ) - elapsed_cyc;
	elapsed_us = PAPI_get_real_usec(  ) - elapsed_us;

	if ( !TESTS_QUIET ) {
		printf( "Master real usec   : \t%lld\n", elapsed_us );
		printf( "Master real cycles : \t%lld\n", elapsed_cyc );
	}

	const PAPI_hw_info_t *hwinfo = NULL;
	const PAPI_mh_tlb_info_t *mhinfo = NULL;
	const  PAPI_mh_cache_info_t *mhcacheinfo = NULL;
	const PAPI_mh_level_t *mhlevel = NULL;


	if (PAPI_library_init(PAPI_VER_CURRENT) != PAPI_VER_CURRENT)
		exit(1);
	if ((hwinfo = PAPI_get_hardware_info()) == NULL)
		exit(1);	
	if ((mhinfo = PAPI_get_hardware_info()) == NULL)
		exit(1);
	if ((mhcacheinfo = PAPI_get_hardware_info()) == NULL)
		exit(1);
	if ((mhlevel = PAPI_get_hardware_info()) == NULL)
		exit(1);

	printf("\n\nA continuación información actual del equipo\n\n");

	printf("MH Type %d - Num entries %d  - Associativity %d \n",mhinfo->type, mhinfo->num_entries, mhinfo->associativity);
	printf("Cache MH type %d size %d line size %d num_lines %d Associativity %d\n\n",mhcacheinfo->type, mhcacheinfo->size,mhcacheinfo->line_size, mhcacheinfo->num_lines, mhcacheinfo->associativity);



    retval=papi_print_header("Available PAPI preset and user defined events plus hardware information.\n",&hwinfo );


    printf("Total hardware flops = %lld\n",(float)values[1]);
    printf("L2 data cache misses is %lld\n", values[0]);






    retval = PAPI_stop_counters(values, NUM_EVENTS);
    return 0;
}
Beispiel #17
0
/*
 * Sample one PAPI timer NUM_ITERS times back to back and report statistics
 * on the differences between consecutive samples.
 *
 * flag selects the timer: 0 = PAPI_get_real_cyc, 1 = PAPI_get_real_usec,
 * 2 = PAPI_get_virt_cyc, 3 = PAPI_get_virt_usec. Any other value is reported
 * through test_fail() and nothing is measured.
 *
 * Does nothing when CLOCK_ERROR is already set. Sets CLOCK_ERROR and reports
 * a failure if a sample is smaller than its predecessor.
 */
void
clock_res_check( int flag )
{
	if ( CLOCK_ERROR )
		return;

	long long *elapsed_cyc, total_cyc = 0, uniq_cyc = 0, diff_cyc = 0;
	int i;
	double min, max, average, std, tmp;

	elapsed_cyc = ( long long * ) calloc( NUM_ITERS, sizeof ( long long ) );
	if ( elapsed_cyc == NULL ) {
		test_fail( __FILE__, __LINE__, "calloc", -1 );
		return;
	}

	/* Collect the raw samples in a tight loop. */
	switch ( flag ) {
	case 0:
		for ( i = 0; i < NUM_ITERS; i++ )
			elapsed_cyc[i] = ( long long ) PAPI_get_real_cyc(  );
		break;
	case 1:
		for ( i = 0; i < NUM_ITERS; i++ )
			elapsed_cyc[i] = ( long long ) PAPI_get_real_usec(  );
		break;
	case 2:
		for ( i = 0; i < NUM_ITERS; i++ )
			elapsed_cyc[i] = ( long long ) PAPI_get_virt_cyc(  );
		break;
	case 3:
		for ( i = 0; i < NUM_ITERS; i++ )
			elapsed_cyc[i] = ( long long ) PAPI_get_virt_usec(  );
		break;
	default:
		test_fail( __FILE__, __LINE__, "clock_res_check", -1 );
		/* Should test_fail() return, stop here: func_name[flag] below
		   would otherwise be indexed with an unsupported flag. */
		free( elapsed_cyc );
		return;
	}

	min = max = ( double ) ( elapsed_cyc[1] - elapsed_cyc[0] );

	for ( i = 1; i < NUM_ITERS; i++ ) {
		if ( elapsed_cyc[i] - elapsed_cyc[i - 1] < 0 ) {
			CLOCK_ERROR = 1;
			test_fail( __FILE__, __LINE__, "Negative elapsed time", -1 );
			free( elapsed_cyc );
			return;
		}

		diff_cyc = elapsed_cyc[i] - elapsed_cyc[i - 1];
		if ( min > diff_cyc )
			min = ( double ) diff_cyc;
		if ( max < diff_cyc )
			max = ( double ) diff_cyc;
		/* uniq_cyc counts the steps where the timer actually advanced */
		if ( diff_cyc != 0 )
			uniq_cyc++;
		total_cyc += diff_cyc;
	}

	average = ( double ) total_cyc / ( NUM_ITERS - 1 );
	std = 0;

	for ( i = 1; i < NUM_ITERS; i++ ) {
		tmp = ( double ) ( elapsed_cyc[i] - elapsed_cyc[i - 1] );
		tmp = tmp - average;
		std += tmp * tmp;
	}

	/* NUM_ITERS - 1 differences, hence NUM_ITERS - 2 in the sample deviation */
	std = sqrt( std / ( NUM_ITERS - 2 ) );
	printf( "%s: min %.3lf  max %.3lf \n", func_name[flag], min, max );
	printf( "                   average %.3lf std %.3lf\n", average, std );

	if ( !TESTS_QUIET ) {
		if ( uniq_cyc == NUM_ITERS - 1 ) {
			/* every sample differed from its predecessor */
			printf( "%s : %7.3f   <%7.3f\n", func_name[flag],
					( double ) total_cyc / ( double ) ( NUM_ITERS ),
					( double ) total_cyc / ( double ) uniq_cyc );
		} else if ( uniq_cyc ) {
			printf( "%s : %7.3f    %7.3f\n", func_name[flag],
					( double ) total_cyc / ( double ) ( NUM_ITERS ),
					( double ) total_cyc / ( double ) uniq_cyc );
		} else {
			/* the timer never advanced during the whole run */
			printf( "%s : %7.3f   >%7.3f\n", func_name[flag],
					( double ) total_cyc / ( double ) ( NUM_ITERS ),
					( double ) total_cyc );
		}
	}

	free( elapsed_cyc );
}
Beispiel #18
0
/* Abort with a message when a PAPI call did not return the expected code. */
static void require_papi(int retval, int expected, const char *call)
{
   if (retval != expected) {
      fprintf(stderr, "%s failed with PAPI code %d\n", call, retval);
      exit(1);
   }
}

/*
 * Start/stop example: counts one instruction event (PAPI_FP_OPS when
 * available, otherwise PAPI_TOT_INS) together with PAPI_TOT_CYC in
 * EventSet1, then counts the same events added in reverse order in
 * EventSet2, running do_flops(NUM_FLOPS) under each set and printing both
 * results side by side.
 *
 * Every PAPI call is checked; the first failure ends the program.
 *
 * NOTE(review): the program ends with exit(1) even on the success path; this
 * is kept unchanged -- confirm whether callers rely on that status.
 */
int main(int argc, char **argv)
{
   int retval, eventcnt, events[2], i;
   int EventSet1 = PAPI_NULL, EventSet2 = PAPI_NULL;
   int PAPI_event;
   long_long values1[2], values2[2];
   long_long elapsed_cyc;
   char event_name[PAPI_MAX_STR_LEN];
   char add_event_str[PAPI_MAX_STR_LEN + sizeof "PAPI_add_event[]"];

   (void) argc;
   (void) argv;

   retval = PAPI_library_init(PAPI_VER_CURRENT);
   require_papi(retval, PAPI_VER_CURRENT, "PAPI_library_init");

   retval = PAPI_set_debug(PAPI_VERB_ECONT);
   require_papi(retval, PAPI_OK, "PAPI_set_debug");

   /* query and set up the right instruction to monitor */
   if (PAPI_query_event(PAPI_FP_OPS) == PAPI_OK)
      PAPI_event = PAPI_FP_OPS;
   else
      PAPI_event = PAPI_TOT_INS;

   retval = PAPI_event_code_to_name(PAPI_event, event_name);
   require_papi(retval, PAPI_OK, "PAPI_event_code_to_name");
   snprintf(add_event_str, sizeof add_event_str, "PAPI_add_event[%s]", event_name);

   retval = PAPI_create_eventset(&EventSet1);
   require_papi(retval, PAPI_OK, "PAPI_create_eventset");

   /* Add the events */

   retval = PAPI_add_event(EventSet1, PAPI_event);
   require_papi(retval, PAPI_OK, add_event_str);

   retval = PAPI_add_event(EventSet1, PAPI_TOT_CYC);
   require_papi(retval, PAPI_OK, "PAPI_add_event[PAPI_TOT_CYC]");

   /* Add them reversed to EventSet2 */

   retval = PAPI_create_eventset(&EventSet2);
   require_papi(retval, PAPI_OK, "PAPI_create_eventset");

   eventcnt = 2;
   retval = PAPI_list_events(EventSet1, events, &eventcnt);
   require_papi(retval, PAPI_OK, "PAPI_list_events");

   for (i = eventcnt - 1; i >= 0; i--) {
      retval = PAPI_event_code_to_name(events[i], event_name);
      require_papi(retval, PAPI_OK, "PAPI_event_code_to_name");

      retval = PAPI_add_event(EventSet2, events[i]);
      require_papi(retval, PAPI_OK, event_name);
   }

   elapsed_cyc = PAPI_get_real_cyc();

   retval = PAPI_start(EventSet1);
   require_papi(retval, PAPI_OK, "PAPI_start");

   do_flops(NUM_FLOPS);

   retval = PAPI_stop(EventSet1, values1);
   require_papi(retval, PAPI_OK, "PAPI_stop");

   retval = PAPI_start(EventSet2);
   require_papi(retval, PAPI_OK, "PAPI_start");

   do_flops(NUM_FLOPS);

   retval = PAPI_stop(EventSet2, values2);
   require_papi(retval, PAPI_OK, "PAPI_stop");


   elapsed_cyc = PAPI_get_real_cyc() - elapsed_cyc;

   retval = PAPI_cleanup_eventset(EventSet1);   /* JT */
   require_papi(retval, PAPI_OK, "PAPI_cleanup_eventset");

   retval = PAPI_destroy_eventset(&EventSet1);
   require_papi(retval, PAPI_OK, "PAPI_destroy_eventset");

   retval = PAPI_cleanup_eventset(EventSet2);   /* JT */
   require_papi(retval, PAPI_OK, "PAPI_cleanup_eventset");

   retval = PAPI_destroy_eventset(&EventSet2);
   require_papi(retval, PAPI_OK, "PAPI_destroy_eventset");

      printf("Test case 0: start, stop.\n");
      printf("-----------------------------------------------\n");
      printf("Using %d iterations of c += a*b\n", NUM_FLOPS);
      printf
          ("-------------------------------------------------------------------------\n");

      printf("Test type    : \t           1\t           2\n");

      /* EventSet2 holds the events in reverse order, so index 0 of one set
         pairs with index 1 of the other. Index 1 of EventSet1 is the
         PAPI_TOT_CYC count, which the label now says. */
      printf("%lld %lld\n", (long long) values1[0], (long long) values2[1]);
      printf("%s%lld %lld\n", "PAPI_TOT_CYC : \t",
             (long long) values1[1], (long long) values2[0]);
      printf("%s%lld\n", "Real cycles  : \t", (long long) elapsed_cyc);

      printf
          ("-------------------------------------------------------------------------\n");

      printf("Verification: none\n");
   exit(1);
}
Beispiel #19
0
/*
 * Thread body for the PAPI_profil() pthread test.
 *
 * Registers the calling thread with PAPI, profiles do_flops() on a two-event
 * set into a private, zero-initialised buffer of `length` unsigned shorts,
 * prints the counts and the non-empty profile buckets unless TESTS_QUIET is
 * set, and fails the test when the buffer received no samples at all.
 *
 * arg: pointer to an int holding the iteration count passed to do_flops().
 * Returns NULL.
 */
void *
Thread( void *arg )
{
	int retval, num_tests = 1, i;
	int EventSet1 = PAPI_NULL, mask1, PAPI_event;
	int num_events1;
	long long **values;
	long long elapsed_us, elapsed_cyc;
	unsigned short *profbuf;
	char event_name[PAPI_MAX_STR_LEN];

	retval = PAPI_register_thread(  );
	if ( retval != PAPI_OK )
		test_fail( __FILE__, __LINE__, "PAPI_register_thread", retval );
	/* calloc hands back the buffer already zeroed */
	profbuf = ( unsigned short * ) calloc( length, sizeof ( unsigned short ) );
	if ( profbuf == NULL )
		exit( 1 );

	/* add PAPI_TOT_CYC and one of the events in PAPI_FP_INS, PAPI_FP_OPS or
	   PAPI_TOT_INS, depends on the availability of the event on the
	   platform */
	EventSet1 =
		add_two_nonderived_events( &num_events1, &PAPI_event, &mask1 );

	values = allocate_test_space( num_tests, num_events1 );

	if ( ( retval =
		   PAPI_event_code_to_name( PAPI_event, event_name ) ) != PAPI_OK )
		test_fail( __FILE__, __LINE__, "PAPI_event_code_to_name", retval );

	elapsed_us = PAPI_get_real_usec(  );

	elapsed_cyc = PAPI_get_real_cyc(  );

	retval = PAPI_profil( profbuf, length, my_start, 65536,
						  EventSet1, PAPI_event, THR, PAPI_PROFIL_POSIX );
	if ( retval )
		test_fail( __FILE__, __LINE__, "PAPI_profil", retval );

	if ( ( retval = PAPI_start( EventSet1 ) ) != PAPI_OK )
		test_fail( __FILE__, __LINE__, "PAPI_start", retval );

	do_flops( *( int * ) arg );

	if ( ( retval = PAPI_stop( EventSet1, values[0] ) ) != PAPI_OK )
		test_fail( __FILE__, __LINE__, "PAPI_stop", retval );

	elapsed_us = PAPI_get_real_usec(  ) - elapsed_us;

	elapsed_cyc = PAPI_get_real_cyc(  ) - elapsed_cyc;

	/* to remove the profile flag */
	retval = PAPI_profil( profbuf, length, my_start, 65536,
						  EventSet1, PAPI_event, 0, PAPI_PROFIL_POSIX );
	if ( retval )
		test_fail( __FILE__, __LINE__, "PAPI_profil", retval );


	remove_test_events( &EventSet1, mask1 );

	if ( !TESTS_QUIET ) {
		if ( mask1 == 0x3 ) {
			printf( "Thread 0x%x PAPI_TOT_INS : \t%lld\n",
					( int ) pthread_self(  ), ( values[0] )[0] );
		} else {
			printf( "Thread 0x%x PAPI_FP_INS : \t%lld\n",
					( int ) pthread_self(  ), ( values[0] )[0] );
		}
		printf( "Thread 0x%x PAPI_TOT_CYC: \t%lld\n", ( int ) pthread_self(  ),
				( values[0] )[1] );
		printf( "Thread 0x%x Real usec   : \t%lld\n", ( int ) pthread_self(  ),
				elapsed_us );
		printf( "Thread 0x%x Real cycles : \t%lld\n", ( int ) pthread_self(  ),
				elapsed_cyc );

		printf( "Test case: PAPI_profil() for pthreads\n" );
		printf( "----Profile buffer for Thread 0x%x---\n",
				( int ) pthread_self(  ) );
		for ( i = 0; i < ( int ) length; i++ ) {
			if ( profbuf[i] )
				printf( "0x%lx\t%d\n", ( unsigned long ) ( my_start + 2 * i ),
						profbuf[i] );
		}
	}
	/* The test only passes if at least one bucket was hit. */
	for ( i = 0; i < ( int ) length; i++ )
		if ( profbuf[i] )
			break;

	if ( i >= ( int ) length )
		test_fail( __FILE__, __LINE__, "No information in buffers", 1 );
	free_test_space( values, num_tests );

	/* Profiling was removed above and the buffer is not read again, so it
	   can be released here instead of leaking once per thread. */
	free( profbuf );

	retval = PAPI_unregister_thread(  );
	if ( retval != PAPI_OK )
		test_fail( __FILE__, __LINE__, "PAPI_unregister_thread", retval );
	return ( NULL );
}
Beispiel #20
0
/*
 * Multiplies the two N x N file-level matrices mul1 and mul2 into res while
 * PAPI counts L1 data cache misses, load instructions and store
 * instructions, then prints the counters, the wall-clock cost of the
 * multiplication and a checksum of the result. Returns 0.
 */
int main(){

  long_long values[3];
  long_long checksum = 0;
  long_long start_cycles, end_cycles, start_usec, end_usec;
  int retval, EventSet=PAPI_NULL;
  int row, col, inner;
  const double million = 1000000;

  /* Fill the operands with small repeating values and clear the result. */
  row = 0;
  while (row < N) {
    for (col = 0; col < N; ++col) {
      mul1[row][col] = (row+col)   % 8 + 1;
      mul2[row][col] = (N-row+col) % 8 + 1;
      res[row][col] = 0;
    }
    ++row;
  }

  /* Bring up the PAPI library. */
  retval = PAPI_library_init(PAPI_VER_CURRENT);
  if (retval != PAPI_VER_CURRENT) {
    fprintf(stderr, "PAPI library init error!\n");
    exit(1);
  }

  /* Build the event set: L1 data cache misses, loads, stores. */
  if (PAPI_create_eventset(&EventSet) != PAPI_OK)
    handle_error(1, "create_eventset");

  if (PAPI_add_event(EventSet,PAPI_L1_DCM) != PAPI_OK)
    handle_error(1,"add_event - L1_DCM");

  if (PAPI_add_event(EventSet,PAPI_LD_INS) != PAPI_OK)
    handle_error(1,"add_event - LD_INS");

  if (PAPI_add_event(EventSet,PAPI_SR_INS) != PAPI_OK)
    handle_error(1,"add_event - SR_INS");

  /* Zero the counters and show what they read before anything runs. */
  if (PAPI_reset(EventSet) != PAPI_OK)
    handle_error(1,"reset");

  if (PAPI_read(EventSet, values) != PAPI_OK)
    handle_error(1,"read");

  printf("After resetting counter 'PAPI_L1_DCM' [x10^6]: %f\n",
         (double) values[0] / million);
  printf("After resetting counter 'PAPI_LD_INS' [x10^6]: %f\n",
         (double) values[1] / million);
  printf("After resetting counter 'PAPI_SR_INS' [x10^6]: %f\n",
         (double) values[2] / million);

  /* Begin counting, then take the starting timestamps. */
  if (PAPI_start(EventSet) != PAPI_OK)
    handle_error(1,"start");

  start_cycles = PAPI_get_real_cyc();
  start_usec = PAPI_get_real_usec();

  /* The measured kernel: plain triple-loop matrix multiplication. */
  for (row = 0; row < N; ++row) {
    for (col = 0; col < N; ++col) {
      for (inner = 0; inner < N; ++inner) {
        res[row][col] += mul1[row][inner] * mul2[inner][col];
      }
    }
  }

  /* Ending timestamps first, then stop the counters. */
  end_cycles = PAPI_get_real_cyc();
  end_usec = PAPI_get_real_usec();

  if (PAPI_stop(EventSet, values) != PAPI_OK)
    handle_error(1,"stop");

  printf("After stopping counter 'PAPI_L1_DCM'  [x10^6]: %f\n",
         (double) values[0] / million);
  printf("After stopping counter 'PAPI_LD_INS'  [x10^6]: %f\n",
         (double) values[1] / million);
  printf("After stopping counter 'PAPI_SR_INS'  [x10^6]: %f\n",
         (double) values[2] / million);

  printf("Wall clock cycles [x10^6]: %f\n",
         (double) (end_cycles - start_cycles) / million);
  printf("Wall clock time [seconds]: %f\n",
         (double) (end_usec - start_usec) / million);

  /* Sum every cell of the result, row by row. */
  for (row = 0; row < N; ++row) {
    for (col = 0; col < N; ++col) {
      checksum += res[row][col];
    }
  }
  printf("Matrix checksum: %lld\n", checksum);

  return(0);
}
/*
 * Benchmark driver: measures the cost, in real cycles, of a timer read pair
 * and of PAPI_start/PAPI_stop, PAPI_read, PAPI_read_ts, PAPI_accum and
 * PAPI_reset on an event set holding PAPI_TOT_CYC plus PAPI_TOT_INS
 * (PAPI_TOT_IIS is tried when PAPI_TOT_INS cannot be added).
 *
 * Each test fills array[0 .. num_iters-1] with one sample per iteration and
 * hands it to do_output() together with the test index (0..5).
 *
 * Command-line flags (matched with strstr, so a flag may be embedded in a
 * longer argument):
 *   -b <n>  bin count stored in bins (default 100)
 *   -d      sets show_dist
 *   -h      calls print_help() and exits
 *   -s      sets show_std_dev
 *   -t <n>  iteration count stored in num_iters
 */
int main(int argc, char **argv)
{
   int i, retval, EventSet = PAPI_NULL;
   int bins = 100;
   int show_dist = 0, show_std_dev = 0;
   long long totcyc, values[2];
   long long *array;


   tests_quiet(argc, argv);     /* Set TESTS_QUIET variable */

   /* Start at 1: argv[0] is the program path and must not be parsed as an
      option (a path containing "-b" or "-t" would otherwise consume argv[1]). */
   for (i = 1; i < argc; i++) {
      if (argv[i]) {
         if (strstr(argv[i], "-b")) {
            /* Only look at the value when one is actually present;
               argv[argc] is NULL and must not reach atoi(). */
            bins = (i + 1 < argc) ? atoi(argv[i + 1]) : 0;
            if (bins) i++;
            else {
               printf ("-b requires a bin count!\n");
               exit(1);
            }
         }
         if (strstr(argv[i], "-d"))
            show_dist = 1;
         if (strstr(argv[i], "-h")) {
            print_help();
            exit(1);
         }
         if (strstr(argv[i], "-s"))
            show_std_dev = 1;
         if (strstr(argv[i], "-t")) {
            /* Same guard as for -b: a trailing -t has no value to parse. */
            num_iters = (i + 1 < argc) ? atol(argv[i + 1]) : 0;
            if (num_iters) i++;
            else {
               printf ("-t requires a threshold value!\n");
               exit(1);
            }
         }
      }
   }

   printf("Cost of execution for PAPI start/stop, read and accum.\n");
   printf("This test takes a while. Please be patient...\n");

   if ((retval = PAPI_library_init(PAPI_VER_CURRENT)) != PAPI_VER_CURRENT)
      test_fail(__FILE__, __LINE__, "PAPI_library_init", retval);
   if ((retval = PAPI_set_debug(PAPI_VERB_ECONT)) != PAPI_OK)
      test_fail(__FILE__, __LINE__, "PAPI_set_debug", retval);
   if ((retval = PAPI_query_event(PAPI_TOT_CYC)) != PAPI_OK)
      test_fail(__FILE__, __LINE__, "PAPI_query_event", retval);
   if ((retval = PAPI_query_event(PAPI_TOT_INS)) != PAPI_OK)
      test_fail(__FILE__, __LINE__, "PAPI_query_event", retval);
   if ((retval = PAPI_create_eventset(&EventSet)) != PAPI_OK)
      test_fail(__FILE__, __LINE__, "PAPI_create_eventset", retval);

   if ((retval = PAPI_add_event(EventSet, PAPI_TOT_CYC)) != PAPI_OK)
      test_fail(__FILE__, __LINE__, "PAPI_add_event", retval);

   /* Fall back to PAPI_TOT_IIS when PAPI_TOT_INS cannot be added. */
   if ((retval = PAPI_add_event(EventSet, PAPI_TOT_INS)) != PAPI_OK)
      if ((retval = PAPI_add_event(EventSet, PAPI_TOT_IIS)) != PAPI_OK)
         test_fail(__FILE__, __LINE__, "PAPI_add_event", retval);

   /* Make sure no errors and warm up */

   totcyc = PAPI_get_real_cyc();
   if ((retval = PAPI_start(EventSet)) != PAPI_OK)
      test_fail(__FILE__, __LINE__, "PAPI_start", retval);
   if ((retval = PAPI_stop(EventSet, NULL)) != PAPI_OK)
      test_fail(__FILE__, __LINE__, "PAPI_stop", retval);

   /* One sample slot per iteration, reused by every test below. */
   array = malloc(num_iters * sizeof *array);
   if (array == NULL)
      test_fail(__FILE__, __LINE__, "malloc", retval);

   /* Determine clock latency: cost of two back-to-back timer reads */

   printf("\nPerforming loop latency test...\n");

   for (i = 0; i < num_iters; i++) {
      totcyc = PAPI_get_real_cyc();
      totcyc = PAPI_get_real_cyc() - totcyc;
      array[i] = totcyc;
   }

   do_output(0, array, bins, show_std_dev, show_dist);

   /* Start the start/stop eval.  Return codes are deliberately not checked
      inside the timed regions of this and the following loops. */

   printf("\nPerforming start/stop test...\n");

   for (i = 0; i < num_iters; i++) {
      totcyc = PAPI_get_real_cyc();
      PAPI_start(EventSet);
      PAPI_stop(EventSet, values);
      totcyc = PAPI_get_real_cyc() - totcyc;
      array[i] = totcyc;
   }

   do_output(1, array, bins, show_std_dev, show_dist);

   /* Start the read eval */
   printf("\nPerforming read test...\n");

   if ((retval = PAPI_start(EventSet)) != PAPI_OK)
      test_fail(__FILE__, __LINE__, "PAPI_start", retval);
   PAPI_read(EventSet, values);   /* untimed warm-up call */

   for (i = 0; i < num_iters; i++) {
      totcyc = PAPI_get_real_cyc();
      PAPI_read(EventSet, values);
      totcyc = PAPI_get_real_cyc() - totcyc;
      array[i] = totcyc;
   }
   if ((retval = PAPI_stop(EventSet, values)) != PAPI_OK)
      test_fail(__FILE__, __LINE__, "PAPI_stop", retval);

   do_output(2, array, bins, show_std_dev, show_dist);

   /* Start the read with timestamp eval */
   printf("\nPerforming read with timestamp test...\n");

   if ((retval = PAPI_start(EventSet)) != PAPI_OK)
      test_fail(__FILE__, __LINE__, "PAPI_start", retval);
   /* Baseline timestamp, subtracted from array[0] below. */
   PAPI_read_ts(EventSet, values, &totcyc);

   for (i = 0; i < num_iters; i++) {
      PAPI_read_ts(EventSet, values, &array[i]);
   }
   if ((retval = PAPI_stop(EventSet, values)) != PAPI_OK)
      test_fail(__FILE__, __LINE__, "PAPI_stop", retval);

   /* post-process the timing array: turn the stored timestamps into
      differences between consecutive calls, walking backwards so each
      predecessor is still a raw timestamp when it is subtracted */
   for (i = num_iters - 1; i > 0 ; i--) {
      array[i] -= array[i-1];
   }
   array[0] -= totcyc;

   do_output(3, array, bins, show_std_dev, show_dist);

   /* Start the accum eval */
   printf("\nPerforming accum test...\n");

   if ((retval = PAPI_start(EventSet)) != PAPI_OK)
      test_fail(__FILE__, __LINE__, "PAPI_start", retval);
   PAPI_accum(EventSet, values);   /* untimed warm-up call */

   for (i = 0; i < num_iters; i++) {
      totcyc = PAPI_get_real_cyc();
      PAPI_accum(EventSet, values);
      totcyc = PAPI_get_real_cyc() - totcyc;
      array[i] = totcyc;
   }
   if ((retval = PAPI_stop(EventSet, values)) != PAPI_OK)
      test_fail(__FILE__, __LINE__, "PAPI_stop", retval);

   do_output(4, array, bins, show_std_dev, show_dist);

   /* Start the reset eval */
   printf("\nPerforming reset test...\n");

   if ((retval = PAPI_start(EventSet)) != PAPI_OK)
      test_fail(__FILE__, __LINE__, "PAPI_start", retval);

   for (i = 0; i < num_iters; i++) {
      totcyc = PAPI_get_real_cyc();
      PAPI_reset(EventSet);
      totcyc = PAPI_get_real_cyc() - totcyc;
      array[i] = totcyc;
   }
   if ((retval = PAPI_stop(EventSet, values)) != PAPI_OK)
      test_fail(__FILE__, __LINE__, "PAPI_stop", retval);

   do_output(5, array, bins, show_std_dev, show_dist);

   free(array);
   test_pass(__FILE__, NULL, 0);
   /* NOTE(review): presumably only reached if test_pass() returns -- confirm. */
   exit(1);
}
/* Returns the value of PAPI_get_real_cyc() converted to uint64_t. */
uint64_t vt_metric_real_cyc(void)
{
  const uint64_t cycles = (uint64_t)PAPI_get_real_cyc();

  return cycles;
}
Beispiel #23
0
/*
 * Driver for an iterative solver run instrumented with PAPI counters.
 *
 * Usage: <prog> data_type(text|bin) input_file output_file
 *
 * The run is split into three measured phases:
 *   [0] input/initialization, [1] computation loop, [2] output.
 * For each phase the counters listed in Events are stored in values_i,
 * values_c and values_o respectively, and the following are derived:
 *   Lmirate[p] = counter[0] / counter[1]      (PAPI_L2_TCM / PAPI_L2_TCA)
 *   mflops[p]  = counter[2] / elapsed usec    (PAPI_FP_INS per microsecond)
 *   util[p]    = mflops[p] / PEAKPER
 *   et[p]      = elapsed wall-clock microseconds
 * Everything is written to "pstats.dat" via write_result_dat().
 *
 * Returns EXIT_SUCCESS on completion, EXIT_FAILURE on bad arguments, input
 * errors, allocation failure or a reference residual below 1.0e-15.
 */
int main(int argc, char *argv[]) {

    if (argc < 4) {
        printf("Usage: %s data_type(text or bin) input_file output_file\n", argv[0]);
        return EXIT_FAILURE;
    }

    char *file_type = argv[1];
    char *file_in = argv[2];
    char *file_out = argv[3];
    char *str1 = "SU.vtk";
    char *str2 = "VAR.vtk";
    char *str3 = "CGUP.vtk";
    char *file_perf = "pstats.dat";
    /** internal cells start and end index*/
    int nintci, nintcf;
    /** external cells start and end index. The external cells are only ghost cells.
    They are accessed only through internal cells*/
    int nextci, nextcf;
    /** link cell-to-cell array. Stores topology information*/
    int **lcc;
    /** red-black colouring of the cells*/
    int *nboard;
    /** boundary coefficients for each volume cell */
    double *bs, *be, *bn, *bw, *bl, *bh, *bp, *su;
    /**parameter used for volmesh and reading binary input file */
    int* nodeCnt;
    int*** points;
    int*** elems;
    /**Measured Performance and Papi parameters*/
    long long *values_i = calloc(4, sizeof *values_i);
    long long *values_c = calloc(4, sizeof *values_c);
    long long *values_o = calloc(4, sizeof *values_o);
    double *mflops = calloc(3, sizeof *mflops);
    double *Lmirate = calloc(3, sizeof *Lmirate);
    double *util = calloc(3, sizeof *util);
    long long *et = calloc(3, sizeof *et);
    long long start_cycles, start_usec, end_usec_1, end_cycles_2, end_cycles_3, end_usec_2, end_usec_3;

    /* The buffers above are dereferenced unconditionally later on. */
    if (values_i == NULL || values_c == NULL || values_o == NULL || mflops == NULL
        || Lmirate == NULL || util == NULL || et == NULL) {
        printf("failed to allocate performance buffers\n");
        return EXIT_FAILURE;
    }

    /**In cluster mpp_inter L1 and L2 events can not computed at the same time,
    so set into two groups*/
    int Events[NUM_EVENTS]={PAPI_L2_TCM,PAPI_L2_TCA,PAPI_FP_INS,PAPI_TOT_CYC};
    // int Events[NUM_EVENTS]={PAPI_L1_TCM,PAPI_L1_TCA,PAPI_FP_INS,PAPI_TOT_CYC};
    /**start HW counters and execution time recorder*/
    if ( PAPI_start_counters( Events, NUM_EVENTS ) != PAPI_OK ) {
        printf("Fail to start PAPI counter\n");
    }
    start_cycles = PAPI_get_real_cyc(); // Gets the starting time in clock cycles
    start_usec = PAPI_get_real_usec(); // Gets the starting time in microseconds
    /* initialization  */
    // read-in the input file
    int f_status;
    if (strcmp(file_type,"text") == 0) {

        f_status = read_formatted(file_in, &nintci, &nintcf, &nextci, &nextcf, &lcc,
                   &bs, &be, &bn, &bw, &bl, &bh, &bp, &su, &nboard);
    } else if (strcmp(file_type,"bin") == 0) {

        f_status = read_formatted_bin(file_in, &nintci, &nintcf, &nextci,
                   &nextcf, &lcc, &bs, &be, &bn, &bw,
                   &bl, &bh, &bp, &su,&nboard);
    } else {

        printf ("Input file format is nor correct\n");
        return EXIT_FAILURE;
    }
    if (f_status != 0) {

        printf("failed to initialize data!\n");
        return EXIT_FAILURE;
    }
    // allocate arrays used in gccg
    int nomax = 3;
    /** the reference residual*/
    double resref = 0.0;
    /** the ratio between the reference and the current residual*/
    double ratio;
    /* oc and cnorm are indexed by normalization step (0..10 in the init loop
       below, cnorm[nor] in the solver loop), not only by cell.  Give them at
       least 11 entries so small meshes (nintcf < 10) do not overrun them. */
    int norm_len = nintcf + 1;
    if (norm_len < 11) {
        norm_len = 11;
    }
    /** array storing residuals */
    double* resvec = calloc((size_t)(nintcf + 1), sizeof *resvec);
    /** the variation vector -> keeps the result in the end */
    double* var = calloc((size_t)(nextcf + 1), sizeof *var);
    /** the computation vectors */
    double* direc1 = calloc((size_t)(nextcf + 1), sizeof *direc1);
    double* direc2 = calloc((size_t)(nextcf + 1), sizeof *direc2);
    /** additional vectors */
    double* cgup = calloc((size_t)(nextcf + 1), sizeof *cgup);
    double* oc = calloc((size_t)norm_len, sizeof *oc);
    double* cnorm = calloc((size_t)norm_len, sizeof *cnorm);
    double* adxor1 = calloc((size_t)(nintcf + 1), sizeof *adxor1);
    double* adxor2 = calloc((size_t)(nintcf + 1), sizeof *adxor2);
    double* dxor1 = calloc((size_t)(nintcf + 1), sizeof *dxor1);
    double* dxor2 = calloc((size_t)(nintcf + 1), sizeof *dxor2);

    if (resvec == NULL || var == NULL || direc1 == NULL || direc2 == NULL
        || cgup == NULL || oc == NULL || cnorm == NULL || adxor1 == NULL
        || adxor2 == NULL || dxor1 == NULL || dxor2 == NULL) {
        printf("failed to allocate solver arrays\n");
        return EXIT_FAILURE;
    }

    /**store volume information*/
    int nc=0;
    // initialize the reference residual
    for (nc = nintci; nc <= nintcf; nc++) {
        resvec[nc] = su[nc];
        resref = resref + resvec[nc] * resvec[nc];
    }
    resref = sqrt(resref);
    if (resref < 1.0e-15) {

        printf("i/o - error: residue sum less than 1.e-15 - %lf\n", resref);
        return EXIT_FAILURE;
    }

    // initialize the arrays
    for (nc = 0; nc <= 10; nc++) {
        oc[nc] = 0.0;
        cnorm[nc] = 1.0;
    }

    for (nc = nintci; nc <= nintcf; nc++) {
        cgup[nc] = 0.0;
        var[nc] = 0.0;
    }

    for (nc = nextci; nc <= nextcf; nc++) {
        var[nc] = 0.0;
        cgup[nc] = 0.0;
        direc1[nc] = 0.0;
        bs[nc] = 0.0;
        be[nc] = 0.0;
        bn[nc] = 0.0;
        bw[nc] = 0.0;
        bl[nc] = 0.0;
        bh[nc] = 0.0;
    }

    for (nc = nintci; nc <= nintcf; nc++) {
        cgup[nc] = 1.0 / bp[nc];
    }
    int if1 = 0;
    int if2 = 0;
    int iter = 1;
    int nor = 1;
    int nor1 = nor - 1;

    /* finished initalization */
    /*read PAPI HW counters and caculate performance of input phase*/
    if ( PAPI_read_counters( values_i, NUM_EVENTS ) != PAPI_OK ) {
        printf("fail to read papi counter");
    }
    Lmirate[0] = (double) values_i[0] / values_i[1];
    end_usec_1 = PAPI_get_real_usec();
    mflops[0] = (double) values_i[2] / (end_usec_1-start_usec);
    util[0] = mflops[0] / PEAKPER;

    /* start computation loop */
    while (iter < 10000) {

        /* start phase 1 */
        // update the old values of direc
        for (nc = nintci; nc <= nintcf; nc++) {
            direc1[nc] = direc1[nc] + resvec[nc] * cgup[nc];
        }

        // compute new guess (approximation) for direc
        for (nc = nintci; nc <= nintcf; nc++) {
            direc2[nc] = bp[nc] * direc1[nc] - bs[nc] * direc1[lcc[0][nc]]
                    - bw[nc] * direc1[lcc[3][nc]] - bl[nc] * direc1[lcc[4][nc]]
                    - bn[nc] * direc1[lcc[2][nc]] - be[nc] * direc1[lcc[1][nc]]
                    - bh[nc] * direc1[lcc[5][nc]];
        } /* end phase 1 */

        /*  start phase 2 */
        // execute normalization steps
        double oc1, oc2, occ;
        if (nor1 == 1) {
            oc1 = 0;
            occ = 0;
            for (nc = nintci; nc <= nintcf; nc++) {
                occ = occ + adxor1[nc] * direc2[nc];
            }
            oc1 = occ / cnorm[1];
            for (nc = nintci; nc <= nintcf; nc++) {
                direc2[nc] = direc2[nc] - oc1 * adxor1[nc];
                direc1[nc] = direc1[nc] - oc1 * dxor1[nc];
            }
            if1++;
        } else if (nor1 == 2) {
            oc1 = 0;
            occ = 0;
            for (nc = nintci; nc <= nintcf; nc++) {
                occ = occ + adxor1[nc] * direc2[nc];
            }
            oc1 = occ / cnorm[1];
            oc2 = 0;
            occ = 0;
            for (nc = nintci; nc <= nintcf; nc++) {
                occ = occ + adxor2[nc] * direc2[nc];
            }
            oc2 = occ / cnorm[2];
            for (nc = nintci; nc <= nintcf; nc++) {
                direc2[nc] = direc2[nc] - oc1 * adxor1[nc] - oc2 * adxor2[nc];
                direc1[nc] = direc1[nc] - oc1 * dxor1[nc] - oc2 * dxor2[nc];
            }

            if2++;
        }

        cnorm[nor] = 0;
        double omega = 0;

        // compute the new residual
        for (nc = nintci; nc <= nintcf; nc++) {
            cnorm[nor] = cnorm[nor] + direc2[nc] * direc2[nc];
            omega = omega + resvec[nc] * direc2[nc];
        }
        omega = omega / cnorm[nor];
        double resnew = 0.0;
        for (nc = nintci; nc <= nintcf; nc++) {
            var[nc] = var[nc] + omega * direc1[nc];
            resvec[nc] = resvec[nc] - omega * direc2[nc];
            resnew = resnew + resvec[nc] * resvec[nc];
        }
        resnew = sqrt(resnew);
        ratio = resnew / resref;

        // stop once the residual ratio has dropped to 1.0e-10 or below
        if (ratio <= 1.0e-10) {
            break;
        }
        iter++;

        // prepare additional arrays for the next iteration step
        if (nor == nomax) {
            nor = 1;
        } else {
            if (nor == 1) {
                for (nc = nintci; nc <= nintcf; nc++) {
                    dxor1[nc] = direc1[nc];
                    adxor1[nc] = direc2[nc];
                }
            } else if (nor == 2) {
                for (nc = nintci; nc <= nintcf; nc++) {
                    dxor2[nc] = direc1[nc];
                    adxor2[nc] = direc2[nc];
                }
            }
            nor++;
        }
        nor1 = nor - 1;

    }/* end phase 2 */

    /* finished computation loop */
    /*read PAPI HW counters and caculate performance of computation phase*/
    end_cycles_2 = PAPI_get_real_cyc(); // Gets the ending time in clock cycles
    end_usec_2 = PAPI_get_real_usec(); // Gets the ending time in microseconds
    if ( PAPI_read_counters( values_c, NUM_EVENTS ) != PAPI_OK ) {
        printf("fail to read papi counter");
    }

    Lmirate[1] = (double) values_c[0]/values_c[1];
    mflops[1] = (double) values_c[2] / ( end_usec_2-end_usec_1 );
    util[1] = mflops[1] / PEAKPER;
    /* write output file  */

    if ( write_result(file_in, file_out, nintci, nintcf, var, iter, ratio) != 0 ) {
        printf("error when trying to write to file %s\n", file_out);
    }

    //transfer volume to mesh
    if (vol2mesh(nintci, nintcf, lcc, &nodeCnt, &points, &elems) != 0 ) {
        printf("error when trying to converge topology to volume");
    }
    //write output to vtk file
    if (write_result_vtk(str1, nintci, nintcf, nodeCnt, points, elems, su) != 0) {
        printf("error when write SU to vtk file");
    }
    if (write_result_vtk(str2, nintci, nintcf, nodeCnt, points, elems, var) != 0) {
        printf("error when write VAR to vtk file");
    }
    if (write_result_vtk(str3, nintci, nintcf, nodeCnt, points, elems, cgup) != 0) {
        printf("error when write CGUP to vtk file");
    }
    /*read PAPI HW counters and caculate performance of output phase*/
    if ( PAPI_stop_counters( values_o, NUM_EVENTS ) != PAPI_OK ) {
        printf("fail to stop papi counter");
    }

    Lmirate[2] = (double) values_o[0]/values_o[1];
    end_cycles_3 = PAPI_get_real_cyc(); // Gets the ending time in clock cycles
    end_usec_3 = PAPI_get_real_usec(); // Gets the ending time in microseconds
    mflops[2] = (double) (values_o[2])/(end_usec_3-end_usec_2);
    util[2] = mflops[2] / PEAKPER;
    /** Write all measured performance to pstats.dat*/
    et[0] = end_usec_1-start_usec;
    et[1] = end_usec_2-end_usec_1;
    et[2] = end_usec_3-end_usec_2;
    if (write_result_dat(file_perf, values_i,values_c, values_o,Lmirate, et, mflops, util) != 0 ) {
        printf("error when write measured performance to data file");
    }
    /* Free all the dynamically allocated memory */
    free(direc2); free(direc1); free(dxor2); free(dxor1); free(adxor2); free(adxor1);
    free(cnorm); free(oc); free(var); free(cgup); free(resvec); free(su); free(bp);
    free(bh); free(bl); free(bw); free(bn); free(be); free(bs);
    /* Performance buffers allocated at the top of main. */
    free(values_i); free(values_c); free(values_o);
    free(mflops); free(Lmirate); free(util); free(et);
    /* NOTE(review): lcc, nboard, nodeCnt, points and elems are produced by the
       reader / vol2mesh helpers and are not released here; their allocation
       layout is not visible from this function -- confirm who owns them. */
    printf("Simulation completed successfully!\n");
    return EXIT_SUCCESS;
}