//
// Initializes PAPI and starts the counters for every event in _G_EVENTS.
// This method should be placed at the start of instrumented code.
//
// Sequence: initializeCounters(0), PAPI_library_init, a check that
// NUM_EVENTS does not exceed PAPI_num_counters(), one PAPI_query_event probe
// per event (the process aborts on the first event that cannot be counted),
// and finally PAPI_start_counters.
//
void startPapiCounters(){
    initializeCounters(0);
#ifdef DBG
    printGEvents();
    printf("********* STARTING COUNTERS *************\n");
    //assert(NUM_EVENTS == _G_EVENT_COUNT);
#endif
    // Initialize the PAPI library. _CALL_PAPI is defined elsewhere;
    // presumably it validates the returned status -- TODO confirm.
    _CALL_PAPI(PAPI_library_init( PAPI_VER_CURRENT ));

    // Check that all the events can be counted at once.
    int numCounters = PAPI_num_counters() ;
    assert( NUM_EVENTS <= numCounters );

#ifdef DBG
    // Terminate the line so following debug output does not run into it.
    printf("Number of hardware counters available on this machine: %d\n", numCounters);
#endif

    for ( int i = 0; i < NUM_EVENTS; i++ ) {
        // Start from an empty string so the message below never prints an
        // unfilled buffer if the name lookup did not write to it.
        char name[PAPI_MAX_STR_LEN] = "";
        (void) _CALL_PAPI(PAPI_event_code_to_name( _G_EVENTS[i], name ));
        if(PAPI_query_event( _G_EVENTS[i] ) < PAPI_OK) {
            fprintf(stderr, "Event %s could not be counted on this machine.\n", name);
            abort();
        }
    }

    //*******  Start Counters ******
    (void) _CALL_PAPI(PAPI_start_counters(_G_EVENTS, NUM_EVENTS));
}
Beispiel #2
0
/*
 * Runs multiply() on three float buffers between PAPI_start_counters() and
 * PAPI_stop_counters() and prints the two counter values
 * (PAPI_FP_INS and PAPI_TOT_CYC).
 *
 * Usage: <program> m k n
 *   A holds m*k floats, B holds k*n floats, C holds m*n floats.
 *
 * Exit status: 0 on success, 1 for bad arguments or allocation failure,
 * 10 when the counters cannot be started or stopped.
 */
int main(int argc, char **argv) {
  int m, k, n;
  float *A, *B, *C;
  int Events[] = {PAPI_FP_INS, PAPI_TOT_CYC};
  long_long values[2];
#define NUM_EVENTS 2

  /* The three dimensions are mandatory; do not read past argv. */
  if (argc < 4) {
    fprintf(stderr, "usage: %s m k n\n", argc > 0 ? argv[0] : "program");
    return 1;
  }

  m = atoi(argv[1]);
  k = atoi(argv[2]);
  n = atoi(argv[3]);
  if (m <= 0 || k <= 0 || n <= 0) {
    fprintf(stderr, "m, k and n must be positive integers\n");
    return 1;
  }

  /* Widen before multiplying so the byte count is computed in size_t. */
  A = (float*) malloc((size_t) m * k * sizeof(float));
  B = (float*) malloc((size_t) k * n * sizeof(float));
  C = (float*) malloc((size_t) m * n * sizeof(float));
  if (A == NULL || B == NULL || C == NULL) {
    fprintf(stderr, "out of memory\n");
    free(A);
    free(B);
    free(C);
    return 1;
  }

  initialize(m, k, n, A, B, C);

  /* Start counting events */
  if (PAPI_start_counters(Events, NUM_EVENTS) != PAPI_OK)
	  exit(10);

  multiply(m, k, n, A, B, C);

  /* Stop counting events */
  if (PAPI_stop_counters(values, NUM_EVENTS) != PAPI_OK)
	  exit(10);

  /* The counters are 64-bit; print them as long long. */
  printf("Counter values: %lld, %lld\n",
         (long long) values[0], (long long) values[1]);

  free(A);
  free(B);
  free(C);
  return 0;
}
/*
 * Copies stdin to stdout while counting the I/O events listed in `names`,
 * then prints one "<name>: <value>" line per event unless TESTS_QUIET is set.
 *
 * Exit status: 1 on a PAPI version mismatch or when the counters cannot be
 * started, 2 when an event name cannot be translated to a code.
 */
int main(int argc, char** argv) {
  int Events[NUM_EVENTS]; 
  const char* names[NUM_EVENTS] = {"OPEN_CALLS", "OPEN_FDS", "READ_CALLS", "READ_BYTES", "READ_USEC", "READ_ERR", "READ_INTERRUPTED", "READ_WOULD_BLOCK", "WRITE_CALLS","WRITE_BYTES","WRITE_USEC", "WRITE_WOULD_BLOCK"};
  /* Zero-filled so the report below prints defined values even when
     PAPI_stop_counters fails and leaves the array untouched. */
  long long values[NUM_EVENTS] = {0};

  /* Set TESTS_QUIET variable */
  tests_quiet( argc, argv );

  int version = PAPI_library_init (PAPI_VER_CURRENT);
  if (version != PAPI_VER_CURRENT) {
    fprintf(stderr, "PAPI_library_init version mismatch\n");
    exit(1);
  }

  if (!TESTS_QUIET) fprintf(stderr, "This program will read from stdin and echo it to stdout\n");
  int retval;
  int e;
  /* Translate every event name into the numeric code PAPI expects. */
  for (e=0; e<NUM_EVENTS; e++) {
    retval = PAPI_event_name_to_code((char*)names[e], &Events[e]);
    if (retval != PAPI_OK) {
      fprintf(stderr, "Error getting code for %s\n", names[e]);
      exit(2);
    } 
  }

  /* Start counting events */
  if (PAPI_start_counters(Events, NUM_EVENTS) != PAPI_OK) {
    fprintf(stderr, "Error in PAPI_start_counters\n");
    exit(1);
  }

  int bytes = 0;
  char buf[1024];

  /* The measured workload: echo stdin (fd 0) to stdout (fd 1). */
  while ((bytes = read(0, buf, 1024)) > 0) {
    write(1, buf, bytes);
  }

  /* Stop counting events */
  if (PAPI_stop_counters(values, NUM_EVENTS) != PAPI_OK) {
    fprintf(stderr, "Error in PAPI_stop_counters\n");
  }
 
  if (!TESTS_QUIET) { 
    printf("----\n");
    for (e=0; e<NUM_EVENTS; e++)  
      printf("%s: %lld\n", names[e], values[e]);
  }
  test_pass( __FILE__, NULL, 0 );
  return 0;
}
/// Starts the PAPI high-level counters for the events stored in counters_.
///
/// On success papi_started_ becomes true. On failure the PAPI error code and
/// its PAPI_strerror text are written to std::cerr and papi_started_ becomes
/// false.
void papi_base::start( ) {
	// PAPI_start_counters takes a non-const int*, so hand it a private copy
	// of the event codes rather than the member's own storage.
	std::vector<int> eventsMutable( counters_.data(), counters_.data() + counters_.size() );

	// data() is well-defined for an empty vector, unlike &eventsMutable[0].
	const int retval = PAPI_start_counters( eventsMutable.data(),
	                                        static_cast<int>( eventsMutable.size() ) );

	papi_started_ = ( retval == PAPI_OK );
	if ( !papi_started_ ) {
		std::cerr << "PAPI error " << retval << ": " << PAPI_strerror( retval ) << std::endl;
	}
}
/*
 * Counts PAPI_TOT_INS around NUM_RUNS calls of instructions_million() and
 * prints the average, maximum and minimum per-run count.
 *
 * Each PAPI_read_counters call stores the count for one run in results[i].
 * Exit status: 1 when PAPI cannot be initialized, the event is unsupported,
 * or the counters cannot be started or read.
 */
int main(int argc, char **argv) {

	int retval;

	retval = PAPI_library_init(PAPI_VER_CURRENT);
	if (retval != PAPI_VER_CURRENT) {
		fprintf(stderr,"Error! PAPI_library_init %d\n", retval);
		/* Nothing below can work without the library. */
		exit(1);
	}

	retval = PAPI_query_event(PAPI_TOT_INS);
	if (retval != PAPI_OK) {
		fprintf(stderr,"PAPI_TOT_INS not supported\n");
		exit(1);
	}

	int i;
	int events[1],result;
	long long counts[1];

	/* min starts at the largest long long value so the first sample always
	   replaces it. */
	long long total=0,average,max=0,min=0x7fffffffffffffffLL;

	events[0]=PAPI_TOT_INS;

	retval = PAPI_start_counters(events,1);
	if (retval != PAPI_OK) {
		fprintf(stderr,"Error! PAPI_start_counters %d\n", retval);
		exit(1);
	}

	for(i=0;i<NUM_RUNS;i++) {

		result=instructions_million();

		retval = PAPI_read_counters(counts,1);
		if (retval != PAPI_OK) {
			fprintf(stderr,"Error! PAPI_read_counters %d\n", retval);
			exit(1);
		}

		results[i]=counts[0];

	}

	/* Final values are not needed; this only stops the counters. */
	PAPI_stop_counters(counts,1);

	PAPI_shutdown();

	for(i=0;i<NUM_RUNS;i++) {
		total+=results[i];
		if (results[i]>max) max=results[i];
		if (results[i]<min) min=results[i];
	}

	average=total/NUM_RUNS;
	printf("Average=%lld max=%lld min=%lld\n",average,max,min);

	/* result is only kept so the call above has a consumer. */
	(void) result;

	return 0;
}
Beispiel #6
0
/*
 * Allocates the file-level `values` buffer (one long long per event) and
 * starts the PAPI counters for the given events.
 *
 * events     - array of PAPI event codes
 * NUM_EVENTS - number of entries in events; must be positive
 *
 * Prints a message to stderr and exits with status 1 when the count is not
 * positive, the buffer cannot be allocated, or PAPI refuses to start.
 */
void my_papi_start(int *events, int NUM_EVENTS)
{
	int ret;

	/* A non-positive count would turn into a bogus allocation size. */
	if (NUM_EVENTS <= 0) {
		fprintf(stderr, "PAPI failed to start counters: invalid event count %d\n", NUM_EVENTS);
		exit(1);
	}

	values = (long long *)malloc(sizeof(long long)*NUM_EVENTS);
	if (values == NULL) {
		fprintf(stderr, "PAPI failed to start counters: out of memory\n");
		exit(1);
	}

	/* Start counting events */
	if ((ret = PAPI_start_counters(events, NUM_EVENTS)) != PAPI_OK) {
		fprintf(stderr, "PAPI failed to start counters: %s\n", PAPI_strerror(ret));
		exit(1);
	}
}
Beispiel #7
0
/*
 * Fills an array of SIZE random ints, sorts it, and times a loop that counts
 * the elements below RAND_MAX / 2 while PAPI cache-miss counters run.
 *
 * The counters are requested as L1, L2, L3 data-cache misses; if that set
 * cannot be started, the last event is dropped and the start is retried
 * until it succeeds or no event is left.
 */
int main (int argc, char *argv[]) {
	int i, count;
	int *array = (int*) malloc (SIZE * sizeof(int));
	uint64_t start, end;
	int events[3] = { PAPI_L1_DCM, PAPI_L2_DCM, PAPI_L3_DCM };
	long long misses[3];
	int papilevels = 3;

	if (array == NULL) {
		fprintf(stderr, "malloc failed\n");
		exit(1);
	}

	if (PAPI_library_init(PAPI_VER_CURRENT) != PAPI_VER_CURRENT) {
		exit(1);
	}

	/* Initialization */
	count = 0;
	srand(time(NULL));
	init_time();
	for (i = 0; i < SIZE; i++)
		array[i] = rand();

	/* The array is sorted before the measurement, so the loop below scans
	   sorted data. */
	qsort(array, SIZE, sizeof (int), compare_ints);

	/* Measurement: fall back to fewer cache levels, but stop at zero instead
	   of retrying forever with an empty event list. */
	while (papilevels > 0 && PAPI_start_counters(events, papilevels) != PAPI_OK) {
		papilevels--;
	}
	if (papilevels == 0) {
		fprintf(stderr, "No PAPI cache-miss counter could be started\n");
	}
	start = get_time();
	/*
	 * On a sorted array the count could be done in O(lg(n)) time instead
	 * of O(n) by using binary search. Although that would noticeably
	 * reduce the time, it is not the point of this exercise.
	 */
	for (i = 0; i < SIZE; i++)
		if (array[i] < RAND_MAX / 2)
			count++;
	end = get_time();
	uint64_t exec_time = diff_time(start, end);
	/* Only read counters that were actually started. */
	if (papilevels > 0 && PAPI_read_counters(misses, papilevels) != PAPI_OK) {
		fprintf(stderr, "Erro em PAPI_read_counters\n");
		exit(1);
	}

	printf("Time: %" PRIu64 " Count %d\n",  exec_time, count);
	for (i = 0; i < papilevels; i++) {
		printf("Cache misses (L%d): %lld\n", i+1, misses[i]);
	}
	free(array);
	return 0;
}
Beispiel #8
0
/*
 * Fills *infostruct with the description of this measurement: one function
 * for the average access time plus one per PAPI counter named in sCounters.
 * Also initializes the PAPI library and starts the counters.
 *
 * Every per-function array has infostruct->numfunctions entries and every
 * entry is initialized. The process exits with status 127 on allocation
 * failure, on a PAPI error, or when NUM_COUNTERS exceeds the local event
 * table.
 */
void bi_getinfo(bi_info* infostruct){
  int i;
  int events[10];
  const int maxEvents = (int)(sizeof(events)/sizeof(events[0]));

  init_global_vars();

  /* events[] is a fixed-size table; refuse to write past its end. */
  if (NUM_COUNTERS > maxEvents) {
    printf("Too many PAPI counters requested\n");
    exit(127);
  }

  /*infostruct->kernelstring=bi_strdup("Random Memory Access");*/
  infostruct->codesequence=bi_strdup("for i=1,N#  var=memory[random(0..size)]#");
  infostruct->xaxistext=bi_strdup("Accessed Memory in Byte");

  infostruct->numfunctions= 1+ NUM_COUNTERS;
  infostruct->maxproblemsize=nMeasurements;
  infostruct->log_xaxis=1;
  infostruct->base_xaxis=2.0;

  /* Allocate all per-function arrays first and validate them before use. */
  infostruct->outlier_direction_upwards=malloc(infostruct->numfunctions*sizeof(int));
  infostruct->log_yaxis=malloc(infostruct->numfunctions*sizeof(int));
  infostruct->base_yaxis=malloc(infostruct->numfunctions*sizeof(double));
  infostruct->legendtexts=malloc(infostruct->numfunctions*sizeof(char*));
  infostruct->yaxistexts=malloc(infostruct->numfunctions*sizeof(char*));
  if (infostruct->outlier_direction_upwards==0 || infostruct->log_yaxis==0 ||
      infostruct->base_yaxis==0 || infostruct->legendtexts==0 ||
      infostruct->yaxistexts==0){
    printf("No more core\n");
    exit(127);
  }

  /* Same axis settings for every function, not just the first one. */
  for (i=0; i< infostruct->numfunctions; i++) {
    infostruct->outlier_direction_upwards[i]=1;
    infostruct->log_yaxis[i]=0;
    infostruct->base_yaxis[i]=0.0;
  }

  infostruct->legendtexts[0]=bi_strdup("Average Access Time");
  infostruct->yaxistexts[0]=bi_strdup("s");

  if ( PAPI_library_init(PAPI_VER_CURRENT) != PAPI_VER_CURRENT) {
    printf("PAPI library init error!\n");
    exit(127);
  }
  for (i=0; i< NUM_COUNTERS; i++) {
    if (PAPI_event_name_to_code(sCounters[i],&(events[i])) != PAPI_OK) {
      printf("PAPI error: unknown event %s\n", sCounters[i]);
      exit(127);
    }
    /* Duplicated like legendtexts[0], so every entry is owned the same way. */
    infostruct->legendtexts[i+1] = bi_strdup(sCounters[i]);
    /* Counter axes carry no unit label. */
    infostruct->yaxistexts[i+1] = bi_strdup("");
    /* events[i] is an int; print it with a matching conversion. */
    printf("%s -> %x\n", sCounters[i],(unsigned int)events[i]);
  }
  if (PAPI_start_counters(events, NUM_COUNTERS) != PAPI_OK) {
    printf("PAPI error: start_counters");
    exit(127);
  }
}
Beispiel #9
0
/*
 * Opens the result file res_file_name for writing (stored in res_file) and
 * starts the two PAPI counters listed in Events.
 *
 * Returns 0 on success and -1 on failure. After a failure res_file is NULL:
 * either the file could not be opened, or it was closed again because the
 * counters could not be started.
 */
int test_start()
{
    res_file = fopen(res_file_name, "w");
    if (res_file == NULL) {
        printf("Error opening file %s\n", res_file_name);
        return -1;
    }

    if ( PAPI_start_counters( Events, 2 ) != PAPI_OK ) {
        /* Do not leak the stream opened above. */
        fclose(res_file);
        res_file = NULL;
        return -1;
    }

    return 0;
}
/*
 * Thread body: copies the file named by arg to /dev/null while counting the
 * I/O events listed in the file-level `names` table, then prints one line per
 * event unless TESTS_QUIET is set.
 *
 * arg - NUL-terminated path of the file to read
 * Returns NULL. The whole process exits with status 2 when an event name
 * cannot be translated and with status 1 when the counters cannot be started.
 */
void *ThreadIO(void *arg) {
  unsigned long tid = (unsigned long)pthread_self();
  if (!TESTS_QUIET) printf("\nThread %#lx: will read %s and write it to /dev/null\n", tid,(const char*) arg);
  int Events[NUM_EVENTS]; 
  long long values[NUM_EVENTS];
  int retval;
  int e;
  for (e=0; e<NUM_EVENTS; e++) {
    /* Start from zero so the report below prints defined values even when
       PAPI_stop_counters fails. */
    values[e] = 0;
    retval = PAPI_event_name_to_code((char*)names[e], &Events[e]);
    if (retval != PAPI_OK) {
      fprintf(stderr, "Error getting code for %s\n", names[e]);
      exit(2);
    } 
  }

  /* Start counting events */
  if (PAPI_start_counters(Events, NUM_EVENTS) != PAPI_OK) {
    fprintf(stderr, "Error in PAPI_start_counters\n");
    exit(1);
  }

  int fdin = open((const char*)arg, O_RDONLY);
  if (fdin < 0) perror("Could not open file for reading: \n");

  int bytes = 0;
  char buf[1024];

  int fdout = open("/dev/null", O_WRONLY);
  if (fdout < 0) perror("Could not open /dev/null for writing: \n");
  /* With an invalid fdin the first read fails and the loop is skipped. */
  while ((bytes = read(fdin, buf, 1024)) > 0) {
    write(fdout, buf, bytes);
  }
  close(fdout);

  /* Stop counting events */
  if (PAPI_stop_counters(values, NUM_EVENTS) != PAPI_OK) {
    fprintf(stderr, "Error in PAPI_stop_counters\n");
  }

  /* Release the input descriptor; done after the counters are stopped so the
     measured I/O sequence is unchanged. */
  if (fdin >= 0) close(fdin);

  if (!TESTS_QUIET) {
    for (e=0; e<NUM_EVENTS; e++)  
      printf("Thread %#lx: %s: %lld\n", tid, names[e], values[e]);
  }
  return(NULL);
}
Beispiel #11
0
/*
 * Reads a vector length n from stdin, fills a[] and b[], and reports the
 * PAPI_TOT_INS and PAPI_FP_OPS counts measured around loop(c, a, b, n).
 *
 * n must lie in [0, MAXVSIZE] because the vectors are fixed-size arrays.
 * The first PAPI_read_counters call directly before loop() discards whatever
 * was counted since the start, so the second read covers only loop().
 */
int main(int argc, char *argv[]) {

     double a[MAXVSIZE], b[MAXVSIZE], c[MAXVSIZE];
     int i,n;

     if (PAPI_VER_CURRENT != 
		PAPI_library_init(PAPI_VER_CURRENT))
	ehandler("PAPI_library_init error.");

     /* Kept as int: PAPI_num_counters() returns an int. */
     const int max_counters = PAPI_num_counters();
     printf("# Max counters = %d\n", max_counters);

     if (PAPI_OK != PAPI_query_event(PAPI_TOT_INS))
	        ehandler("Cannot count PAPI_TOT_INS.");

     if (PAPI_OK != PAPI_query_event(PAPI_FP_OPS))
	        ehandler("Cannot count PAPI_FP_OPS.");

     int events[2] = { PAPI_TOT_INS, PAPI_FP_OPS };
     long long values[2] = { 0, 0 };
     const int event_count = 2;

     printf("Enter vector size:  ");
     /* Reject unparsable or out-of-range sizes before indexing the arrays. */
     if (scanf("%d",&n) != 1 || n < 0 || n > MAXVSIZE) {
	        ehandler("Invalid vector size.");
	        return 1; /* in case ehandler returns */
     }

     for (i=0;i<n;i++) {
       a[i] = i;
       b[i] = n-i;
     }

     if (PAPI_OK != PAPI_start_counters(events, event_count))
               ehandler("Problem starting counters.");

     if(PAPI_OK != PAPI_read_counters(values, event_count))
               ehandler("Problem reading counters.");

     loop(c,a,b,n);

     if(PAPI_OK != PAPI_read_counters(values, event_count))
               ehandler("Problem reading counters.");

     printf("Number of instructions = %lld\n",values[0]);
     printf("Number of fp operations = %lld\n",values[1]);
     return 0;

}
/*
 * Allocates the MX x MX matrix `ad`, fills it with 1.0/3.0, and runs NITER
 * update sweeps over it while PAPI counts L1 data-cache misses and
 * floating-point operations. Prints the software flop estimate, the hardware
 * flop count, the MFlop/s rate and the L1 miss count.
 *
 * Exit status: 1 when fewer than two counters are available, an allocation
 * fails, or PAPI cannot start or read the counters.
 */
int main () 
{
    float t0, t1;
    int iter, i, j;
    int events[2] = {PAPI_L1_DCM, PAPI_FP_OPS }, ret;
    long_long values[2];

    if (PAPI_num_counters() < 2) {
        fprintf(stderr, "No hardware counters here, or PAPI not supported.\n");
        exit(1);
    }
    for (i = 0; i < MX; i++) {
        if ((ad[i] = malloc(sizeof(double)*MX)) == NULL) {
            fprintf(stderr,"malloc failed\n");
            exit(1);
        }
    }
    for (j = 0; j < MX; j++) { 
        for (i = 0; i < MX; i++) {
            ad[i][j] = 1.0/3.0; /* Initialize the data */
        }
    }
    t0 = gettime();
    if ((ret = PAPI_start_counters(events, 2)) != PAPI_OK) {
        fprintf(stderr, "PAPI failed to start counters: %s\n", PAPI_strerror(ret));
        exit(1);
    }
    /* Measured kernel: the inner loop walks the first index of ad. */
    for (iter = 0; iter < NITER; iter++) {
        for (j = 0; j < MX; j++) {
            for (i = 0; i < MX; i++) {
                ad[i][j] += ad[i][j] * 3.0;
            }
        }
    }
    if ((ret = PAPI_read_counters(values, 2)) != PAPI_OK) {
        fprintf(stderr, "PAPI failed to read counters: %s\n", PAPI_strerror(ret));
        exit(1);
    }
    t1 = gettime();

    printf("Total software flops = %f\n",(float)TOT_FLOPS);
    /* The counter is an integer; pass it as long long to match %lld. */
    printf("Total hardware flops = %lld\n",(long long)values[1]);
    printf("MFlop/s = %f\n", (float)(TOT_FLOPS/MEGA)/(t1-t0));
    printf("L1 data cache misses is %lld\n", (long long)values[0]);
    return 0;
}
Beispiel #13
0
/*
 * JNI implementation of papi.Wrapper.startCounters: passes the event codes
 * held in the Java int array to PAPI_start_counters.
 *
 * eventsarr - Java int[] of PAPI event codes
 * Returns the PAPI_start_counters result, or PAPI_EINVAL when the array is
 * null, empty, or its elements cannot be obtained from the JVM.
 */
JNIEXPORT jint JNICALL Java_papi_Wrapper_startCounters
		(JNIEnv *env, jobject UNUSED_ARG(self), jintArray eventsarr) {
	if (eventsarr == NULL) {
		return PAPI_EINVAL;
	}

	int events_count = (*env)->GetArrayLength(env, eventsarr);
	if (events_count == 0) {
		return PAPI_EINVAL;
	}

	jint *eventsj = (*env)->GetIntArrayElements(env, eventsarr, NULL);
	if (eventsj == NULL) {
		/* GetIntArrayElements returns NULL when it fails; never hand a
		   null pointer to PAPI. */
		return PAPI_EINVAL;
	}
	/* NOTE(review): this cast assumes jint and int have the same size. */
	int *events = (int *) eventsj;

	int rc = PAPI_start_counters(events, events_count);

	/* JNI_ABORT: release the buffer without copying anything back. */
	(*env)->ReleaseIntArrayElements(env, eventsarr, eventsj, JNI_ABORT);

	return rc;
}
Beispiel #14
0
/*
 * Starts the counters listed in papi_events, runs doWork(123), reads the
 * counters into papi_values[0] and prints the first two values.
 *
 * Returns 0 on success and 1 when the counters cannot be started or read.
 */
int main()
{
  //this will fail if some counters can't be accessed
  if (PAPI_start_counters(papi_events, n_papi_events) != PAPI_OK)
    {
      printf("failed to start papi\n");
      return 1;
    }

  doWork(123);

  if (PAPI_read_counters(papi_values[0], n_papi_events) != PAPI_OK)
    {
      printf("failed to read counters\n");
      return 1;
    }

  // PAPI_read_counters fills long long values, so print them with %lld.
  printf("counters' values: misses = %lld, accesses = %lld\n",
         (long long) papi_values[0][0], (long long) papi_values[0][1]);

  return 0;
}
/*
 * Test for the PAPI_SYC_INS preset: starts and immediately stops a counter
 * for it and fails when fewer than one event was counted. The test reports
 * itself through test_fail / test_skip / test_unimplemented.
 */
int main(int argc, char **argv) {

	int events[1];
	/* Zero so the "Count too low" check sees a defined value even if the
	   counters never deliver one. */
	long long counts[1] = { 0 };

	int retval,quiet;

	char test_string[]="Testing PAPI_SYC_INS predefined event...";

	quiet=test_quiet();

	retval = PAPI_library_init(PAPI_VER_CURRENT);
	if (retval != PAPI_VER_CURRENT) {
		if (!quiet) printf("Error! PAPI_library_init %d\n",retval);
		test_fail(test_string);
	}

	retval = PAPI_query_event(PAPI_SYC_INS);
	if (retval != PAPI_OK) {
		if (!quiet) printf("PAPI_SYC_INS not available\n");
		test_skip(test_string);
	}

	events[0]=PAPI_SYC_INS;

	retval = PAPI_start_counters(events,1);
	if (retval != PAPI_OK) {
		if (!quiet) printf("Error! PAPI_start_counters %d\n",retval);
		test_fail(test_string);
	}

	retval = PAPI_stop_counters(counts,1);
	if (retval != PAPI_OK) {
		if (!quiet) printf("Error! PAPI_stop_counters %d\n",retval);
		test_fail(test_string);
	}

	if (counts[0]<1) {
		if (!quiet) printf("Error! Count too low\n");
		test_fail(test_string);
	}

	PAPI_shutdown();

	test_unimplemented(test_string);

	return 0;
}
/// Drains the priority queue, relaxing the outgoing edges of every popped
/// vertex until the queue is empty.
///
/// @param pq     queue of (distance, vertex index) entries to process
/// @param graph  vertex array indexed by the vertex indices stored in pq
///
/// When PAPI is defined, the counters in g_events are started before the
/// loop and read into g_values[0] afterwards; failures are only reported on
/// std::cout.
static void start_sssp(FibHeap<size_t, size_t> *pq,
                       vertex_t *graph)
{

#ifdef PAPI
    const bool counters_started =
        (PAPI_start_counters(g_events, G_EVENT_COUNT) == PAPI_OK);
    if (!counters_started) {
        std::cout << ("Problem starting counters 1.\n");
    }
#endif

    while (!pq->empty()) {
        size_t popped_dist;
        size_t current;
        pq->pop(popped_dist, current);

        vertex_t &src = graph[current];
        const size_t src_dist = src.distance;

        for (size_t k = 0; k < src.num_edges; ++k) {
            const edge_t &edge = src.edges[k];
            const size_t candidate = src_dist + edge.weight;
            vertex_t &dst = graph[edge.target];
            const size_t dst_dist = dst.distance;

            // Nothing to do unless the path through src is strictly shorter.
            if (candidate >= dst_dist) {
                continue;
            }

            dst.distance = candidate;
            if (dst.n != NULL) {
                // Already queued: lower its key in place.
                pq->decrease_key(dst.n, candidate);
            } else {
                // Not queued yet: insert it and remember the handle.
                dst.n = pq->push(candidate, edge.target);
            }
        }
    }

#ifdef PAPI
    const bool counters_read =
        (PAPI_read_counters(g_values[0], G_EVENT_COUNT) == PAPI_OK);
    if (!counters_read) {
        std::cout << ("Problem reading counters 2.\n");
    }
#endif
}
Beispiel #17
0
/*
 * Switches the PAPI high-level counters to the single event named by metric.
 *
 * Any counter that is currently running is stopped, the name is translated
 * to an event code, the counter for it is started, and one initial read is
 * performed. Problems are reported through papi_eprintf.
 *
 * metric - PAPI event name
 */
void
papi_set_events(char *metric)
{
  const size_t n = 1;

  int max;
  long_long *papi_tmp;
  int papi_events[1];
  int code;

  max = PAPI_num_counters();

  /* Test the sign first: converting a negative (error) result to size_t
     would make it look like a huge counter count. */
  if (max < 0 || n > (size_t) max)
    papi_eprintf("Too many counters requested.\n");

  papi_tmp = malloc(sizeof(*papi_tmp) * n);
  if (papi_tmp == NULL) {
    papi_eprintf("Out of memory %s:%d.\n", __FILE__, __LINE__);
    return; /* in case papi_eprintf returns */
  }

  /* NOTE(review): PAPI_reset expects an event set handle, but it is given the
     counter count here -- confirm what was intended. */
  PAPI_reset(max);

  /* Result ignored on purpose: there may be no running counters to stop. */
  PAPI_stop_counters(papi_tmp, n);

  if (PAPI_event_name_to_code(metric, &code) != PAPI_OK)
    papi_eprintf("Unknown PAPI event %s.\n", metric);

  if (code == 0)
    papi_eprintf("Unknown PAPI event %s.\n", metric);

  papi_events[0] = code;

  if (PAPI_start_counters(papi_events, n) != PAPI_OK)
    papi_eprintf("Problem starting counters %s:%d.\n", __FILE__, __LINE__);

  if (PAPI_read_counters(papi_tmp, n) != PAPI_OK)
    papi_eprintf("Problem reading counters %s:%d.\n", __FILE__, __LINE__);

  free(papi_tmp);
}
Beispiel #18
0
/*
 * Test for the PAPI_BR_PRC preset (per test_string). Runs three measurement
 * phases, each repeating start-counters / test code / stop-counters num_runs
 * times and tracking the highest, lowest and total count:
 *
 *   1. PAPI_BR_PRC around branches_testcode(); the average is compared with
 *      `expected` (1000000) through display_error().
 *   2. PAPI_BR_CN around random_branches_testcode(); the average of this
 *      phase becomes the new `expected`.
 *   3. PAPI_BR_PRC around random_branches_testcode(); expected - average must
 *      lie between num_random_branches/4 and 3*num_random_branches/4.
 *
 * Results are reported through test_fail / test_skip / test_pass, which are
 * defined elsewhere. NOTE(review): the code after those calls suggests they
 * do not return -- confirm.
 */
int main(int argc, char **argv) {

   int retval,quiet,result;

   int num_runs=100;
   long long high=0,low=0,average=0,expected=1000000;
   double error;
   int num_random_branches=500000;

   int i;
   int events[1];
   long long counts[1],total=0;

   char test_string[]="Testing PAPI_BR_PRC predefined event...";

   quiet=test_quiet();

   retval = PAPI_library_init(PAPI_VER_CURRENT);
   if (retval != PAPI_VER_CURRENT) {
      if (!quiet) printf("Error: PAPI_library_init %d\n", retval);
      test_fail(test_string);
   }

   /* Only PAPI_BR_PRC is probed here; PAPI_BR_CN (phase 2) is not. */
   retval = PAPI_query_event(PAPI_BR_PRC);
   if (retval != PAPI_OK) {
      if (!quiet) printf("PAPI_BR_PRC not supported %d\n", retval);
      test_skip(test_string);
   }

   if (!quiet) {
      printf("\n");
      printf("Testing a simple loop with %lld branches (%d times):\n",
          expected,num_runs);
      printf("Nearly all the branches should be predicted correctly.\n");
   }

   /* Phase 1: PAPI_BR_PRC over branches_testcode(). */
   events[0]=PAPI_BR_PRC;
   high=0;
   low=0;

   for(i=0;i<num_runs;i++) {

     PAPI_start_counters(events,1);

     result=branches_testcode();

     PAPI_stop_counters(counts,1);

     if (result==CODE_UNIMPLEMENTED) {
       if (!quiet) printf("\tNo test code for this architecture\n");
       test_skip(test_string);
     }

     /* low==0 means "no sample recorded yet". */
     if (counts[0]>high) high=counts[0];
     if ((low==0) || (counts[0]<low)) low=counts[0];
     total+=counts[0];
   }

   average=total/num_runs;

   error=display_error(average,high,low,expected,quiet);

   /* NOTE(review): the limits and the message suggest `error` is a
      percentage; the message says "Instruction count" although branches are
      being counted. */
   if ((error > 1.0) || (error<-1.0)) {

      if (!quiet) printf("Instruction count off by more than 1%%\n");
      test_fail(test_string);

   }
   if (!quiet) printf("\n");

   /*******************/

   /* Phase 2: PAPI_BR_CN over random_branches_testcode(). */
   high=0; low=0; total=0;

   events[0]=PAPI_BR_CN;

   for(i=0;i<num_runs;i++) {

     PAPI_start_counters(events,1);

     result=random_branches_testcode(num_random_branches,1);

     PAPI_stop_counters(counts,1);

     if (counts[0]>high) high=counts[0];
     if ((low==0) || (counts[0]<low)) low=counts[0];
     total+=counts[0];
   }

   average=total/num_runs;

   /* The measured average replaces the initial constant as the reference. */
   expected=average;

   if (!quiet) {
      printf("\nTesting a function that branches based on a random number\n");
      printf("   The loop has %lld conditional branches.\n",expected);
      printf("   %d are random branches; %d of those were taken\n",num_random_branches,result);
   }

   /* Phase 3: PAPI_BR_PRC over the same random-branch code. */
   high=0; low=0; total=0;

   events[0]=PAPI_BR_PRC;

   for(i=0;i<num_runs;i++) {

     PAPI_start_counters(events,1);

     result=random_branches_testcode(num_random_branches,1);

     PAPI_stop_counters(counts,1);

     if (counts[0]>high) high=counts[0];
     if ((low==0) || (counts[0]<low)) low=counts[0];
     total+=counts[0];
   }

   average=total/num_runs;

   if (!quiet) {

      printf("\nOut of %lld branches, %lld predicted correctly\n",expected,average);
      printf("Assuming a good random number generator and no freaky luck\n");
      printf("The TOTAL - CORRECT value is %lld\n",expected-average);
      printf("This value should be roughly between %d and %d\n",
             num_random_branches/4,(num_random_branches/4)*3);
   }

   /* expected-average is the number of branches not counted as correctly
      predicted; it must fall inside the band printed above. */
   if ( (expected-average) < (num_random_branches/4)) {
     if (!quiet) printf("Correct predicts too low\n");
     test_fail(test_string);
   }

   if ( (expected-average) > (num_random_branches/4)*3) { 

     if (!quiet) printf("Correct predicts too high\n");
     test_fail(test_string);
   }
   if (!quiet) printf("\n");

   PAPI_shutdown();

   test_pass(test_string);

   return 0;
}
Beispiel #19
0
/* Integer hit rate in percent; 0 when there were no accesses, so the report
   never divides by zero. */
static long long hit_percentage(long long hits, long long accesses)
{
    return (accesses != 0) ? 100*hits/accesses : 0;
}

/*
 * Compares the cache behaviour of bucle() and buclePermutado().
 *
 * Each function is run TEST_NUM times between PAPI_start_counters and
 * PAPI_stop_counters for the events L1_DCM, L1_DCH, L1_DCA, L2_DCH and
 * L2_DCA. The per-run values and the averages over all runs are printed.
 *
 * Exit status: 1 when PAPI cannot be initialized or the counters cannot be
 * started or stopped; 0 otherwise (including when an event is unavailable).
 */
int main()
{
    int retval;
    int i,j;
    long long totales[EVENT_COUNT], totalesPerm[EVENT_COUNT];

    int events[] = {PAPI_L1_DCM, PAPI_L1_DCH, PAPI_L1_DCA, PAPI_L2_DCH, PAPI_L2_DCA};
    long long values[EVENT_COUNT];

    // Initialize the PAPI library
    retval = PAPI_library_init(PAPI_VER_CURRENT);

    if(retval!=PAPI_VER_CURRENT){
        fprintf(stderr, "PAPI library init error!\n");
        exit(1);
    }

    // Check that every counter is available
    for(i=0; i<EVENT_COUNT; i++)
    {
        if (PAPI_OK != PAPI_query_event(events[i]))
        {
            printf("Cannot count counter %d", i);
            exit(0);
        }
    }

    // Clear the accumulated totals
    for(i=0; i<EVENT_COUNT; i++)
    {
        totales[i]=0;
        totalesPerm[i]=0;
    }

    //iniciarMatrizB();

    printf("\n --------  Prueba con bucle original ---------\n\n");

    for(i=0; i<TEST_NUM; i++)
    {
        // Start counting events
        if (PAPI_start_counters(events, EVENT_COUNT) != PAPI_OK)
        {
            fprintf(stderr, "ERROR Starting counters!\n");
            exit(1);
        }

        bucle();

        // Stop the counters and fetch their values
        if (PAPI_stop_counters(values, EVENT_COUNT) != PAPI_OK)
        {
            fprintf(stderr, "ERROR Reading counters!\n");
            exit(1);
        }

        for(j=0; j<EVENT_COUNT; j++)
        {
            totales[j]+=values[j];
        }

        printf("Prueba %d:\n\tL1 -> Accesos: %lld  Aciertos: %lld  Fallos: %lld\n", i, values[2], values[1], values[0]);
        printf("\tL2 -> Accesos: %lld  Aciertos: %lld\n",  values[4], values[3]);
    }

    // Compute the averages
    for(i=0; i<EVENT_COUNT; i++)
    {
        totales[i] = totales[i]/TEST_NUM;
    }

    printf("\nValores medios:\n");
    printf("\tCaché L1:\n\t\tAccesos: %lld  \n\t\tAciertos: %lld  \n\t\tFallos: %lld \n\t\tPorcentaje de acierto: %lld\n",
           totales[2], totales[1], totales[0], hit_percentage(totales[1], totales[2]));
    printf("\tCaché L2:\n\t\tAccesos: %lld  \n\t\tAciertos: %lld  \n\t\tPorcentaje de acierto: %lld\n",
           totales[4], totales[3], hit_percentage(totales[3], totales[4]));

    printf("\n --------  Prueba con bucle permutado --------- \n\n");

    for(i=0; i<TEST_NUM; i++)
    {
        // Start counting events
        if (PAPI_start_counters(events, EVENT_COUNT) != PAPI_OK)
        {
            fprintf(stderr, "ERROR Starting counters!\n");
            exit(1);
        }

        buclePermutado();

        // Stop the counters and fetch their values
        if (PAPI_stop_counters(values, EVENT_COUNT) != PAPI_OK)
        {
            fprintf(stderr, "ERROR Reading counters!\n");
            exit(1);
        }

        for(j=0; j<EVENT_COUNT; j++)
        {
            totalesPerm[j]+=values[j];
        }

        printf("Prueba %d:\n\tL1 -> Accesos: %lld  Aciertos: %lld  Fallos: %lld\n", i, values[2], values[1], values[0]);
        printf("\tL2 -> Accesos: %lld  Aciertos: %lld\n",  values[4], values[3]);
    }

    // Compute the averages
    for(i=0; i<EVENT_COUNT; i++)
    {
        totalesPerm[i] = totalesPerm[i]/TEST_NUM;
    }

    printf("\nValores medios:\n");
    printf("\tCaché L1:\n\t\tAccesos: %lld  \n\t\tAciertos: %lld  \n\t\tFallos: %lld \n\t\tPorcentaje de acierto: %lld\n",
           totalesPerm[2], totalesPerm[1], totalesPerm[0], hit_percentage(totalesPerm[1], totalesPerm[2]));
    printf("\tCaché L2:\n\t\tAccesos: %lld  \n\t\tAciertos: %lld  \n\t\tPorcentaje de acierto: %lld\n",
           totalesPerm[4], totalesPerm[3], hit_percentage(totalesPerm[3], totalesPerm[4]));

    return 0;
}
Beispiel #20
0
/*
 * Thread body of a busy-waiting barrier benchmark.
 *
 * userData points to a ThreadInfo holding this thread's index and the shared
 * Context. The thread restricts itself to CPU `index`, then runs
 * c->repetitionCount iterations. Each iteration executes one of two phases,
 * selected by c->left: setting its bit in c->entry (left == 0) or in c->exit
 * (left == 1) and spinning until all threadCount bits are set or another
 * thread has flipped c->left.
 *
 * PAPI counters are recorded only for the thread whose index equals
 * threadToBeRecorded. Always returns NULL.
 */
void* Thread(void *userData) {

    ThreadInfo *info = (ThreadInfo*) userData;
    Context *c = info->c;

    int index = info->index;
    int threadCount = c->threadCount;
    int64_t repetitionCount = c->repetitionCount;

    /* Shift a 64-bit one: an int shift would overflow from bit 31 upwards. */
    uint64_t me = (uint64_t)1 << index;
    uint64_t full = 0x0000000000000000;

    uint64_t copy; //thread local copy of the entry/exit barrier

    /* `full` has one bit set for every participating thread. */
    for (int i = 0; i < threadCount; ++i) {
        full |= (uint64_t)1 << i;
    }

    // set thread affinity
    cpu_set_t cpuset;
    CPU_ZERO(&cpuset);
    CPU_SET(index, &cpuset);
    /* Keep the call outside assert() so it still runs when NDEBUG is set. */
    int affinityResult = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
    assert(affinityResult == 0);
    (void) affinityResult;

    //DEBUG
    //pthread_getaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
    //printf("%i uses cpus: ", index);
    //for (int i = 0; i < threadCount; ++i) {
    //    if (CPU_ISSET(i, &cpuset)) {
    //        printf("%i, ", i);
    //    }
    //}
    //printf("\n");


    /* Index of the thread whose counters are recorded. */
    int threadToBeRecorded = -1;
    /* Raw PAPI event codes; the casts make the unsigned-to-int conversion
       explicit. */
    int papiEvents[3] = {(int)0x8000003b, (int)0x80000000, (int)0x80000002};
    long long papiStart[3] = {0, 0, 0};
    long long papiEnd[3] = {0, 0, 0};

    if (index == threadToBeRecorded) {
        int ret = PAPI_start_counters(papiEvents, 3);
        if (ret != 0) {
            printf("thread %i: PAPI_start_counters %i\n", index, ret);
            assert(0);
        }
        ret = PAPI_read_counters(papiStart, 3);
        if (ret != 0) {
            printf("thread %i: PAPI_read_counters %i\n", index, ret);
            assert(0);
        }
    }


    //unlink("a");
    //FILE *log = fopen("a", "a");

    for(int64_t repetition = 0; repetition < repetitionCount; repetition++){

        if (c->left == 0) { /* *** if () { UNIFIED ENTRY *********************/

            /* run to wall and wait busily */
            do {
                copy = c->entry;
                //fprintf(log, "%i r %lli\n", prime, (long long) copy);
                //fflush(log);
                if ((copy & me) == 0) {
                    copy |= me;
                    c->entry = copy;
                    //fprintf(log, "%i w %lli\n", prime, (long long) copy);
                    //fflush(log);
                }
            }while (copy != full && c->left == 0);

            c->left = 1;

            c->exit = 0x0000000000000000;

        } else if (c->left == 1) { /* *** } else if () { UNIFIED ENTRY *******/

            /* Consistency check: neighbouring threads must have completed the
               same number of barrier visits. */
            for (int i = 0; i < threadCount - 1; ++i) {
                if (c->successfulBarrierVisitsCount[i] != c->successfulBarrierVisitsCount[i+1]) {
                    printf("thread %i and %i are not equal at %lli %lli\n", i, i+1,
                            (long long)c->successfulBarrierVisitsCount[i],
                            (long long)c->successfulBarrierVisitsCount[i+1]);
                    ++c->outOfSyncCount;
                    assert(0);
                }
            }

            /* wait busily until everyone has left the barrier */
            do {
                copy = c->exit;
                if ((copy & me) == 0) {
                    copy |= me;
                    c->exit = copy;
                }
            }while (copy != full && c->left == 1);

            c->left = 0;

            c->entry = 0x0000000000000000;

            ++(c->successfulBarrierVisitsCount[index]);

        } /* *** } UNIFIED ENTRY *********************************************/
    }

    if (index == threadToBeRecorded) {
        int ret = PAPI_stop_counters(papiEnd, 3);
        if (ret != 0) {
            printf("%i: PAPI_stop_counters %i\n", index, ret);
            assert(0);
        }
        printf("thread %i: papi counter 0: %lli - %lli = %lli\n", index, papiEnd[0], papiStart[0], papiEnd[0] - papiStart[0]);
        printf("thread %i: papi counter 1: %lli - %lli = %lli\n", index, papiEnd[1], papiStart[1], papiEnd[1] - papiStart[1]);
        printf("thread %i: papi counter 2: %lli - %lli = %lli\n", index, papiEnd[2], papiStart[2], papiEnd[2] - papiStart[2]);
        printf("\n");
    }

    return NULL;
}
Beispiel #21
0
/*
 * Worker thread for a parallel single-source shortest-path run over the
 * global `nodes` array.
 *
 * data points to this thread's thread_data_t. The thread repeatedly removes
 * a (distance, node) pair from the queue selected by d->sl, d->pq or d->lin,
 * skips entries whose distance no longer matches nodes[node].dist, and tries
 * to lower the distance of every neighbour with a compare-and-swap. A
 * neighbour is re-inserted into the queue only when the CAS succeeded.
 *
 * The loop ends after more than 20 * d->nb_threads consecutive removals
 * reported an empty queue (dist_node == -1). Always returns NULL.
 */
void* sssp(void *data) {
  thread_data_t *d = (thread_data_t *)data;

  /* Per-thread setup: core selection, allocator, seeds */
  set_cpu(the_cores[d->id]);
  ssalloc_init();
  PF_CORRECTION;

  seeds = seed_rand();

#ifdef PIN
  int id = d->id;
  // int cpu = 40*(id/40) + 4*(id%10) + (id%40)/10;
  int cpu = 4*(id%20) + id/20; 
  // printf("Pinning %d to %d\n",id,cpu);
  pin(pthread_self(), cpu);
  //  pin(pthread_self(), id);
#endif

 #ifdef PAPI
    /* A failure to start the counters is reported but not fatal. */
    if (PAPI_OK != PAPI_start_counters(g_events, G_EVENT_COUNT))
  {
    printf("Problem starting counters 1.");
  }
 #endif


  /* Wait on barrier */
  barrier_cross(d->barrier);

  // Begin SSSP

  /* Number of consecutive removals that reported an empty queue. */
  int fail = 0;
  // int radius = 0;
  while (1) {
    val_t node;
    slkey_t dist_node;
  //   print_skiplist(d->set);
    /* Retry the removal until it yields a node or reports an empty queue. */
    while (1) { 
     if (d->sl) {
       if (spray_delete_min_key(d->set, &dist_node, &node, d)) break; // keep trying until get a node
     } else if (d->pq) {
       if (lotan_shavit_delete_min_key(d->set, &dist_node, &node, d)) break;
     } else if (d->lin) {
       node = (val_t) deletemin_key(d->linden_set, &dist_node, d); break;
     } else {
       printf("error: no queue selected\n");
       exit(1); // TODO: grace
     }
     if (dist_node == -1) { // flag that list is empty
       break;
     }
     dist_node = 0;
    }
    if (dist_node == -1) { // list is empty; TODO make sure threads don't quit early
      fail++;
      if (fail > 20*d->nb_threads) { // TODO: really need a better break condition...
        break;
      }
      continue;
    }
    fail = 0;
    if (dist_node != nodes[node].dist) continue; // dead node
    nodes[node].times_processed++;

    /* Relax every outgoing edge of the removed node. */
    int i;
    for (i = 0;i < nodes[node].deg;i++) {
      int v = nodes[node].adj[i];
      int w = nodes[node].weights[i];
      slkey_t dist_v = nodes[v].dist;
  //  printf("v=%d dist_v=%d\n", v, dist_v);
      /* A distance of -1 is treated as "no path found yet". */
      if (dist_v == -1 || dist_node + w < dist_v) { // found better path to v
  //       printf("attempting cas...\n");
  //       printf("nodes[v].dist=%d dist_v=%d dist_node=%d\n", nodes[v].dist, dist_v, dist_node);
        int res = ATOMIC_CAS_MB(&nodes[v].dist, dist_v, dist_node+w);
  //       printf("%d nodes[%d].dist=%d\n", res, v, nodes[v].dist);
        if (res) {
          if (d->pq || d->sl) {
            sl_add_val(d->set, dist_node+w, v, TRANSACTIONAL); // add to queue only if CAS is successful
          } else if (d->lin) {
            insert(d->linden_set, dist_node+w, v);
          }
          d->nb_add++;
  //         if (dist_node+1 > radius) {
  //           radius = dist_node+1;
  //           printf("radius %d\n", radius);
  //         }
        }
      } 
    }
  }

  // End SSSP
  
#ifdef PAPI
  /* Store this thread's counter values in its own g_values row. */
  if (PAPI_OK != PAPI_read_counters(g_values[d->id], G_EVENT_COUNT))
  {
    printf("Problem reading counters 2.");
  }
#endif

  PF_PRINT;

  return NULL;
}
Beispiel #22
0
int main(int argc, char *argv[])
{
  int size, rank, world_rank, my_group;
  int num_lsms; // number of parallel LSMS instances
  int size_lsms; // number of atoms in a lsms instance
  int num_steps; // number of energy calculations
  int initial_steps; // number of steps before sampling starts
  int stepCount=0; // count the Monte Carlo steps executed
  double max_time; // maximum walltime for this run in seconds
  bool restrict_time = false;       // was the maximum time specified?
  bool restrict_steps = false; // or the max. numer of steps?
  int align; // alignment of lsms_instances
  
  double magnetization;
  double energy_accumulator; // accumulates the enegy to calculate the mean
  int energies_accumulated;


  int new_peid,new_root;
  static int op,flag;
  double *evec,*r_values;
  evec=(double *)shmalloc(sizeof(double)*3*size_lsms);
  r_values=(double *)shmalloc(sizeof(double)*(R_VALUE_OFFSET+3*(size_lsms+1)));




  energy_accumulator=0.0;
  energies_accumulated=0;

  double walltime_0,walltime;

  double restartWriteFrequency=30.0*60.0;
  double nextWriteTime=restartWriteFrequency;

  MPI_Comm local_comm;
  int *lsms_rank0;
  MPI_Status status;

  char prefix[40];
  char i_lsms_name[64];
  char gWL_in_name[64], gWL_out_name[64];
  char mode_name[64];
  char energy_calculation_name[64];
  char stupid[37];

  char step_out_name[64];
  char wl_step_out_name[128];
  char *wl_stepf=NULL;
  bool step_out_flag=false;
  std::ofstream step_out_file;
  typedef enum {Constant, Random, WangLandau_1d, ExhaustiveIsing, WangLandau_2d} EvecGenerationMode;
  typedef enum {MagneticMoment, MagneticMomentZ, MagneticMomentX, MagneticMomentY} SecondDimension;

  EvecGenerationMode evec_generation_mode = Constant;
  SecondDimension second_dimension = MagneticMoment;
  double ev0[3];

  bool return_moments_flag=true; // true-> return all magnetic moments from lsms run at each step.
  bool generator_needs_moment=false;

  typedef enum {OneStepEnergy, MultiStepEnergy, ScfEnergy} EnergyCalculationMode;
  EnergyCalculationMode energyCalculationMode = OneStepEnergy;
  int energyIndex=1; // index for the return value to use for the MC step (0: total energy, 1: band energy)

  ev0[0]=ev0[1]=0.0; ev0[2]=1.0;
  // size has to be align + size_lsms*num_lsms
  align=1;
  num_lsms=1;
  size_lsms=-1;
  my_group=-1;
  num_steps=1;
  initial_steps=0;

  sprintf(i_lsms_name,"i_lsms");
  gWL_in_name[0]=gWL_out_name[0]=0;
  mode_name[0]=0;
  energy_calculation_name[0]=0;

  // check command line arguments
  for(int i=0; i<argc; i++)
  {
    if(!strcmp("-num_lsms",argv[i])) num_lsms=atoi(argv[++i]);
    if(!strcmp("-size_lsms",argv[i])) size_lsms=atoi(argv[++i]);
    if(!strcmp("-align",argv[i])) align=atoi(argv[++i]);
    if(!strcmp("-num_steps",argv[i])) {num_steps=atoi(argv[++i]); restrict_steps=true;}
    if(!strcmp("-initial_steps",argv[i])) initial_steps=atoi(argv[++i]); 
    if(!strcmp("-walltime",argv[i])) {max_time=60.0*atof(argv[++i]); restrict_time=true;}
    if(!strcmp("-i",argv[i])) strncpy(i_lsms_name,argv[++i],64);
    if(!strcmp("-random_dir",argv[i])) {evec_generation_mode = Random;}
    if(!strcmp("-step_out",argv[i]))
    {strncpy(step_out_name,argv[++i],64); step_out_flag=true;
      return_moments_flag=true;}
    if(!strcmp("-wl_out", argv[i])) strncpy(gWL_out_name,argv[++i],64);
    if(!strcmp("-wl_in", argv[i])) strncpy(gWL_in_name,argv[++i],64);
    if(!strcmp("-mode", argv[i])) strncpy(mode_name,argv[++i],64);
    if(!strcmp("-energy_calculation",argv[i])) strncpy(energy_calculation_name,argv[++i],64);
  }

  if(!(restrict_steps || restrict_time)) restrict_steps=true;

  if(mode_name[0]!=0)
  {
    if(!strcmp("constant",mode_name)) evec_generation_mode = Constant;
    if(!strcmp("random",mode_name)) evec_generation_mode = Random;
    if(!strcmp("1d",mode_name)) evec_generation_mode = WangLandau_1d;
    if(!strcmp("ising",mode_name)) evec_generation_mode = ExhaustiveIsing;
    if(!strcmp("2d",mode_name)) evec_generation_mode = WangLandau_2d;
    if(!strcmp("2d-m",mode_name)) {evec_generation_mode = WangLandau_2d; second_dimension=MagneticMoment;}
    if(!strcmp("2d-x",mode_name)) {evec_generation_mode = WangLandau_2d; second_dimension=MagneticMomentX;}
    if(!strcmp("2d-y",mode_name)) {evec_generation_mode = WangLandau_2d; second_dimension=MagneticMomentY;}
    if(!strcmp("2d-z",mode_name)) {evec_generation_mode = WangLandau_2d; second_dimension=MagneticMomentZ;}
  }

  if(energy_calculation_name[0]!=0)
  {
    if(energy_calculation_name[0]=='o') { energyCalculationMode = OneStepEnergy; energyIndex=1; }
    if(energy_calculation_name[0]=='m') { energyCalculationMode = MultiStepEnergy; energyIndex=1; }
    if(energy_calculation_name[0]=='s') { energyCalculationMode = ScfEnergy; energyIndex=0; }
  }

#ifdef USE_PAPI
#define NUM_PAPI_EVENTS 4
  int hw_counters = PAPI_num_counters();
  if(hw_counters>NUM_PAPI_EVENTS) hw_counters=NUM_PAPI_EVENTS;
  int papi_events[NUM_PAPI_EVENTS]; // = {PAPI_TOT_INS,PAPI_TOT_CYC,PAPI_FP_OPS,PAPI_VEC_INS};
  char *papi_event_name[] = {"PAPI_TOT_INS","PAPI_FP_OPS",
                             "RETIRED_SSE_OPERATIONS:DOUBLE_ADD_SUB_OPS:DOUBLE_MUL_OPS:DOUBLE_DIV_OPS:OP_TYPE",
                             "RETIRED_SSE_OPERATIONS:SINGLE_ADD_SUB_OPS:SINGLE_MUL_OPS:SINGLE_DIV_OPS:OP_TYPE"};
  // "RETIRED_INSTRUCTIONS",
  // "RETIRED_MMX_AND_FP_INSTRUCTIONS:PACKED_SSE_AND_SSE2",
  // "RETIRED_SSE_OPERATIONS:DOUBLE_ADD_SUB_OPS:DOUBLE_MUL_OPS:DOUBLE_DIV_OPS:1",
  // "RETIRED_SSE_OPERATIONS:SINGLE_ADD_SUB_OPS:SINGLE_MUL_OPS:SINGLE_DIV_OPS:1"
  // get events from names:
  for(int i=0; i<NUM_PAPI_EVENTS; i++)
  {
    if(PAPI_event_name_to_code(papi_event_name[i],&papi_events[i]) != PAPI_OK)
    {
      // printline("Error in obtaining PAPI event code for: "+ttos(papi_event_name[i]),
      //           std::cerr,parameters.myrankWorld);
      // printline("Skipping all following events",
      //           std::cerr,parameters.myrankWorld);
      if(hw_counters>i) hw_counters=i;
    }
  }
  long long papi_values[NUM_PAPI_EVENTS+4];
  // printline("PAPI: "+ttos(hw_counters)+" counters available",std::cout,parameters.myrankWorld);
  if(hw_counters>NUM_PAPI_EVENTS) hw_counters=NUM_PAPI_EVENTS;
  long long papi_real_cyc_0 = PAPI_get_real_cyc();
  long long papi_real_usec_0 = PAPI_get_real_usec();
  long long papi_virt_cyc_0 = PAPI_get_virt_cyc();
  long long papi_virt_usec_0 = PAPI_get_virt_usec();
  PAPI_start_counters(papi_events,hw_counters);
#endif


  lsms_rank0=(int *)malloc(sizeof(int)*(num_lsms+1));

  // initialize MPI:
  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  world_rank=rank;
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  walltime_0 = get_rtc();

#ifndef SVN_REV
#define SVN_REV "unknown"
#endif

// make sure 'return_moments_flag' is set correctly
  switch(evec_generation_mode)
  {
  case Constant : break;
  case Random : break;
  case WangLandau_1d :
    return_moments_flag = true;
    generator_needs_moment = true;
    break;
  case ExhaustiveIsing : break;
  case WangLandau_2d :
    return_moments_flag = true;
    generator_needs_moment = true;
    break;
  default: std::cout<<" ERROR: UNKNOWN EVEC GENERATION MODE\n"; exit(1);
  }

  if(rank==0)
  {
    std::cout<<"LSMS_3"<<std::endl;
    std::cout<<" SVN revision "<<SVN_REV<<std::endl<<std::endl;
#ifdef USE_PAPI
    std::cout<<" Using Papi counters"<<std::endl<<std::endl; 
#endif
    std::cout<<" Size of LSMS instances = "<<size_lsms<<" atoms\n";
    std::cout<<" Number of LSMS instances = "<<num_lsms<<std::endl;
    std::cout<<" LSMS Energy calculated using ";
    switch(energyCalculationMode)
    {
    case OneStepEnergy: std::cout<<"oneStepEnergy [frozen potential band energy]"<<std::endl; break;
    case MultiStepEnergy: std::cout<<"multiStepEnergy [frozen potential band energy with converged Fermi energy]"<<std::endl; break;
    case ScfEnergy: std::cout<<"scfEnergy [self-consistent total energy]"<<std::endl; break;
    default: std::cout<<"UNKNOWN ENERGY CALCULATION METHOD"<<std::endl; exit(1);
    }
    if(restrict_steps) std::cout<<" Number of gWL steps = "<<num_steps<<std::endl;
    if(restrict_time) std::cout<<" Maximum walltime = "<<max_time<<"s\n";
    std::cout<<" Processor alignment (process allocation quantization) = "<<align<<std::endl;
    switch(evec_generation_mode)
    {
    case Constant : std::cout<<" Constant moments direction along "
                             <<ev0[0]<<" "<<ev0[1]<<" "<<ev0[2]<<std::endl;
      break;
    case Random : std::cout<<" Random distribution of moments (no Wang-Landau)"<<std::endl;
      break;
    case WangLandau_1d : std::cout<<" Wang-Landau for one continuous variable (energy)"<<std::endl;
//      return_moments_flag = true;
//      generator_needs_moment = true;
      break;
    case ExhaustiveIsing : std::cout<<" Exhaustive Ising sampling"<<std::endl; break;
    case WangLandau_2d : std::cout<<" Wang-Landau for two continuous variable (energy, ";
      switch(second_dimension)
      {
      case MagneticMoment  : std::cout<<"magnitude of magnetization)"; break;
      case MagneticMomentX : std::cout<<"x component of magnetization)"; break;
      case MagneticMomentY : std::cout<<"y component of magnetization)"; break;
      case MagneticMomentZ : std::cout<<"z component of magnetization)"; break;
      }
      std::cout<<std::endl;
//      return_moments_flag = true;
//      generator_needs_moment = true;
      break;
    default: std::cout<<" ERROR: UNKNOWN EVEC GENERATION MODE\n"; exit(1);
    }
    if(step_out_flag) std::cout<<" Step output written to: "<<step_out_name<<std::endl;
    std::cout<<std::endl;

    if(step_out_flag && (evec_generation_mode==WangLandau_1d))
    {
      // step_out_flag=false;
      snprintf(wl_step_out_name,127,"wl1d_%s",step_out_name);
      wl_stepf=wl_step_out_name;
    }

    if(step_out_flag)
    {
      step_out_file.open(step_out_name);
      step_out_file<<"#";
      for(int i=0; i<argc; i++) step_out_file<<" "<<argv[i];
      step_out_file<<std::endl<<size_lsms<<std::endl;
    }
  }

  if(generator_needs_moment) return_moments_flag=true;

  if(num_lsms==1)
  {
    SHMEM_activeset local_comm;
    local_comm.rank=shmem_my_pe();
    local_comm.size=shmem_n_pes();
    local_comm.start_pe=0;
    local_comm.logPE_stride=0;
    LSMS lsms_calc(local_comm,i_lsms_name,"1_");
      
    if(rank==0)
    {
      std::cout<<"executing LSMS(C++) for "<<lsms_calc.numSpins()<<" atoms\n";
      std::cout<<"  LSMS version = "<<lsms_calc.version()<<std::endl;
    }

    if(energyCalculationMode==OneStepEnergy)
      std::cout<<"one step Energy = "<<lsms_calc.oneStepEnergy()<<std::endl;
    else if(energyCalculationMode==MultiStepEnergy)
      std::cout<<"multi-step Energy = "<<lsms_calc.multiStepEnergy()<<std::endl;
    else if(energyCalculationMode==ScfEnergy)
      std::cout<<"self-consistent Energy = "<<lsms_calc.scfEnergy()<<std::endl;
    else
    {
      printf("ERROR: Unknown energy calculation mode for lsms_calc in wl-lsms main!\n");
     // MPI_Abort(MPI_COMM_WORLD,5);
      exit(5);
    }
  }
  else
  {
    // build the communicators
    //int color=MPI_UNDEFINED;
    //Assuming user passes a power of two while using "-align"
    int s = align;
    int comm_size=(size-align)/num_lsms;
    int world_rank;
    for(int i=0; i<num_lsms; i++)
    {
      if((world_rank>=s) && (world_rank<s+comm_size)) 
      { 
        my_group=i; 
        //color=i; 
        new_peid=world_rank-s;
        new_root=s;
      }
      lsms_rank0[i]=s;
      s+=comm_size;
    }
    if(world_rank==0){ 
      //color=num_lsms;
      new_peid=0;
      comm_size=1;
      new_root=0;
    }

    //MPI_Comm_split(MPI_COMM_WORLD, color, 0, &local_comm);
    SHMEM_activeset local_comm;
    local_comm.rank=new_peid;
    local_comm.size=comm_size;
    local_comm.start_pe=new_root;
    local_comm.logPE_stride=0;

    std::cout<<"world_rank="<<world_rank<<" -> group="<<my_group<<std::endl;

      
    snprintf(prefix,38,"Group %4d: ",my_group);

    // now we get ready to do some calculations...

    if(my_group>=0)
    {
      double energy;
      double band_energy;
      int static i_values[10];
      double static r_values[10];
      static int op;


      //MPI_Comm_rank(local_comm, &rank);
      rank = local_comm.rank;
      snprintf(prefix,38,"%d_",my_group);
      // to use the ramdisk on jaguarpf:
      // snprintf(prefix,38,"/tmp/ompi/%d_",my_group);
      LSMS lsms_calc(local_comm,i_lsms_name,prefix);
      snprintf(prefix,38,"Group %4d: ",my_group);

      if(rank==0 && my_group==0)
      {
        std::cout<<prefix<<"executing LSMS(C++) for "<<lsms_calc.numSpins()<<" atoms\n";
        std::cout<<prefix<<"  LSMS version = "<<lsms_calc.version()<<std::endl;
      }

      // wait for commands from master
      bool finished=false;
      while(!finished)
      {
        if(rank==0)
        {
          //MPI_Recv(evec,3*size_lsms,MPI_DOUBLE,0,MPI_ANY_TAG,MPI_COMM_WORLD,&status);
          //op =status.MPI_TAG;
          if (lsms_rank0[0]==world_rank)
                shmem_barrier(0, lsms_rank0[0], 2, pSync1);

        }
        //MPI_Bcast(&op,1,MPI_INT,0,local_comm);
        shmem_broadcast32(&op, &op, 1, local_comm.start_pe, local_comm.start_pe, local_comm.logPE_stride, local_comm.size, pSync2); 

/* recognized opcodes:
   5: calculate energy

   recognized energy calculation modes:
   OneStepEnergy : calculate frozen potential band energy in one step (don't converge Ef)
   use only if the Fermi energy will not change due to MC steps!
   The only method available in LSMS_1.9
   MultiStepEnergy : calculate frozen potential band energy after converging Fermi energy
   This should be the new default method. If the Fermi energy doesn't change
   multiStepEnergy only performs one step and should be equivalent to oneStepEnergy
   The tolerance for Ef convergence can be set with LSMS::setEfTol(Real).
   The default tolerance is set in the LSMS::LSMS constructor (currently 1.0e-6).
   The maximum number of steps is read from the LSMS input file 'nscf' parameter.
   ScfEnergy : this will calculate the selfconsistent total energy.
   The maximum number of steps is read from the LSMS input file 'nscf' parameter.
   NOT IMPLEMENTED YET!!!

   10: get number of sites
*/

        if(op==5)
        {
          lsms_calc.setEvec(evec);
          if(energyCalculationMode==OneStepEnergy)
            energy=lsms_calc.oneStepEnergy(&band_energy);
          else if(energyCalculationMode==MultiStepEnergy)
            band_energy=energy=lsms_calc.multiStepEnergy();
          else if(energyCalculationMode==ScfEnergy)
            energy=lsms_calc.scfEnergy(&band_energy);
          else
          {
            printf("ERROR: Unknown energy calculation mode for lsms_calc in wl-lsms main!\n");
            //MPI_Abort(MPI_COMM_WORLD,5);
            exit(5);
          }
          r_values[0]=energy;
          r_values[1]=band_energy;
          if(return_moments_flag)
          {
            lsms_calc.getMag(&r_values[R_VALUE_OFFSET]);
          }
          if(rank==0)
          {
            if(return_moments_flag)
            {
              //MPI_Send(r_values,R_VALUE_OFFSET+3*size_lsms,MPI_DOUBLE,0,1005,MPI_COMM_WORLD);
              shmem_double_put(r_values, r_values, R_VALUE_OFFSET+3*size_lsms, 0);

            } else {
              //MPI_Send(r_values,R_VALUE_OFFSET,MPI_DOUBLE,0,1005,MPI_COMM_WORLD);
              shmem_double_put(r_values, r_values, R_VALUE_OFFSET, 0);
            }
            shmem_fence();
            shmem_int_swap(&flag, world_rank, 0);

          }
              
        } else if(op==10) {
          i_values[0]=lsms_calc.numSpins();
          //MPI_Send(i_values,10,MPI_INT,0,1010,MPI_COMM_WORLD);
          shmem_int_put(i_values, i_values, 10, 0);
        } else {
          // printf("world rank %d: recieved exit\n",world_rank); 
          finished=true;
        }
      }

      shfree(evec);
      //shfree(r_values);
    }
    else if(world_rank==0)
    {
      int running;
      double **evecs;
      //double *r_values;
      //int i_values[10];
      int *init_steps;
      int total_init_steps;
      bool accepted;
        
      char *wl_inf=NULL;
      char *wl_outf=NULL;
      if(gWL_in_name) wl_inf=gWL_in_name;
      if(gWL_out_name) wl_outf=gWL_out_name;
        
      EvecGenerator *generator;

/*
      // get number of spins from first LSMS instance
      // temp r_values:
      r_values=(double *)malloc(sizeof(double)*10);
      MPI_Send(r_values,1,MPI_DOUBLE, lsms_rank0[0], 10, MPI_COMM_WORLD);
      free(r_values);
      MPI_Recv(i_values,10,MPI_INT,lsms_rank0[0],1010,MPI_COMM_WORLD,&status);
      if(i_values[0]!=size_lsms)
      {
        printf("Size specified for Wang-Landau and in LSMS input file don't match!\n");
        size_lsms=i_values[0];
      }
*/

      evecs=(double **)shmalloc(sizeof(double *)*num_lsms);
      init_steps=(int *)shmalloc(sizeof(int)*num_lsms);
      for(int i=0; i<num_lsms; i++)
      {
        evecs[i]=(double *)shmalloc(sizeof(double)*3*size_lsms);
        init_steps[i]=initial_steps;
      }
      total_init_steps=num_lsms*initial_steps;
        

      // Initialize the correct evec generator
      switch(evec_generation_mode)
      {
      case Random :  generator = new RandomEvecGenerator(size_lsms);
        break;
      case Constant: generator = new ConstantEvecGenerator(size_lsms, ev0, num_lsms);
        break;
     //case WangLandau_1d : generator = new WL1dEvecGenerator<std::mt19937>(size_lsms, num_lsms,
     //                                                                      evecs, wl_inf, wl_outf, wl_stepf);
     case WangLandau_1d : generator = new WL1dEvecGenerator<boost::mt19937>(size_lsms, num_lsms,
                                                                           evecs, wl_inf, wl_outf, wl_stepf);
        break;
      case ExhaustiveIsing : generator = new ExhaustiveIsing1dEvecGenerator(size_lsms, num_lsms,
                                                                            evecs, wl_inf, wl_outf);
        break;
      //case WangLandau_2d : generator = new WL2dEvecGenerator<std::mt19937>(size_lsms, num_lsms,
      //                                                                     evecs, wl_inf, wl_outf, wl_stepf);
      case WangLandau_2d : generator = new WL2dEvecGenerator<boost::mt19937>(size_lsms, num_lsms,
                                                                           evecs, wl_inf, wl_outf, wl_stepf);
        break;
      default: std::cerr<<"The code should never arrive here: UNKNOWN EVEC GENERATION MODE\n";
        exit(1);
      }

      for(int i=0; i<num_lsms; i++)
      {
        generator->initializeEvec(i,evecs[i]);
      }
      std::cout<<"This is the master node\n";
      // issue initial commands to all LSMS instances
      running=0;
      bool more_work=true;
      if(total_init_steps>0)
      {
        for(int i=0; i<num_lsms; i++)
        {
          std::cout<<"starting initial calculation in group "<<i<<std::endl;
          //MPI_Send(evecs[i], 3*size_lsms, MPI_DOUBLE, lsms_rank0[i], 5, MPI_COMM_WORLD);
          shmem_double_put(evec, evecs[i], 3*size_lsms, lsms_rank0[i]);
          shmem_int_p(&op, 5, lsms_rank0[i]);
          shmem_fence();


          num_steps--; running++; stepCount++;
          if(restrict_steps) std::cout<<"      "<<num_steps<<" steps remaining\n";
        }
        shmem_barrier(0, lsms_rank0[0], 2, pSync1);
        // first deal with the initial steps:
        while(running>0)
        {
          //if(return_moments_flag)
          //  MPI_Recv(r_values,R_VALUE_OFFSET+3*size_lsms,MPI_DOUBLE,MPI_ANY_SOURCE,MPI_ANY_TAG,MPI_COMM_WORLD,&status);
          //else
          //  MPI_Recv(r_values,R_VALUE_OFFSET,MPI_DOUBLE,MPI_ANY_SOURCE,MPI_ANY_TAG,MPI_COMM_WORLD,&status);
          
          shmem_int_wait(&flag,-1);

          running--;
          // std::cout<<"received energy E_tot ="<<r_values[0]<<std::endl;
          // std::cout<<"    band energy E_band="<<r_values[1]<<std::endl;
          if(total_init_steps>0)
          {
            //int r_group=(status.MPI_SOURCE-align)/comm_size;
            int r_group=(flag-align)/comm_size;
            std::cout<<"starting additional calculation in group "<<r_group<<std::endl;

            if(init_steps[r_group]>0)
            {
              more_work = !(generator->generateUnsampledEvec(r_group,evecs[r_group],r_values[energyIndex]));
              init_steps[r_group]--; total_init_steps--;
            }
                
            //MPI_Send(evecs[r_group], 3*size_lsms, MPI_DOUBLE, lsms_rank0[r_group], 5, MPI_COMM_WORLD);
            shmem_double_put(r_values, evecs[r_group],  3*size_lsms, lsms_rank0[r_group]); //TODO check this
            shmem_fence();
                
            num_steps--; running++; stepCount++;
            if(restrict_steps && num_steps<=0) more_work=false;
            if(restrict_steps) std::cout<<"      "<<num_steps<<" steps remaining\n";
            walltime = get_rtc() - walltime_0;
            if(restrict_time && walltime>=max_time) more_work=false;
            if(restrict_time) std::cout<<"      "<<max_time-walltime<<" seconds remaining\n";
          }
              
        }
      }
      more_work=true;
      running=0;
      for(int i=0; i<num_lsms; i++)
      {
        std::cout<<"starting main calculation in group "<<i<<std::endl;
        //MPI_Send(evecs[i], 3*size_lsms, MPI_DOUBLE, lsms_rank0[i], 5, MPI_COMM_WORLD);
        shmem_double_put(evec, evecs[i], 3*size_lsms, lsms_rank0[i]);
        shmem_int_p(&op, 5, lsms_rank0[i]);
        shmem_fence();
        num_steps--; running++; stepCount++;
        if(restrict_steps) std::cout<<"      "<<num_steps<<" steps remaining\n";
      }
      shmem_barrier(0, lsms_rank0[0], 2, pSync1);
        
      generator->startSampling();
      // wait for results and issue new commands or wind down
      while(running>0)
      {
        //MPI_Recv(r_values,R_VALUE_OFFSET+3*size_lsms,MPI_DOUBLE,MPI_ANY_SOURCE,MPI_ANY_TAG,MPI_COMM_WORLD,&status);
        shmem_int_wait(&flag,-1);

        running--;
        std::cout<<"received energy E_tot ="<<r_values[0]<<std::endl;
        std::cout<<"    band energy E_band="<<r_values[1]<<std::endl;
        // printf("from status.MPI_SOURCE=%d\n",status.MPI_SOURCE);
        energy_accumulator+=r_values[0]; energies_accumulated++;
        if(more_work)
        {
          int r_group=(status.MPI_SOURCE-align)/comm_size;
          std::cout<<"starting additional calculation in group "<<r_group<<std::endl;
              
          if(generator_needs_moment)
          {
            double m0,m1,m2;
            m0=0.0; m1=0.0; m2=0.0;
            for(int i=0; i<3*size_lsms; i+=3)
            {
              m0+=r_values[R_VALUE_OFFSET+i];
              m1+=r_values[R_VALUE_OFFSET+i+1];
              m2+=r_values[R_VALUE_OFFSET+i+2];
            }
            switch(second_dimension)
            {
            case  MagneticMoment : magnetization=std::sqrt(m0*m0+m1*m1+m2*m2); break;
            case  MagneticMomentX : magnetization=m0; break;
            case  MagneticMomentY : magnetization=m1; break;
            case  MagneticMomentZ : magnetization=m2; break;
            }
            if(generator->generateEvec(r_group,evecs[r_group],r_values[energyIndex],magnetization, &accepted))
              more_work=false;
          } else {
            if(generator->generateEvec(r_group,evecs[r_group],r_values[energyIndex], &accepted)) more_work=false;
          }

          //MPI_Send(evecs[r_group], 3*size_lsms, MPI_DOUBLE, lsms_rank0[r_group], 5, MPI_COMM_WORLD);
          shmem_double_put(r_values, evecs[r_group],  3*size_lsms, lsms_rank0[r_group]); //TODO check this
          shmem_fence();

          num_steps--; running++; stepCount++;
          if(restrict_steps && num_steps<=0) more_work=false;
          if(restrict_steps) std::cout<<"      "<<num_steps<<" steps remaining\n";
          walltime = get_rtc() - walltime_0;
          if(restrict_time && walltime>=max_time) more_work=false;
          if(restrict_time) std::cout<<"      "<<max_time-walltime<<" seconds remaining\n";
        }
        else
        {
          // send an exit message to this instance of LSMS
          int r_group=(status.MPI_SOURCE-align)/comm_size;

          MPI_Send(evecs[r_group], 3*size_lsms, MPI_DOUBLE, lsms_rank0[r_group], 2, MPI_COMM_WORLD);
        }

        if(step_out_flag && accepted)
        {
          step_out_file<<"# iteration "<<energies_accumulated<<std::endl;
          step_out_file.precision(15);
          step_out_file<<energies_accumulated<<std::endl;
          step_out_file<<r_values[0]<<"  "<<r_values[1]<<std::endl;
          for(int j=0; j<3*size_lsms; j+=3)
          {
            step_out_file<<r_values[j+R_VALUE_OFFSET]<<"  "<<r_values[j+R_VALUE_OFFSET+1]
                         <<"  "<<r_values[j+R_VALUE_OFFSET+2]<<std::endl;
          }
        }
        // write restart file every restartWriteFrequency seconds
        if(walltime>nextWriteTime)
        {
          generator->writeState("WLrestart.jsn");
          nextWriteTime+=restartWriteFrequency;
        }

      }
      generator->writeState("WLrestart.jsn");
/*
  if(evec_generation_mode==WangLandau_1d)
  (static_cast<WL1dEvecGenerator<std::mt19937> *>(generator))->writeState("WLrestart.state");
  if(evec_generation_mode==ExhaustiveIsing)
  (static_cast<ExhaustiveIsing1dEvecGenerator *>(generator))->writeState("WLrestart.state");
*/
      for(int i=0; i<num_lsms; i++) free(evecs[i]);
      shfree(evecs);
      //shfree(r_values);
    }
  }

  if(world_rank==0)
  {
    if(step_out_flag)
    {
      step_out_file<<"# end\n-1\n"
                   <<energy_accumulator/double(energies_accumulated)<<std::endl;
      step_out_file.close();
    }
    std::cout<<"Finished all scheduled calculations. Freeing resources.\n";
    std::cout<<"Energy mean = "<<energy_accumulator/double(energies_accumulated)<<"Ry\n";
  }


  if(num_lsms>1)
  {
    // make sure everyone arrives here:
    MPI_Bcast(stupid,37,MPI_CHAR,0,MPI_COMM_WORLD);

    if(world_rank==0)
    {
      MPI_Comm_free(&local_comm);
    }
    else if(my_group>=0)
    {
      MPI_Comm_free(&local_comm);
    }
  }



  if(world_rank==0)
  {
    double walltime = get_rtc() - walltime_0;
    std::cout<<" WL-LSMS finished in "<<walltime<<" seconds.\n";
    std::cout<<" Monte-Carlo steps / walltime = "
             <<double(stepCount)/walltime<<"/sec\n";
  }

#ifdef USE_PAPI
  PAPI_stop_counters(papi_values,hw_counters);
  papi_values[hw_counters  ] = PAPI_get_real_cyc()-papi_real_cyc_0;
  papi_values[hw_counters+1] = PAPI_get_real_usec()-papi_real_usec_0;
  papi_values[hw_counters+2] = PAPI_get_virt_cyc()-papi_virt_cyc_0;
  papi_values[hw_counters+3] = PAPI_get_virt_usec()-papi_virt_usec_0;
  long long accumulated_counters[NUM_PAPI_EVENTS+4];
/*
  for(int i=0; i<hw_counters; i++)
  {
  printline(ttos(papi_event_name[i])+" = "+ttos(papi_values[i]),
  std::cout,parameters.myrankWorld);
  }
  printline("PAPI real cycles : "+ttos(papi_values[hw_counters]),
  std::cout,parameters.myrankWorld);
  printline("PAPI real usecs : "+ttos(papi_values[hw_counters+1]),
  std::cout,parameters.myrankWorld);
  printline("PAPI user cycles : "+ttos(papi_values[hw_counters+2]),
  std::cout,parameters.myrankWorld);
  printline("PAPI user usecs : "+ttos(papi_values[hw_counters+3]),
  std::cout,parameters.myrankWorld);
*/
  
  //MPI_Reduce(papi_values,accumulated_counters,hw_counters+4,
  //           MPI_LONG,MPI_SUM,0,MPI_COMM_WORLD);

  shmem_long_sum_to_all(accumulated_counters, papi_values, hw_counters+4,
      comm.pestart, comm.logPE_stride, comm.size, pWrk_i, pSync2);



  if(world_rank==0)
  {
    for(int i=0; i<hw_counters; i++)
    {
      std::cout<<"Accumulated: "<<(papi_event_name[i])<<" = "<<(accumulated_counters[i])<<"\n";
    }
    std::cout<<"PAPI accumulated real cycles : "<<(accumulated_counters[hw_counters])<<"\n";
    std::cout<<"PAPI accumulated user cycles : "<<(accumulated_counters[hw_counters+2])<<"\n";
    double gflops_papi = ((double)accumulated_counters[1])/
      (1000.0*(double)papi_values[hw_counters+1]);
    double gflops_hw_double = ((double)accumulated_counters[2])/
      (1000.0*(double)papi_values[hw_counters+1]);
    double gflops_hw_single = ((double)accumulated_counters[3])/
      (1000.0*(double)papi_values[hw_counters+1]);
    double gips = ((double)accumulated_counters[0])/(1000.0*(double)papi_values[hw_counters+1]);
    std::cout<<"PAPI_FP_OPS real GFLOP/s : "<<(gflops_papi)<<"\n";
    std::cout<<"PAPI hw double real GFLOP/s : "<<(gflops_hw_double)<<"\n";
    std::cout<<"PAPI hw single real GFLOP/s : "<<(gflops_hw_single)<<"\n";
    std::cout<<"PAPI real GINST/s : "<<(gips)<<"\n";
  }
#endif


  //MPI_Finalize();
  return 0;
}
Beispiel #23
0
/*
 * Worker-thread body of the benchmark.
 *
 * data: pointer to this thread's thread_data_t (cast below). The structure
 *       supplies the thread id, the start barrier, the workload selectors
 *       (es, lin, sl, pq, alternate, effective, update, range) and receives
 *       the operation counters (nb_add, nb_added, nb_remove, nb_removed,
 *       nb_contains, nb_found, first_remove).
 *
 * The thread performs its per-thread setup, optionally starts the PAPI
 * counters, crosses d->barrier and then loops while *running is non-zero
 * (with DISTRIBUTION_EXPERIMENT: until the first successful removal has been
 * recorded in d->first_remove).
 *
 * Each iteration is one of:
 *   - d->es set:   remove the minimum and re-insert the entries listed for it
 *                  in deps[][] (a row ends at -1 or after MAX_DEPS entries);
 *   - update step: alternates between an add (last < 0) and a remove;
 *   - read step:   a single sl_contains() lookup.
 *
 * Always returns NULL.
 */
void* test(void *data) {
  int unext, last = -1;
  val_t val = 0;
  pval_t pval = 0;

  thread_data_t *d = (thread_data_t *)data;

  /* Create transaction */
  TM_THREAD_ENTER(d->id);
  set_cpu(the_cores[d->id]);
  /* Remaining per-thread initialisation; the start barrier is crossed below */
  ssalloc_init();
  PF_CORRECTION;

  seeds = seed_rand();

#ifdef PIN
  int id = d->id;
  int cpu = 40*(id/40) + 4*(id%10) + (id%40)/10;
  // printf("Pinning %d to %d\n",id,cpu);
  pin(pthread_self(), cpu);
  //  pin(pthread_self(), id);
#endif

#ifdef PAPI
  /* A failure to start the counters is reported but does not stop the run. */
  if (PAPI_OK != PAPI_start_counters(g_events, G_EVENT_COUNT))
  {
    printf("Problem starting counters 1.");
  }
#endif


  /* Wait on barrier */
  barrier_cross(d->barrier);

  /* Is the first op an update? */
  unext = (rand_range_re(&d->seed, 100) - 1 < d->update);

#ifdef DISTRIBUTION_EXPERIMENT
  while (1)
#else
  while (*running)
#endif
    {
      if (d->es) { // event simulator experiment
        if (d->lin) {
          if (!empty(d->linden_set)) {
            d->nb_remove++;
            // Local to this branch; intentionally shadows the function-level pval.
            pval_t pval = deletemin(d->linden_set, d);
            d->nb_removed++;

  //           printf("%d %d\n", pval, deps[pval][0]);

            // Insert every dependency of the removed entry.
            // The bound is tested BEFORE indexing so deps[pval][MAX_DEPS]
            // is never read.
            int i = 0;
            val_t dep;
            while (i < MAX_DEPS && (dep = deps[pval][i]) != -1) {
              d->nb_add++;
              if (insert(d->linden_set, dep, dep)) {
                d->nb_added++;
              }
              i++;
            }
          }
        } else {
          if (d->set->head->next[0]->next[0] != NULL) {// set not empty
            d->nb_remove++;
            if (d->sl) { // spray list
              if (spray_delete_min(d->set, &val, d)) {
                d->nb_removed++;
              } else {
                continue;
              }
            } else if (d->pq) { // lotan_shavit pq
              if (lotan_shavit_delete_min(d->set, &val, d)) {
                d->nb_removed++;
                //         continue; // TODO: maybe try remove this to simulate task handling (dependency checks still occur)
              } else {
                continue;
              }
            }

            //         struct timespec ten_usec;
            //         ten_usec.tv_sec = 0;
            //         ten_usec.tv_nsec = 10000;
            //         nanosleep(&ten_usec, NULL);

            // dependency handling
            // The bound is tested BEFORE indexing so deps[val][MAX_DEPS]
            // is never read.
            int i = 0;
            val_t dep;
            while (i < MAX_DEPS && (dep = deps[val][i]) != -1) {
              if (!sl_contains(d->set, dep, TRANSACTIONAL)) { // dependent has been removed, need to add it again
                if (sl_add(d->set, dep, TRANSACTIONAL)) { // check if insert actually succeeded (otherwise someone else did it first)
                  d->nb_added++;
                }
                d->nb_add++;
              }
              i++;
            }
          }
        }
      } else { // not event simulator
        if (unext) { // update

          if (last < 0) { // add
            val = rand_range_re(&d->seed, d->range);
            if (d->lin) {
              // The insert() result is not checked here: the add is always
              // counted as successful.
              pval = val;
              insert(d->linden_set, pval, pval);
              d->nb_added++;
              last = pval;
            } else { // not linden
              if (sl_add(d->set, val, TRANSACTIONAL)) {
                d->nb_added++;
                last = val;
              }
            }
            d->nb_add++;

          } else { // remove

            if (d->pq) {
              if (lotan_shavit_delete_min(d->set, &val, d)) {
                d->nb_removed++;
                if (d->first_remove == -1) {
                  d->first_remove = val;
                }
              }
              // Reset unconditionally; the spray and linden branches below
              // reset only after a successful removal.
              last = -1;
            }
            else if (d->sl) {
              if (spray_delete_min(d->set, &val, d)) {
                d->nb_removed++;
                if (d->first_remove == -1) {
                  d->first_remove = val;
                }
                last = -1;
              }
            }
            else if (d->lin) {
              if ((pval = deletemin(d->linden_set, d))) {
                d->nb_removed++;
                if (d->first_remove == -1) {
                  d->first_remove = pval;
                }
                last = -1;
              }
            }
            else if (d->alternate) { // alternate mode (default)
              // Remove the value remembered from the preceding add.
              if (sl_remove(d->set, last, TRANSACTIONAL)) {
                d->nb_removed++;
                if (d->first_remove == -1) {
                  d->first_remove = val;
                }
              }
              last = -1;
            } else {
              /* Random computation only in non-alternated cases */
              val = rand_range_re(&d->seed, d->range);
              /* Remove one random value */
              if (sl_remove_succ(d->set, val, TRANSACTIONAL)) {
                d->nb_removed++;
                if (d->first_remove == -1) {
                  d->first_remove = val;
                }
                /* Repeat until successful, to avoid size variations */
                last = -1;
              }
            }
            d->nb_remove++;
          }

        } else { // read

          // Choose the key to look up.
          if (d->alternate) {
            if (d->update == 0) {
              if (last < 0) {
                val = d->first;
                last = val;
              } else { // last >= 0
                val = rand_range_re(&d->seed, d->range);
                last = -1;
              }
            } else { // update != 0
              if (last < 0) {
                val = rand_range_re(&d->seed, d->range);
                //last = val;
              } else {
                val = last;
              }
            }
          } else val = rand_range_re(&d->seed, d->range);

          PF_START(2);
          if (sl_contains(d->set, val, TRANSACTIONAL))
            d->nb_found++;
          PF_STOP(2);
          d->nb_contains++;
        }

        /* Is the next op an update? */
        if (d->effective) { // a failed remove/add is a read-only tx
          unext = ((100 * (d->nb_added + d->nb_removed))
              < (d->update * (d->nb_add + d->nb_remove + d->nb_contains)));
        } else { // remove/add (even failed) is considered as an update
          unext = (rand_range_re(&d->seed, 100) - 1 < d->update);
        }
      }

#ifdef DISTRIBUTION_EXPERIMENT
      if (d->first_remove != -1) {
        break; //only one run
      }
#endif

    }
#ifdef PAPI
  /* Store this thread's counter values in its own row of g_values. */
  if (PAPI_OK != PAPI_read_counters(g_values[d->id], G_EVENT_COUNT))
  {
    printf("Problem reading counters 2.");
  }
#endif

  /* Free transaction */
  TM_THREAD_EXIT();

  PF_PRINT;

  return NULL;
}
Beispiel #24
0
/*
 * Times one call of matrix_prod() on two n x n matrices and reports the PAPI
 * hardware-counter values gathered around it.
 *
 * Usage: <program> n NB
 *   n   matrix dimension; a and b are allocated with matrix(1,n,1,n) and
 *       filled with a[i][j] = i+j and b[i][j] = i-j (1-based indices)
 *   NB  forwarded unchanged as the last argument of matrix_prod()
 *       (presumably the blocking factor -- confirm against matrix_prod)
 *
 * Output: a single tab-separated line
 *   n  PAPI_TOT_INS  PAPI_FP_OPS  PAPI_L1_DCM  elapsed-milliseconds
 *
 * Returns 0 on success and 1 when the command-line arguments are missing.
 * PAPI failures are passed to ehandler().
 */
int main(int argc, char *argv[])
{
    float **a,**b,**c;
    int n;
    int NB;
    int i,j;
    struct timeval t0,t1;
    long mtime, seconds, useconds;

    /* Both positional arguments are read below; refuse to run without them
       instead of dereferencing a missing argv entry. */
    if (argc < 3) {
        fprintf(stderr, "Usage: %s n NB\n", argc > 0 ? argv[0] : "prog");
        return 1;
    }

// Using PAPI - from countloop.c
    if (PAPI_VER_CURRENT !=
            PAPI_library_init(PAPI_VER_CURRENT))
        ehandler("PAPI_library_init error.");

    /* Only consumed by the disabled diagnostic below. */
    const size_t EVENT_MAX = PAPI_num_counters();
    (void) EVENT_MAX;
// Suppressing output
//    printf("# Max counters = %zd\n", EVENT_MAX);

    if (PAPI_OK != PAPI_query_event(PAPI_TOT_INS))
        ehandler("Cannot count PAPI_TOT_INS.");

    if (PAPI_OK != PAPI_query_event(PAPI_FP_OPS))
        ehandler("Cannot count PAPI_FP_OPS.");

    if (PAPI_OK != PAPI_query_event(PAPI_L1_DCM))
        ehandler("Cannot count PAPI_L1_DCM.");

    int events[] = { PAPI_TOT_INS, PAPI_FP_OPS, PAPI_L1_DCM };
    /* Derived from the table above so the two cannot drift apart; being a
       compile-time constant it also makes values[] a fixed-size array. */
    enum { EVENT_COUNT = sizeof(events) / sizeof(events[0]) };
    long long values[EVENT_COUNT];

// Take size from args, not prompt
// printf("Enter n:  ");  scanf("%d",&n);  printf("n = %d\n",n);
    n = atoi(argv[1]);
    NB = atoi(argv[2]);

    a = matrix(1,n,1,n);
    for (i=1; i<=n; i++)
        for (j=1; j<=n; j++)
            a[i][j] = i+j;

    b = matrix(1,n,1,n);
    for (i=1; i<=n; i++)
        for (j=1; j<=n; j++)
            b[i][j] = i-j;


//t0 = get_seconds();
    gettimeofday(&t0, NULL);
// Start PAPI
    if (PAPI_OK != PAPI_start_counters(events, EVENT_COUNT))
        ehandler("Problem starting counters.");

    /* First read: its values are overwritten by the read after the product. */
    if (PAPI_OK != PAPI_read_counters(values, EVENT_COUNT))
        ehandler("Problem reading counters.");

    c = matrix_prod(n,n,n,n,a,b,NB);
    (void) c; /* the product itself is not inspected */

    if (PAPI_OK != PAPI_read_counters(values, EVENT_COUNT))
        ehandler("Problem reading counters.");

//t1 = get_seconds();
    gettimeofday(&t1, NULL);
    seconds = t1.tv_sec - t0.tv_sec;
    useconds = t1.tv_usec - t0.tv_usec;
    /* Elapsed wall-clock time in milliseconds, rounded to nearest. */
    mtime = ((seconds) * 1000 + useconds/1000.0) + 0.5;
//printf("Time for matrix_prod = %f sec\n",t1-t0);
    printf("%d\t%lld\t%lld\t%lld\t%ld\n", n, values[0], values[1],
           values[2], mtime);

    return 0;
}
Beispiel #25
0
/*
 * Test driver for the PAPI_L2_DCM predefined event.
 *
 * Allocates an array of l2_size bytes worth of doubles, then measures
 * PAPI_L2_DCM once while writing every element and once while reading every
 * element.  Unless quiet mode is on, the measured count is printed next to
 * the estimate arraysize / (l1_linesize / sizeof(double)).
 *
 * The outcome is reported through test_fail() / test_skip() / test_pass().
 */
int main(int argc, char **argv) {

   int events[1],i;
   long long counts[1];

   int retval,quiet;
   int l1_size,l2_size,l1_linesize,l2_entries;
   int arraysize;

   char test_string[]="Testing PAPI_L2_DCM predefined event...";

   quiet=test_quiet();

   retval = PAPI_library_init(PAPI_VER_CURRENT);
   if (retval != PAPI_VER_CURRENT) {
      if (!quiet) printf("Error! PAPI_library_init %d\n",retval);
      test_fail(test_string);
   }

   retval = PAPI_query_event(PAPI_L2_DCM);
   if (retval != PAPI_OK) {
      if (!quiet) printf("PAPI_L2_DCM not available\n");
      test_skip(test_string);
   }

   events[0]=PAPI_L2_DCM;

   /* l1_size and l2_entries are not used below; the calls are kept as in
      the original because the helpers' side effects are not visible here. */
   l1_size=get_cachesize(L1D_CACHE,quiet,test_string);
   l1_linesize=get_linesize(L1D_CACHE,quiet,test_string);
   l2_size=get_cachesize(L2_CACHE,quiet,test_string);
   l2_entries=get_entries(L2_CACHE,quiet,test_string);

   /******************************************************************/
   /* Test if the C compiler uses a sane number of data cache access */
   /******************************************************************/

   arraysize=l2_size/sizeof(double);

   double *array;
   double aSumm = 0.0;

   if (!quiet) {
      /* sizeof yields size_t, so the byte count is printed with %zu. */
      printf("Allocating %zu bytes of memory (%d doubles)\n",
          arraysize*sizeof(double),arraysize);
   }

   array=(double *)calloc(arraysize,sizeof(double));
   if (array==NULL) {
      if (!quiet) printf("Error! Can't allocate memory\n");
      test_fail(test_string);
   }

   /* ---- Write pass ---- */
   if (!quiet) printf("Write test:\n");
   retval = PAPI_start_counters(events,1);
   if (retval != PAPI_OK) {
      if (!quiet) printf("Error! PAPI_start_counters %d\n",retval);
      test_fail(test_string);
   }

   for(i=0; i<arraysize; i++) {
      array[i]=(double)i;
   }

   retval = PAPI_stop_counters(counts,1);
   if (retval != PAPI_OK) {
      if (!quiet) printf("Error! PAPI_stop_counters %d\n",retval);
      test_fail(test_string);
   }

   if (!quiet) {
      printf("\tL2 D misses: %lld\n",counts[0]);
      printf("\tShould be roughly (%d/(%d/%zu)): %zu\n",
          arraysize,l1_linesize,sizeof(double),
          arraysize/(l1_linesize/sizeof(double)));
   }

   /* ---- Read pass ---- */
   retval = PAPI_start_counters(events,1);
   if (retval != PAPI_OK) {
      if (!quiet) printf("Error! PAPI_start_counters %d\n",retval);
      test_fail(test_string);
   }

   for(i=0; i<arraysize; i++) {
       aSumm += array[i];
   }

   retval = PAPI_stop_counters(counts,1);
   if (retval != PAPI_OK) {
      if (!quiet) printf("Error! PAPI_stop_counters %d\n",retval);
      test_fail(test_string);
   }

   if (!quiet) {
      /* Printing aSumm also keeps the read loop from being optimised away. */
      printf("Read test (%lf):\n",aSumm);
      printf("\tL2 D misses: %lld\n",counts[0]);
      printf("\tShould be roughly (%d/(%d/%zu)): %zu\n",
          arraysize,l1_linesize,sizeof(double),
          arraysize/(l1_linesize/sizeof(double)));
   }

   free(array);

   PAPI_shutdown();

   test_pass(test_string);

   return 0;
}
Beispiel #26
0
  /**
   * The main host function called from outside, as part of the API for a single node.
   *
   * Allocates the result matrix ff (nqx * nqy * nqz elements, zero-filled), splits the
   * 4D iteration space (qx, qy, qz, triangles) into hyperblocks and runs the form factor
   * kernel on every hyperblock.
   *
   * @param rank        Only used to restrict console output to rank 0.
   * @param shape_def   Flat triangle data; the triangle count is
   *                    shape_def.size() / CPU_T_PROP_SIZE_.
   * @param ff          Output. Allocated here with new[]; it is not freed by this function
   *                    on success. Set to NULL when a later allocation fails.
   * @param qx,qy,qz    Q-point arrays, forwarded to the kernels.
   * @param nqx,nqy,nqz Number of entries along each q dimension.
   * @param rot         Forwarded to the fused kernels.
   * @param kernel_time Output. Elapsed time of the hyperblock loop, in milliseconds.
   * @param red_time    NOTE(review): never written by this function.
   * @param mem_time    NOTE(review): never written by this function.
   * @param block_x,block_y,block_z,block_t  (FINDBLOCK builds only) forwarded to
   *                    compute_block_size().
   * @return The number of triangles processed, or 0 if there are no triangles or a
   *         memory allocation failed.
   */
  unsigned int NumericFormFactorC::compute_form_factor(int rank,
//            #ifndef __SSE3__
              real_vec_t &shape_def,
//            #else
//              real_t* shape_def, unsigned int num_triangles,
//            #endif
            complex_t* &ff,
            real_t* &qx, int nqx, real_t* &qy, int nqy, complex_t* &qz, int nqz,
            real_t* &rot,
            real_t& kernel_time, real_t& red_time, real_t& mem_time
            #ifdef FINDBLOCK
              , const int block_x, const int block_y, const int block_z, const int block_t
            #endif
            ) {
    #ifdef _OPENMP
      if(rank == 0)
        std::cout << "++      Number of OpenMP threads: " << omp_get_max_threads() << std::endl;
    #endif
  
//    #ifndef __SSE3__
      unsigned int num_triangles = shape_def.size() / CPU_T_PROP_SIZE_;
//    #endif
    if(num_triangles < 1) return 0;

//    #ifdef INTEL_SB_AVX
//      unsigned int shape_padding = (32 - (num_triangles & 31)) & 31;
//    #elif defined __SSE3__
//      unsigned int shape_padding = (16 - (num_triangles & 15)) & 15;
//    #endif
  
    //#ifndef FF_NUM_CPU_PADDING
      // widen before multiplying so the product is not computed (and overflowed) in int
      unsigned long int total_qpoints = (unsigned long int) nqx * nqy * nqz;
      unsigned long int host_mem_usage = ((unsigned long int) nqx + nqy) * sizeof(real_t) +
                        nqz * sizeof(complex_t);
    //#else
      // padding to 16 bytes
      //const unsigned int PAD_LINE_ = 16;
      //unsigned int pad_x = 0;
      //if(nqx != 1) pad_x = (PAD_LINE_ - (nqx % PAD_LINE_)) % PAD_LINE_;
      //unsigned int pad_y = (PAD_LINE_ - (nqy % PAD_LINE_)) % PAD_LINE_;
      //unsigned int pad_z = (PAD_LINE_ - (nqz % PAD_LINE_)) % PAD_LINE_;
      //unsigned int pnqx = nqx + pad_x, pnqy = nqy + pad_y, pnqz = nqz + pad_z;
      //unsigned long int total_qpoints = pnqx * pnqy * pnqz;
      //unsigned long int host_mem_usage = ((unsigned long int) pnqx + pnqy) * sizeof(real_t) +
      //                  pnqz * sizeof(complex_t);
    //#endif
  
    // allocate memory for the final FF 3D matrix
    ff = new (std::nothrow) complex_t[total_qpoints];
    if(ff == NULL) {
      std::cerr << "Memory allocation failed for ff. Size = "
            << total_qpoints * sizeof(complex_t) << " b" << std::endl;
      return 0;
    } // if
    // zero the matrix only after the allocation is known to have succeeded
    memset(ff, 0, total_qpoints * sizeof(complex_t));
    host_mem_usage += total_qpoints * sizeof(complex_t);
  
    //unsigned long int matrix_size = (unsigned long int) nqx * nqy * nqz * num_triangles;
    
    // do hyperblocking to use less memory
    unsigned int b_nqx = 0, b_nqy = 0, b_nqz = 0, b_num_triangles = 0;
    #ifndef FF_NUM_CPU_AUTOTUNE_HB
      compute_block_size(nqx, nqy, nqz, num_triangles,
                b_nqx, b_nqy, b_nqz, b_num_triangles
                #ifdef FINDBLOCK
                  , block_x, block_y, block_z, block_t
                #endif
                );
    #else
      std::cout << "-- Autotuning hyperblock size ... " << std::endl;
      double min_time_hb = 1000000.0;
      unsigned int min_b_nqx = 1, min_b_nqy = 1, min_b_nqz = 1, min_b_num_triangles = 1;
      woo::BoostChronoTimer at_kernel_timer, at_overhead_timer;
      at_overhead_timer.start();
      // scratch output for the trial runs, so that they do not touch the real result in ff
      complex_t* ff_temp = new (std::nothrow) complex_t[total_qpoints];
      if(ff_temp == NULL) {
        std::cerr << "Memory allocation failed for ff_temp. Size = "
              << total_qpoints * sizeof(complex_t) << " b" << std::endl;
        delete[] ff;
        ff = NULL;
        return 0;
      } // if
      memset(ff_temp, 0, total_qpoints * sizeof(complex_t));
      for(int b_nqx_i = 1; b_nqx_i <= nqx; ++ b_nqx_i) {
        for(int b_nqy_i = 10; b_nqy_i <= nqy; b_nqy_i += 10) {
          for(int b_nqz_i = 10; b_nqz_i <= nqz; b_nqz_i += 10) {
            for(int b_nt_i = 10; b_nt_i <= num_triangles; b_nt_i += 10) {
              at_kernel_timer.start();

              // compute the number of sub-blocks, along each of the 4 dimensions
              unsigned int nb_x = (unsigned int) ceil((float) nqx / b_nqx_i);
              unsigned int nb_y = (unsigned int) ceil((float) nqy / b_nqy_i);
              unsigned int nb_z = (unsigned int) ceil((float) nqz / b_nqz_i);
              unsigned int nb_t = (unsigned int) ceil((float) num_triangles / b_nt_i);
              unsigned int num_blocks = nb_x * nb_y * nb_z * nb_t;

              // time a single hyperblock (the first one) of the candidate size
              form_factor_kernel_fused_nqx1(qx, qy, qz, shape_def,
                  b_nqx_i, b_nqy_i, b_nqz_i, b_nt_i,
                  b_nqx_i, b_nqy_i, b_nqz_i, b_nt_i,
                  nqx, nqy, nqz, num_triangles,
                  0, 0, 0, 0,
                  rot,
                  ff_temp);

              at_kernel_timer.stop();
              double curr_time = at_kernel_timer.elapsed_msec();
              // extrapolate the single-block time to the whole decomposition
              double tot_time = curr_time * num_blocks;
              std::cout << "## " << b_nqx_i << " x " << b_nqy_i << " x " << b_nqz_i
                    << " x " << b_nt_i << "\t" << num_blocks << "\t:\t"
                    << curr_time << "\t" << tot_time << std::endl;
              if(tot_time < min_time_hb) {
                min_time_hb = tot_time;
                min_b_nqx = b_nqx_i; min_b_nqy = b_nqy_i; min_b_nqz = b_nqz_i;
                min_b_num_triangles = b_nt_i;
              } // if
            } // for
          } // for
        } // for
      } // for
      delete[] ff_temp;
      at_overhead_timer.stop();

      b_nqx = min_b_nqx; b_nqy = min_b_nqy; b_nqz = min_b_nqz; b_num_triangles = min_b_num_triangles;
      if(rank == 0) {
        std::cout << "##    HBlock Autotuner overhead: " << at_overhead_timer.elapsed_msec()
              << " ms." << std::endl;
      } // if
    #endif
  
    unsigned long int blocked_3d_matrix_size = (unsigned long int) b_nqx * b_nqy * b_nqz;
    
    //size_t estimated_host_mem_need = host_mem_usage + blocked_matrix_size * sizeof(complex_t);
    //if(rank == 0) {
    //  std::cout << "++    Estimated host memory need: " << (float) estimated_host_mem_need / 1024 / 1024
    //        << " MB" << std::endl;
    //} // if
    #ifndef FF_NUM_CPU_FUSED
      unsigned long int blocked_matrix_size =
                    (unsigned long int) blocked_3d_matrix_size * b_num_triangles;
      host_mem_usage += blocked_matrix_size * sizeof(complex_t);
      complex_t *fq_buffer = new (std::nothrow) complex_t[blocked_matrix_size]();
      if(fq_buffer == NULL) {
        std::cerr << "Memory allocation failed for fq_buffer. blocked_matrix_size = "
              << blocked_matrix_size << std::endl
              << "Host memory usage = " << (float) host_mem_usage / 1024 / 1024 << " MB"
              << std::endl;
        delete[] ff;
        ff = NULL;  // ff is a reference: do not leave the caller with a dangling pointer
        return 0;
      } // if
    #endif
    if(rank == 0) {
      std::cout << "++             Host memory usage: " << (float) host_mem_usage / 1024 / 1024
            << " MB" << std::endl << std::flush;
    } // if

    // compute the number of sub-blocks, along each of the 4 dimensions
    // formulate loops over each dimension, to go over each sub block
    unsigned int nb_x = (unsigned int) ceil((float) nqx / b_nqx);
    unsigned int nb_y = (unsigned int) ceil((float) nqy / b_nqy);
    unsigned int nb_z = (unsigned int) ceil((float) nqz / b_nqz);
    unsigned int nb_t = (unsigned int) ceil((float) num_triangles / b_num_triangles);

    unsigned int curr_b_nqx = b_nqx, curr_b_nqy = b_nqy, curr_b_nqz = b_nqz;
    unsigned int curr_b_num_triangles = b_num_triangles;
    unsigned int num_blocks = nb_x * nb_y * nb_z * nb_t;

    #ifdef TIME_DETAIL_2
      if(rank == 0) {
        std::cout << "++               Hyperblock size: " << b_nqx << " x " << b_nqy
              << " x " << b_nqz << " x " << b_num_triangles << std::endl;
        std::cout << "++  Number of decomposed Hblocks: " << num_blocks
              << " [" << nb_x << " x " << nb_y << " x " << nb_z << " x " << nb_t << "]"
              << std::endl;
      } // if
    #endif // TIME_DETAIL_2

    #ifdef PROFILE_PAPI
      long long int papi_total_cycles = 0, papi_total_inst = 0, papi_total_flop = 0;
    #endif

    if(rank == 0) std::cout << "-- Computing form factor on CPU ... " << std::flush;

    woo::BoostChronoTimer kernel_timer;
    kernel_timer.start();

    // compute for each hyperblock
    // (the last block along each dimension may be smaller than the nominal block size)
    curr_b_nqx = b_nqx;
    for(unsigned int ib_x = 0; ib_x < nb_x; ++ ib_x) {
      if(ib_x == nb_x - 1) curr_b_nqx = nqx - b_nqx * ib_x;
      curr_b_nqy = b_nqy;
      for(unsigned int ib_y = 0; ib_y < nb_y; ++ ib_y) {
        if(ib_y == nb_y - 1) curr_b_nqy = nqy - b_nqy * ib_y;
        curr_b_nqz = b_nqz;
        for(unsigned int ib_z = 0; ib_z < nb_z; ++ ib_z) {
          if(ib_z == nb_z - 1) curr_b_nqz = nqz - b_nqz * ib_z;
          curr_b_num_triangles = b_num_triangles;
          for(unsigned int ib_t = 0; ib_t < nb_t; ++ ib_t) {
            if(ib_t == nb_t - 1)
              curr_b_num_triangles = num_triangles - b_num_triangles * ib_t;

            #ifdef PROFILE_PAPI
              // PAPI_L1_DCM  0x80000000  No   Level 1 data cache misses
              // PAPI_L1_ICM  0x80000001  No   Level 1 instruction cache misses
              // PAPI_L2_DCM  0x80000002  No   Level 2 data cache misses
              // PAPI_L2_ICM  0x80000003  No   Level 2 instruction cache misses
              // PAPI_L1_TCM  0x80000006  Yes  Level 1 cache misses
              // PAPI_L2_TCM  0x80000007  No   Level 2 cache misses
              // PAPI_FPU_IDL 0x80000012  No   Cycles floating point units are idle
              // PAPI_TLB_DM  0x80000014  No   Data translation lookaside buffer misses
              // PAPI_TLB_IM  0x80000015  No   Instruction translation lookaside buffer misses
              // PAPI_TLB_TL  0x80000016  Yes  Total translation lookaside buffer misses
              // PAPI_STL_ICY 0x80000025  No   Cycles with no instruction issue
              // PAPI_HW_INT  0x80000029  No   Hardware interrupts
              // PAPI_BR_TKN  0x8000002c  No   Conditional branch instructions taken
              // PAPI_BR_MSP  0x8000002e  No   Conditional branch instructions mispredicted
              // PAPI_TOT_INS 0x80000032  No   Instructions completed
              // PAPI_FP_INS  0x80000034  No   Floating point instructions
              // PAPI_BR_INS  0x80000037  No   Branch instructions
              // PAPI_VEC_INS 0x80000038  No   Vector/SIMD instructions (could include integer)
              // PAPI_RES_STL 0x80000039  No   Cycles stalled on any resource
              // PAPI_TOT_CYC 0x8000003b  No   Total cycles
              // PAPI_L1_DCH  0x8000003e  Yes  Level 1 data cache hits
              // PAPI_L2_DCH  0x8000003f  Yes  Level 2 data cache hits
              // PAPI_L1_DCA  0x80000040  No   Level 1 data cache accesses
              // PAPI_L2_DCA  0x80000041  No   Level 2 data cache accesses
              // PAPI_L1_ICH  0x80000049  Yes  Level 1 instruction cache hits
              // PAPI_L2_ICH  0x8000004a  No   Level 2 instruction cache hits
              // PAPI_L1_ICA  0x8000004c  No   Level 1 instruction cache accesses
              // PAPI_L2_ICA  0x8000004d  No   Level 2 instruction cache accesses
              // PAPI_L1_ICR  0x8000004f  No   Level 1 instruction cache reads
              // PAPI_L1_TCH  0x80000055  Yes  Level 1 total cache hits
              // PAPI_L2_TCH  0x80000056  Yes  Level 2 total cache hits
              // PAPI_L1_TCA  0x80000058  Yes  Level 1 total cache accesses
              // PAPI_L2_TCA  0x80000059  No   Level 2 total cache accesses
              // PAPI_FML_INS 0x80000061  No   Floating point multiply instructions
              // PAPI_FAD_INS 0x80000062  No   Floating point add instructions
              //                               (Also includes subtract instructions)
              // PAPI_FDV_INS 0x80000063  No   Floating point divide instructions
              //                               (Counts both divide and square root instructions)
              // PAPI_FSQ_INS 0x80000064  No   Floating point square root instructions
              //                               (Counts both divide and square root instructions)
              // PAPI_FP_OPS  0x80000066  No   Floating point operations
              // PAPI_SP_OPS  0x80000067  No   Floating point operations; optimized to count
              //                               scaled single precision vector operations
              // PAPI_DP_OPS  0x80000068  No   Floating point operations; optimized to count
              //                               scaled double precision vector operations

              int papi_events[3] = { PAPI_TOT_CYC, PAPI_TOT_INS, PAPI_FP_OPS };
              //int papi_events[3] = { PAPI_FML_INS, PAPI_FAD_INS, PAPI_FDV_INS };
              //int papi_events[3] = { PAPI_FP_OPS, PAPI_SP_OPS, PAPI_DP_OPS };
              long long  papi_counter_values[3] = { 0, 0, 0 };
              // remember whether counting actually started, so that no garbage is accumulated
              const bool papi_started = (PAPI_start_counters(papi_events, 3) == PAPI_OK);
            #endif

            // call the main kernel
            #ifndef FF_NUM_CPU_FUSED // DO NOT USE THIS
              form_factor_kernel(qx, qy, qz, shape_def,
                  curr_b_nqx, curr_b_nqy, curr_b_nqz, curr_b_num_triangles,
                  b_nqx, b_nqy, b_nqz, b_num_triangles,
                  ib_x, ib_y, ib_z, ib_t,
                  fq_buffer);
            #else
              if(nqx == 1) {
                form_factor_kernel_fused_nqx1(qx, qy, qz, shape_def,
                //form_factor_kernel_fused_nqx1_unroll4(qx, qy, qz, shape_def,
                    curr_b_nqx, curr_b_nqy, curr_b_nqz, curr_b_num_triangles,
                    b_nqx, b_nqy, b_nqz, b_num_triangles,
                    nqx, nqy, nqz, num_triangles,
                    ib_x, ib_y, ib_z, ib_t,
                    rot,
                    ff);
              } else {
//                #ifdef __SSE3__
//                  if(rank == 0)
//                    std::cout << "uh-oh: no SSE3 version!" << std::endl;
//                #else
                  form_factor_kernel_fused_unroll4(qx, qy, qz, shape_def,
                    curr_b_nqx, curr_b_nqy, curr_b_nqz, curr_b_num_triangles,
                    b_nqx, b_nqy, b_nqz, b_num_triangles,
                    nqx, nqy, nqz, num_triangles,
                    ib_x, ib_y, ib_z, ib_t,
                    rot,
                    ff);
//                #endif // __SSE3__
              } // if-else
            #endif

            #ifndef FF_NUM_CPU_FUSED // DO NOT USE THIS
              // call the reduction kernel
              reduction_kernel(curr_b_nqx, curr_b_nqy, curr_b_nqz,
                  curr_b_num_triangles, blocked_matrix_size,
                  b_nqx, b_nqy, b_nqz, num_triangles,
                  nqx, nqy, nqz,
                  ib_x, ib_y, ib_z, ib_t,
                  fq_buffer, ff);
            #endif

            #ifdef PROFILE_PAPI
              if(papi_started && PAPI_stop_counters(papi_counter_values, 3) == PAPI_OK) {
                papi_total_cycles += papi_counter_values[0];
                papi_total_inst += papi_counter_values[1];
                papi_total_flop += papi_counter_values[2];
              } // if
            #endif
          } // for ib_t
        } // for ib_z
      } // for ib_y
    } // for ib_x

    kernel_timer.stop();
    kernel_time = kernel_timer.elapsed_msec();

    #ifndef FF_NUM_CPU_FUSED
      delete[] fq_buffer;
    #endif

    if(rank == 0) std::cout << "done." << std::endl;

    #ifdef PROFILE_PAPI
      if(rank == 0) {
        std::cout << "++                  PAPI_TOT_CYC: " << papi_total_cycles << std::endl;
        std::cout << "++                  PAPI_TOT_INS: " << papi_total_inst << std::endl;
        std::cout << "++                   PAPI_FP_OPS: " << papi_total_flop << std::endl;
        std::cout << "++                           IPC: "
              << (double) papi_total_inst / papi_total_cycles << std::endl;
      } // if
    #endif

    return num_triangles;
  } // NumericFormFactorC::compute_form_factor()
Beispiel #27
0
/*
 * Benchmark driver: fills two n x n matrices, multiplies them with
 * block_prod() and reports PAPI hardware-counter values together with the
 * elapsed wall-clock time.
 *
 * Usage: <program> n
 *   argv[1] (n) - requested matrix dimension.  It is replaced by n1*n1 with
 *                 n1 = floor(sqrt(n)), so the dimension actually used is the
 *                 largest perfect square not exceeding the request.
 *
 * Output: one tab-separated line
 *   n (as used), PAPI_TOT_INS, PAPI_FP_OPS, PAPI_L1_DCM, elapsed milliseconds
 *
 * Returns 0 on success and 1 on a missing or negative size argument.  All
 * PAPI failures are reported through ehandler().
 */
int main(int argc, char *argv[])
{
    /* Compile-time constant so that values[] is an ordinary array. */
    enum { EVENT_COUNT = 3 };

    float **a,**b,**c;
    int n,n1,n2;
    int i,j;
    struct timeval t0,t1;
    long mtime, seconds, useconds;
    int events[EVENT_COUNT] = { PAPI_TOT_INS, PAPI_FP_OPS, PAPI_L1_DCM };
    long long values[EVENT_COUNT];

    /* The size comes from the command line; refuse to read a missing arg. */
    if (argc < 2) {
        ehandler("Usage: <program> n");
        return 1;
    }

    // Using PAPI - from countloop.c
    if (PAPI_VER_CURRENT !=
        PAPI_library_init(PAPI_VER_CURRENT))
        ehandler("PAPI_library_init error.");

    /* Call kept from the original code; its result was never used there. */
    (void) PAPI_num_counters();

    /* Make sure every event we want is countable on this machine. */
    if (PAPI_OK != PAPI_query_event(PAPI_TOT_INS))
        ehandler("Cannot count PAPI_TOT_INS.");

    if (PAPI_OK != PAPI_query_event(PAPI_FP_OPS))
        ehandler("Cannot count PAPI_FP_OPS.");

    if (PAPI_OK != PAPI_query_event(PAPI_L1_DCM))
        ehandler("Cannot count PAPI_L1_DCM.");

    // Take size from args, not prompt
    n = atoi(argv[1]);
    if (n < 0) {
        /* sqrt() of a negative value is NaN and cannot be converted to int. */
        ehandler("n must not be negative.");
        return 1;
    }

    // To conform to the other matrix functions
    n1 = (int) floor(sqrt((double) n));
    n2 = n1;
    n = n1*n2;

    /* Matrices are indexed from 1 to n in both dimensions. */
    a = matrix(1,n,1,n);
    for (i=1;i<=n;i++)
        for (j=1;j<=n;j++)
            a[i][j] = i+j;

    b = matrix(1,n,1,n);
    for (i=1;i<=n;i++)
        for (j=1;j<=n;j++)
            b[i][j] = i-j;

    gettimeofday(&t0, NULL);

    // Start PAPI
    if (PAPI_OK != PAPI_start_counters(events, EVENT_COUNT))
        ehandler("Problem starting counters.");

    /* First read happens right before the measured call ... */
    if (PAPI_OK != PAPI_read_counters(values, EVENT_COUNT))
        ehandler("Problem reading counters.");

    c = block_prod(n1,n1,n1,n2,n2,n2,a,b);

    /* ... second read right after it; these are the values printed below. */
    if (PAPI_OK != PAPI_read_counters(values, EVENT_COUNT))
        ehandler("Problem reading counters.");

    gettimeofday(&t1, NULL);
    seconds = t1.tv_sec - t0.tv_sec;
    useconds = t1.tv_usec - t0.tv_usec;
    /* Elapsed time in milliseconds, rounded to nearest. */
    mtime = ((seconds) * 1000 + useconds/1000.0) + 0.5;

    printf("%d\t%lld\t%lld\t%lld\t%ld\n", n, values[0], values[1],
        values[2], mtime);

    return 0;
}
Beispiel #28
0
/**
 * Calibration run: prepares an input matrix, then starts and immediately
 * stops the measurement without doing any work in between.
 *
 * Matrix setup (the result A is only built and freed, never measured):
 *  - p->r != 0: A = L * U, where L is random below the diagonal with a unit
 *    diagonal and zeros to the right of it, and U is random above the
 *    diagonal with zeros to the left of it; rows i < p->r of U get a 1 on
 *    the diagonal, rows i >= p->r are cleared from the diagonal onward.
 *  - p->r == 0: A is filled with random bits.
 *
 * Without HAVE_LIBPAPI: *data_len is set to 2, data[0] receives the
 * walltime() result and data[1] the cpucycles() difference.
 *
 * With HAVE_LIBPAPI: *data_len is clamped to papi_array_len + 1, data[0]
 * receives the elapsed PAPI virtual microseconds, data[1..] the counter
 * values, and loop_calibration[] is lowered to any smaller value seen.
 *
 * @param _p        pointer to a struct elim_params (fields m, n, r are read)
 * @param data      output array for the measurements
 * @param data_len  in: capacity of data (PAPI build); out: entries written
 * @return always 0; PAPI start/stop failures end in m4ri_die()
 */
int run_nothing(void *_p, unsigned long long *data, int *data_len) {
  struct elim_params *p = (struct elim_params *)_p;

  mzd_t *A = mzd_init(p->m, p->n);

  if(p->r != 0) {
    mzd_t *L, *U;
    L = mzd_init(p->m, p->m);
    U = mzd_init(p->m, p->n);
    mzd_randomize(U);
    mzd_randomize(L);
    for (rci_t i = 0; i < p->m; ++i) {

      /* L: clear everything right of the diagonal, set the diagonal. */
      for (rci_t j = i + 1; j < p->m; j+=m4ri_radix) {
        int const length = MIN(m4ri_radix, p->m - j);
        mzd_clear_bits(L, i, j, length);
      }
      mzd_write_bit(L,i,i, 1);

      /* U: clear everything left of the diagonal. */
      for (rci_t j = 0; j < i && j <p->n; j+=m4ri_radix) {
        int const length = MIN(m4ri_radix, i - j);
        mzd_clear_bits(U, i, j, length);
      }
      if(i < p->r) {
        mzd_write_bit(U, i, i, 1);
      } else {
        /* Rows at or beyond r are cleared from the diagonal onward too. */
        for (rci_t j = i; j < p->n; j+=m4ri_radix) {
          int const length = MIN(m4ri_radix, p->n - j);
          mzd_clear_bits(U, i, j, length);
        }
      }
    }
    mzd_mul(A,L,U,0);
    mzd_free(L);
    mzd_free(U);
  } else {
    mzd_randomize(A);
  }

#ifndef HAVE_LIBPAPI
  *data_len = 2;

  /* Start and stop back to back: only the measurement overhead is seen. */
  data[0] = walltime(0);
  data[1] = cpucycles();

  data[1] = cpucycles() - data[1];
  data[0] = walltime(data[0]);
#else
  *data_len = MIN(papi_array_len + 1, *data_len);

  /* Slot 0 holds the time, so one fewer counter than data entries. */
  int array_len = *data_len - 1;
  unsigned long long t0 = PAPI_get_virt_usec();
  int papi_res = PAPI_start_counters((int*)papi_events, array_len);
  if(papi_res)
    m4ri_die("run_nothing: PAPI_start_counters failed\n");

  papi_res = PAPI_stop_counters((long long*)&data[1], array_len);
  t0 = PAPI_get_virt_usec() - t0;
  /* Do not feed unread counter slots into the calibration minima. */
  if(papi_res)
    m4ri_die("run_nothing: PAPI_stop_counters failed\n");
  data[0] = t0;
  for (int nv = 0; nv <= array_len; ++nv) {
    if (data[nv] < loop_calibration[nv])
      loop_calibration[nv] = data[nv];
  }
#endif

  mzd_free(A);

  return (0);
}
Beispiel #29
0
void M3_profile( int sectionID, const char *sectionName, int operationFlag )
{
  static char *staticTitleString = NULL;
  static char **staticProfileName = NULL;
  static int64_t *staticNumCalls = NULL;
  static double *staticTotalTime = NULL;
  static double *staticStartTime = NULL;
#ifdef USE_PAPI
  static int64_t *staticFlopCount = NULL;
  static int64_t *staticFlipCount = NULL;
  static int64_t *staticFlopCounter = NULL;
  static int64_t *staticFlipCounter = NULL;
#endif
  static double staticInitTime = 0;
  static char staticInitDate[256]={0};
  static int staticProfileLevel = -1;

#ifdef USE_PAPI
#define M3_NUM_PAPI_EVENTS 2
  int papiEvents[M3_NUM_PAPI_EVENTS] = {PAPI_FP_OPS, PAPI_FP_INS};
  static long long int papiCounters[M3_NUM_PAPI_EVENTS] = {0};
#endif
  double finalTime;
  int64_t *agInt64 = NULL;
  double *agDouble = NULL;
  int64_t i, j;
  long int k;
  int myRank = -1;
  int numProc = 1;
  FILE *outFile;  
  char *tempPtr, fileName[256], tempString[256];
  char myHostname[256] = {0};
  double mpiTic;
  double mpiToc;
  struct timeval tic;
  struct timezone tz;
  time_t tt;
  long int pid;
  char pcontrolID[16] = {0};

  if( staticProfileLevel == -1 )
  {
    /* Look for environment variable.  */
    tempPtr = getenv("M3_PROFILE_LEVEL");
    if( tempPtr )
      staticProfileLevel = atoi( tempPtr );
    else
      staticProfileLevel = M3_PROFILE_LEVEL;
  }

  if( staticProfileLevel == 0 )
    return;



#ifdef USE_MPI
  MPI_Comm_rank(MPI_COMM_WORLD, &myRank );
  MPI_Comm_size(MPI_COMM_WORLD, &numProc );
#endif 

  sprintf(fileName, "M3_Profile():  profile ID out of range, must be between 0 and %i", M3_PROFILE_MAX_SECTIONS - 1);
  assert(sectionID >= 0 && sectionID < M3_PROFILE_MAX_SECTIONS);

  switch( operationFlag )
  {
    case M3_PROFILE_INIT:
      assert(staticProfileName == NULL &&
	     staticNumCalls == NULL && 
	     staticTotalTime == NULL && 
	     staticStartTime == NULL);
      if( sectionName && strlen(sectionName) )
      {
        staticTitleString = (char *)calloc( 4*(strlen(sectionName)/4 +1 ), sizeof(char) );
        assert(staticTitleString != NULL);
        strcpy(staticTitleString, sectionName );
      }

      staticProfileName = (char **)calloc( M3_PROFILE_MAX_SECTIONS, sizeof(char*) );
      staticNumCalls = (int64_t *)calloc( M3_PROFILE_MAX_SECTIONS, sizeof(int64_t) );
      staticTotalTime = (double *)calloc( M3_PROFILE_MAX_SECTIONS, sizeof(double) );
      staticStartTime = (double *)calloc( M3_PROFILE_MAX_SECTIONS, sizeof(double) );
      assert(staticProfileName && staticNumCalls && staticTotalTime && staticStartTime);
#ifdef USE_PAPI
      staticFlopCount = (int64_t *)calloc( M3_PROFILE_MAX_SECTIONS, sizeof(int64_t));
      staticFlipCount = (int64_t *)calloc( M3_PROFILE_MAX_SECTIONS, sizeof(int64_t));
      staticFlopCounter = (int64_t *)calloc( M3_PROFILE_MAX_SECTIONS, sizeof(int64_t));
      staticFlipCounter = (int64_t *)calloc( M3_PROFILE_MAX_SECTIONS, sizeof(int64_t));
      assert(staticFlopCount && staticFlipCount);
      assert(staticFlopCounter && staticFlipCounter);
#endif

      gettimeofday(&tic, &tz);
#ifdef USE_MPI
      staticInitTime = MPI_Wtime( );
#else
      staticInitTime = tic.tv_sec + tic.tv_usec*1e-6;
#endif
      tt = tic.tv_sec;
      ctime_r(&tt, staticInitDate );

#ifdef USE_PAPI      
      PAPI_start_counters(papiEvents, M3_NUM_PAPI_EVENTS);
#endif

#ifdef USE_MPI
      if (myRank == 0) {
	mkdir( "m3_profile", S_IRWXU );
      }
#else
      mkdir( "m3_profile", S_IRWXU );
#endif

      break;
    case M3_PROFILE_FINALIZE:
      /* Check to see if it was initialized */
      if( staticProfileName == NULL ||
          staticNumCalls == NULL || 
          staticTotalTime == NULL )
      {
	/*        fprintf(stderr, "WARNING:  M3_Profile, finalized without initializing\n");  */
        break;
      }

      myHostname[255] = 0;
      gethostname(myHostname, 255);
      pid = (long int)getpid();  

      for( j = 0; j < 2; j++ )
      {
#ifdef USE_MPI
        if( j == 1 )
	{
          /* Get aggregate statistics */
          if( myRank == 0 )
	  {
            agInt64 = (int64_t*)calloc(M3_PROFILE_MAX_SECTIONS, sizeof(int64_t));
            agDouble = (double*)calloc(M3_PROFILE_MAX_SECTIONS, sizeof(double));
            assert( agInt64 && agDouble );
          }
          MPI_Reduce( staticNumCalls, agInt64, M3_PROFILE_MAX_SECTIONS, MPI_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD );
          if( myRank == 0 )
            memcpy( staticNumCalls, agInt64 , sizeof(int64_t)*M3_PROFILE_MAX_SECTIONS );
          
          MPI_Reduce( staticTotalTime, agDouble, M3_PROFILE_MAX_SECTIONS, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD );
          if( myRank == 0 )
            memcpy( staticTotalTime, agDouble , sizeof(double)*M3_PROFILE_MAX_SECTIONS );
#ifdef USE_PAPI
          MPI_Reduce( staticFlopCount, agInt64, M3_PROFILE_MAX_SECTIONS, MPI_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD );
          if( myRank == 0 )
            memcpy( staticFlopCount, agInt64, sizeof(int64_t)*M3_PROFILE_MAX_SECTIONS );
          MPI_Reduce( staticFlipCount, agInt64, M3_PROFILE_MAX_SECTIONS, MPI_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD );
          if( myRank == 0 )
            memcpy( staticFlipCount, agInt64, sizeof(int64_t)*M3_PROFILE_MAX_SECTIONS );
#endif
          if( myRank == 0 )
	  {
            free(agInt64);
            free(agDouble);
          }
          else
            break;
        }
#else
        /* If not using mpi, don't need to collect aggregate statistics */
        if( j == 1 )
          break;
#endif

        k = 60*lrint(staticInitTime/60);

        /* m3_profile_title_date.proc */
        if( staticTitleString )
        {
          tempPtr = strchr( staticTitleString, ' ');
          if(tempPtr)
            *tempPtr = '\0';

          sprintf( fileName, "m3_profile/m3_profile_%s_%li_%s_%li", staticTitleString, k, myHostname, pid);

          if(tempPtr)
            *tempPtr = ' ';
        }
        else
	{
          sprintf( fileName, "m3_profile/m3_profile_%li", k );          
        }

#ifdef USE_MPI
        if( j == 0 ) 
          sprintf( tempString, ".%i", myRank );
        else
          strcpy( tempString, ".all"); 
        
        strcat( fileName, tempString );
#endif

        
        if( ( staticProfileLevel == 2 ) ||
            ( staticProfileLevel == 1 && j == 0 && numProc == 1 ) || 
            ( staticProfileLevel == 1 && j == 1 ) )
	{
          /* Open the output file.  */ 
          outFile = fopen( fileName, "w");
          assert(outFile != NULL);

          /* Write a title */
          if( staticTitleString )
            fprintf(outFile, "M3_Profile:  %s\n\n", staticTitleString );
          else
            fprintf(outFile, "M3_Profile\n\n" );
          

          /* Write the init date, and the run time.  */
#ifdef USE_MPI
          fprintf(outFile, "Number of processors:  %i\n", numProc );
          finalTime = MPI_Wtime();
#else
          gettimeofday(&tic, &tz );
          finalTime = tic.tv_sec + tic.tv_usec*1e-6;
#endif
          fprintf( outFile, "Start date %s\n", staticInitDate );
          fprintf( outFile, "Run time in seconds:  %e\n\n", finalTime - staticInitTime );

          if( j == 1 )
            fprintf(outFile, "Aggregate statistics\n\n");

          for( i = 0; i < M3_PROFILE_MAX_SECTIONS; i++ )
          {
            if( staticNumCalls[i] )
            {
              fprintf(outFile, "-----------------------------\n");
              fprintf(outFile, "    Profile ID number:  %lli\n", i);
              if( staticProfileName[i] )
                fprintf(outFile, "    %s\n", staticProfileName[i] );
              fprintf(outFile, "      Total number of calls:          %lli\n", staticNumCalls[i]);
              fprintf(outFile, "      Total time (seconds):           %e\n", staticTotalTime[i]);
              fprintf(outFile, "      Mean time per call (seconds):   %e\n", staticTotalTime[i]/staticNumCalls[i]);
              fprintf(outFile, "      Mean time per task (seconds):   %e\n", staticTotalTime[i]/numProc );
              fprintf(outFile, "      Percent of wall clock           %.6f %%\n", staticTotalTime[i]/numProc/(finalTime - staticInitTime)*100 );
#ifdef USE_PAPI
              fprintf(outFile, "      Flop count:                     %lli\n", staticFlopCount[i]);
              fprintf(outFile, "      Flop rate:                      %.6e\n", staticFlopCount[i]/staticTotalTime[i]);
              fprintf(outFile, "      Flip count:                     %lli\n", staticFlipCount[i]);
              fprintf(outFile, "      Flip rate:                      %.6e\n", staticFlipCount[i]/staticTotalTime[i]);
#endif
              fprintf(outFile, "\n\n");
            }
          }

          fclose(outFile);
        }
      }


      /* Free up static memory */
      if( staticTitleString )
      {
        free(staticTitleString);
        staticTitleString = NULL;
      }

      if( staticProfileName )
      {
        for( i = 0; i < M3_PROFILE_MAX_SECTIONS; i++ )
          if( staticProfileName[i] )
            free( staticProfileName[i] );
        free(staticProfileName);
        staticProfileName = NULL;
      }

      if( staticNumCalls )
      {
        free( staticNumCalls );
        staticNumCalls = NULL;
      }
      
      if( staticTotalTime )
      {
        free( staticTotalTime );
        staticTotalTime = NULL;
      }

      if( staticStartTime )
      {
        free(staticStartTime );
        staticStartTime = NULL;
      }
#ifdef USE_PAPI
      if( staticFlopCount );
      {
        free(staticFlopCount);
        staticFlopCount = NULL;
      }
      if( staticFlipCount );
      {
        free(staticFlipCount);
        staticFlipCount = NULL;
      }
#endif

      break;
    case M3_PROFILE_START:
      if( staticProfileName == NULL ||
          staticNumCalls == NULL || 
          staticTotalTime == NULL )
      {
	/*        fprintf(stderr, "WARNING:  M3_Profile, called without initializing\n");  */
        break;
      }
      if( staticProfileName[sectionID] == NULL )
      {
        staticProfileName[sectionID] = (char*)calloc(4*(strlen(sectionName)/4 + 1), sizeof(char));
        assert(staticProfileName[sectionID] != NULL);
        strcpy(staticProfileName[sectionID], sectionName);
      }
#ifdef USE_MPI
#ifndef USE_PAPI
      sprintf( pcontrolID, "%i", sectionID);
      MPI_Pcontrol( 1, pcontrolID );
#endif
#endif
#ifdef USE_MPI
      staticStartTime[sectionID] = MPI_Wtime();
#else
      gettimeofday(&tic, &tz);
      staticStartTime[sectionID] = tic.tv_sec + tic.tv_usec*1e-6;
#endif
#ifdef USE_PAPI
      PAPI_accum_counters(papiCounters, M3_NUM_PAPI_EVENTS );
      staticFlopCounter[sectionID] = papiCounters[0];
      staticFlipCounter[sectionID] = papiCounters[1];
#endif
      break;
    case M3_PROFILE_STOP:
      if( staticProfileName == NULL ||
          staticNumCalls == NULL || 
          staticTotalTime == NULL )
      {
	/*        fprintf(stderr, "WARNING:  M3_Profile, called without initializing\n");  */
        break;
      }
#ifdef USE_MPI
#ifndef USE_PAPI
      sprintf( pcontrolID, "%i", sectionID);
      MPI_Pcontrol( -1, pcontrolID );
#endif
#endif
      staticNumCalls[sectionID]++;
#ifdef USE_MPI
      staticTotalTime[sectionID] += MPI_Wtime() - staticStartTime[sectionID];
#else
      gettimeofday(&tic, &tz);
      staticTotalTime[sectionID] += (tic.tv_sec + tic.tv_usec*1e-6) - staticStartTime[sectionID];
#endif
#ifdef USE_PAPI
      PAPI_accum_counters(papiCounters, M3_NUM_PAPI_EVENTS );
      staticFlopCount[sectionID] += papiCounters[0] - staticFlopCounter[sectionID];
      staticFlipCount[sectionID] += papiCounters[1] - staticFlipCounter[sectionID];
#endif
      break;
  }
}
Beispiel #30
0
/*
 * Benchmark body: build an m x n matrix A, run the elimination algorithm
 * selected by p->algorithm on it and store the measurements in data[].
 *
 *   _p        pointer to a struct elim_params (fields used here: m, n, r,
 *             algorithm).  p->r is overwritten with the value returned by
 *             the elimination routine.
 *   data      output array.  Without HAVE_LIBPAPI: data[0] is the result of
 *             walltime() and data[1] the cpucycles() difference.  With
 *             HAVE_LIBPAPI: data[0] is the PAPI virtual-time difference in
 *             usec and data[1..] the PAPI counter values, each reduced by
 *             loop_calibration[].
 *   data_len  in/out: number of entries of data[] that are used.  Set to 2
 *             without PAPI, otherwise clamped to papi_array_len + 1.
 *
 * Always returns 0; fatal conditions (unknown algorithm, PAPI failure) go
 * through m4ri_die().
 */
int run(void *_p, unsigned long long *data, int *data_len) {
  struct elim_params *p = (struct elim_params *)_p;
#ifndef HAVE_LIBPAPI
  *data_len = 2;
#else
  *data_len = MIN(papi_array_len + 1, *data_len);
#endif

  mzd_t *A = mzd_init(p->m, p->n);

  if(p->r != 0) {
    /* Build A = L * U from a random lower-triangular L with unit diagonal
     * and a random upper-triangular U whose rows with index >= p->r are
     * zeroed (presumably so that A has the requested rank -- confirm). */
    mzd_t *L = mzd_init(p->m, p->m);
    mzd_t *U = mzd_init(p->m, p->n);
    mzd_randomize(U);
    mzd_randomize(L);
    for (rci_t i = 0; i < p->m; ++i) {

      /* L: clear everything right of the diagonal, at most m4ri_radix bits
       * per call, then force the diagonal entry to 1. */
      for (rci_t j = i + 1; j < p->m; j += m4ri_radix) {
        int const length = MIN(m4ri_radix, p->m - j);
        mzd_clear_bits(L, i, j, length);
      }
      mzd_write_bit(L, i, i, 1);

      /* U: clear everything left of the diagonal. */
      for (rci_t j = 0; j < i && j < p->n; j += m4ri_radix) {
        int const length = MIN(m4ri_radix, i - j);
        mzd_clear_bits(U, i, j, length);
      }
      if(i < p->r) {
        mzd_write_bit(U, i, i, 1);
      } else {
        /* Rows at index >= r: clear the rest of the row as well.  The chunk
         * length must be measured from the current column j (not from i),
         * otherwise later chunks would reach past column n. */
        for (rci_t j = i; j < p->n; j += m4ri_radix) {
          int const length = MIN(m4ri_radix, p->n - j);
          mzd_clear_bits(U, i, j, length);
        }
      }
    }
    mzd_mul(A, L, U, 0);
    mzd_free(L);
    mzd_free(U);
  } else {
    mzd_randomize(A);
  }

  mzp_t *P = mzp_init(A->nrows);
  mzp_t *Q = mzp_init(A->ncols);

  /* ---- start of measured region ---- */
#ifndef HAVE_LIBPAPI
  data[0] = walltime(0);
  data[1] = cpucycles();
#else
  int array_len = *data_len - 1;
  unsigned long long t0 = PAPI_get_virt_usec();
  int papi_res = PAPI_start_counters((int*)papi_events, array_len);
  if (papi_res)
    m4ri_die("PAPI_start_counters failed with error %d\n", papi_res);
#endif
  if(strcmp(p->algorithm, "m4ri") == 0)
    p->r = mzd_echelonize_m4ri(A, 0, 0);
  else if(strcmp(p->algorithm, "ple") == 0)
    p->r = mzd_ple(A, P, Q, 0);
  else if(strcmp(p->algorithm, "mmpf") == 0)
    p->r = _mzd_ple_russian(A, P, Q, 0);
  else
    m4ri_die("unknown algorithm %s",p->algorithm);
#ifndef HAVE_LIBPAPI
  data[1] = cpucycles() - data[1];
  data[0] = walltime(data[0]);
#else
  papi_res = PAPI_stop_counters((long long*)&data[1], array_len);
  if (papi_res)
    m4ri_die("PAPI_stop_counters failed with error %d\n", papi_res);
  t0 = PAPI_get_virt_usec() - t0;
  data[0] = t0;
  /* Subtract the calibration values from the time (index 0) and from every
   * counter (indices 1..array_len). */
  for (int nv = 0; nv <= array_len; ++nv) {
    data[nv] -= loop_calibration[nv];
  }
#endif
  /* ---- end of measured region ---- */

  /* Release the permutations in every build configuration (previously they
   * were only freed when HAVE_LIBPAPI was defined) and outside the measured
   * region. */
  mzp_free(P);
  mzp_free(Q);
  mzd_free(A);
  return 0;
}