// // This method should be placed at the start of instrumented code // void startPapiCounters(){ initializeCounters(0); #ifdef DBG printGEvents(); printf("********* STARTING COUNTERS *************\n"); //assert(NUM_EVENTS == _G_EVENT_COUNT); #endif // initialize papi library and assert that it's successful _CALL_PAPI(PAPI_library_init( PAPI_VER_CURRENT )); // check that all the events can be counted at once. int numCounters = PAPI_num_counters() ; assert( NUM_EVENTS <= numCounters ); #ifdef DBG printf("Number of hardware counters available on this machine: %d", numCounters); #endif for ( int i = 0; i < NUM_EVENTS; i++ ) { char name[PAPI_MAX_STR_LEN]; (void) _CALL_PAPI(PAPI_event_code_to_name( _G_EVENTS[i], name )); if(PAPI_query_event( _G_EVENTS[i] ) < PAPI_OK) { fprintf(stderr, "Event %s could not be counted on this machine.\n", name); abort(); } } //******* Start Counters ****** (void) _CALL_PAPI(PAPI_start_counters(_G_EVENTS, NUM_EVENTS)); }
int main(int argc, char **argv) { int m = atoi(argv[1]); int k = atoi(argv[2]); int n = atoi(argv[3]); float *A = (float*) malloc(m * k * sizeof(float)); float *B = (float*) malloc(k * n * sizeof(float)); float *C = (float*) malloc(m * n * sizeof(float)); int Events[] = {PAPI_FP_INS, PAPI_TOT_CYC}; long_long values[2]; #define NUM_EVENTS 2 initialize(m, k, n, A, B, C); /* Start counting events */ if (PAPI_start_counters(Events, NUM_EVENTS) != PAPI_OK) exit(10); multiply(m, k, n, A, B, C); /* Stop counting events */ if (PAPI_stop_counters(values, NUM_EVENTS) != PAPI_OK) exit(10); printf("Counter values: %ld, %ld\n", values[0], values[1]); free(A); free(B); free(C); return 0; }
int main(int argc, char** argv) { int Events[NUM_EVENTS]; const char* names[NUM_EVENTS] = {"OPEN_CALLS", "OPEN_FDS", "READ_CALLS", "READ_BYTES", "READ_USEC", "READ_ERR", "READ_INTERRUPTED", "READ_WOULD_BLOCK", "WRITE_CALLS","WRITE_BYTES","WRITE_USEC", "WRITE_WOULD_BLOCK"}; long long values[NUM_EVENTS]; /* Set TESTS_QUIET variable */ tests_quiet( argc, argv ); int version = PAPI_library_init (PAPI_VER_CURRENT); if (version != PAPI_VER_CURRENT) { fprintf(stderr, "PAPI_library_init version mismatch\n"); exit(1); } if (!TESTS_QUIET) fprintf(stderr, "This program will read from stdin and echo it to stdout\n"); int retval; int e; for (e=0; e<NUM_EVENTS; e++) { retval = PAPI_event_name_to_code((char*)names[e], &Events[e]); if (retval != PAPI_OK) { fprintf(stderr, "Error getting code for %s\n", names[e]); exit(2); } } /* Start counting events */ if (PAPI_start_counters(Events, NUM_EVENTS) != PAPI_OK) { fprintf(stderr, "Error in PAPI_start_counters\n"); exit(1); } int bytes = 0; char buf[1024]; //if (PAPI_read_counters(values, NUM_EVENTS) != PAPI_OK) // handle_error(1); //printf("After reading the counters: %lld\n",values[0]); while ((bytes = read(0, buf, 1024)) > 0) { write(1, buf, bytes); } /* Stop counting events */ if (PAPI_stop_counters(values, NUM_EVENTS) != PAPI_OK) { fprintf(stderr, "Error in PAPI_stop_counters\n"); } if (!TESTS_QUIET) { printf("----\n"); for (e=0; e<NUM_EVENTS; e++) printf("%s: %lld\n", names[e], values[e]); } test_pass( __FILE__, NULL, 0 ); return 0; }
void papi_base::start( ) { std::vector<int> eventsMutable( counters_.data(), counters_.data() + counters_.size() ); int retval = PAPI_start_counters( &eventsMutable[ 0 ], counters_.size() ); if (retval == PAPI_OK) { papi_started_ = true; } else { std::cerr << "PAPI error " << retval << ": " << PAPI_strerror( retval ) << std::endl; papi_started_ = false; } }
int main(int argc, char **argv) { int retval; retval = PAPI_library_init(PAPI_VER_CURRENT); if (retval != PAPI_VER_CURRENT) { fprintf(stderr,"Error! PAPI_library_init %d\n", retval); } retval = PAPI_query_event(PAPI_TOT_INS); if (retval != PAPI_OK) { fprintf(stderr,"PAPI_TOT_INS not supported\n"); exit(1); } int i; int events[1],result; long long counts[1]; long long total=0,average,max=0,min=0x7ffffffffffffffULL; events[0]=PAPI_TOT_INS; PAPI_start_counters(events,1); for(i=0;i<NUM_RUNS;i++) { result=instructions_million(); PAPI_read_counters(counts,1); results[i]=counts[0]; } PAPI_stop_counters(counts,1); PAPI_shutdown(); for(i=0;i<NUM_RUNS;i++) { total+=results[i]; if (results[i]>max) max=results[i]; if (results[i]<min) min=results[i]; } average=total/NUM_RUNS; printf("Average=%lld max=%lld min=%lld\n",average,max,min); (void) result; return 0; }
/*
 * Allocates the file-level `values` buffer (one long long per event) and
 * starts the PAPI counters for the given events.
 *
 * events     - array of PAPI event codes
 * NUM_EVENTS - number of entries in `events`; must be positive
 *
 * Terminates the process with status 1 on invalid arguments, allocation
 * failure, or when PAPI refuses to start the counters.
 */
void my_papi_start(int *events, int NUM_EVENTS)
{
    int ret;

    if (events == NULL || NUM_EVENTS <= 0) {
        fprintf(stderr, "PAPI failed to start counters: invalid event list\n");
        exit(1);
    }

    values = (long long *)malloc(sizeof(long long) * (size_t)NUM_EVENTS);
    if (values == NULL) {
        fprintf(stderr, "PAPI failed to start counters: out of memory\n");
        exit(1);
    }

    /* Start counting events */
    if ((ret = PAPI_start_counters(events, NUM_EVENTS)) != PAPI_OK) {
        fprintf(stderr, "PAPI failed to start counters: %s\n", PAPI_strerror(ret));
        // fprintf(stderr, "PAPI_start_counters - FAILED\n");
        exit(1);
    }
}
int main (int argc, char *argv[]) { int i, count; int *array = (int*) malloc (SIZE * sizeof(int)); uint64_t start, end; int events[3] = { PAPI_L1_DCM, PAPI_L2_DCM, PAPI_L3_DCM }; long long misses[3]; int papilevels = 3; if (PAPI_library_init(PAPI_VER_CURRENT) != PAPI_VER_CURRENT) { exit(1); } //Initialization count = 0; srand(time(NULL)); init_time(); for (i = 0; i < SIZE; i++) array[i] = rand(); // Uncomment this line to qsort(array, SIZE, sizeof (int), compare_ints); //Measurement while (PAPI_start_counters(events, papilevels) != PAPI_OK) { papilevels--; } start = get_time(); /* * É possível, em um vetor ordenado, fazer a contagem * em tempo O(lg(n)) em vez de O(n) utilizando busca * binária. Embora isto diminua sensivelmente o tempo * este não é o intuito deste exercício. * */ for (i = 0; i < SIZE; i++) if (array[i] < RAND_MAX / 2) count++; end = get_time(); uint64_t exec_time = diff_time(start, end); if (PAPI_read_counters(misses, papilevels) != PAPI_OK) { fprintf(stderr, "Erro em PAPI_read_counters\n"); exit(1); } printf("Time: %" PRIu64 " Count %d\n", exec_time, count); for (i = 0; i < papilevels; i++) { printf("Cache misses (L%d): %lld\n", i+1, misses[i]); } free(array); return 0; }
/*
 * Fills the benchmark description in `infostruct` and starts the PAPI
 * counters named in sCounters.
 *
 * Function 0 is the average access time; functions 1..NUM_COUNTERS are the
 * PAPI counters, whose legend texts point at the sCounters entries.
 *
 * Terminates the process with status 127 on allocation failure or on any
 * PAPI error.
 */
void bi_getinfo(bi_info* infostruct){
   int i;
   int events[10];
   const int max_events = (int)(sizeof(events) / sizeof(events[0]));

   init_global_vars();

   /* events[] has a fixed capacity; refuse to write past it. */
   if (NUM_COUNTERS > max_events) {
      printf("PAPI error: too many counters requested\n");
      exit(127);
   }

   /*infostruct->kernelstring=bi_strdup("Random Memory Access");*/
   infostruct->codesequence=bi_strdup("for i=1,N#  var=memory[random(0..size)]#");
   infostruct->xaxistext=bi_strdup("Accessed Memory in Byte");
   infostruct->numfunctions= 1+ NUM_COUNTERS;
   infostruct->maxproblemsize=nMeasurements;
   infostruct->log_xaxis=1;
   infostruct->base_xaxis=2.0;

   /* Allocate all per-function arrays and check them before first use. */
   infostruct->outlier_direction_upwards=malloc(infostruct->numfunctions*sizeof(int));
   infostruct->log_yaxis=malloc(infostruct->numfunctions*sizeof(int));
   infostruct->base_yaxis=malloc(infostruct->numfunctions*sizeof(double));
   infostruct->legendtexts=malloc(infostruct->numfunctions*sizeof(char*));
   infostruct->yaxistexts=malloc(infostruct->numfunctions*sizeof(char*));
   if (infostruct->outlier_direction_upwards==0 || infostruct->log_yaxis==0 ||
       infostruct->base_yaxis==0 || infostruct->legendtexts==0 ||
       infostruct->yaxistexts==0){
      printf("No more core\n");
      exit(127);
   }

   /* Give every function a defined setting, not only function 0. */
   for (i=0; i< infostruct->numfunctions; i++) {
      infostruct->outlier_direction_upwards[i]=1;
      infostruct->log_yaxis[i]=0;
      infostruct->base_yaxis[i]=0.0;
   }

   infostruct->legendtexts[0]=bi_strdup("Average Access Time");

   if ( PAPI_library_init(PAPI_VER_CURRENT) != PAPI_VER_CURRENT) {
      printf("PAPI library init error!\n");
      exit(127);
   }

   for (i=0; i< NUM_COUNTERS; i++) {
      if (PAPI_event_name_to_code(sCounters[i],&(events[i])) != PAPI_OK) {
         printf("PAPI error: unknown event %s\n", sCounters[i]);
         exit(127);
      }
      infostruct->legendtexts[i+1] = sCounters[i];
      /* Event codes are ints; print them with a matching conversion. */
      printf("%s -> %x\n", sCounters[i],(unsigned int)events[i]);
   }

   if (PAPI_start_counters(events, NUM_COUNTERS) != PAPI_OK) {
      printf("PAPI error: start_counters");
      exit(127);
   }

   /* Seconds for the timing function, no unit for the raw counters. */
   infostruct->yaxistexts[0]=bi_strdup("s");
   for (i=1; i< infostruct->numfunctions; i++)
      infostruct->yaxistexts[i]=bi_strdup("");
}
/*
 * Opens the result file for writing and starts the two PAPI counters
 * listed in Events.
 *
 * Returns 0 on success and -1 on failure. On failure no file is left open
 * and res_file is NULL.
 */
int test_start()
{
    res_file = fopen(res_file_name, "w");
    if (res_file == NULL) {
        printf("Error opening file %s\n", res_file_name);
        return -1;
    }

    if ( PAPI_start_counters( Events, 2 ) != PAPI_OK ) {
        /* Do not leak the stream when the counters cannot be started. */
        fclose(res_file);
        res_file = NULL;
        return -1;
    }

    return 0;
}
/*
 * Thread entry point: copies the file named by `arg` to /dev/null while
 * PAPI counts the events listed in `names`, then prints the per-thread
 * counter values (unless TESTS_QUIET is set).
 *
 * arg - NUL-terminated path of the file to read
 *
 * Always returns NULL. Terminates the process with status 2 if an event
 * name cannot be translated, or 1 if the counters cannot be started.
 */
void *ThreadIO(void *arg) {
  unsigned long tid = (unsigned long)pthread_self();
  if (!TESTS_QUIET) printf("\nThread %#lx: will read %s and write it to /dev/null\n", tid,(const char*) arg);
  int Events[NUM_EVENTS];
  long long values[NUM_EVENTS];
  int retval;
  int e;
  for (e=0; e<NUM_EVENTS; e++) {
    retval = PAPI_event_name_to_code((char*)names[e], &Events[e]);
    if (retval != PAPI_OK) {
      fprintf(stderr, "Error getting code for %s\n", names[e]);
      exit(2);
    }
  }

  /* Start counting events */
  if (PAPI_start_counters(Events, NUM_EVENTS) != PAPI_OK) {
    fprintf(stderr, "Error in PAPI_start_counters\n");
    exit(1);
  }

  //if (PAPI_read_counters(values, NUM_EVENTS) != PAPI_OK)
  //   handle_error(1);
  //printf("After reading the counters: %lld\n",values[0]);

  int fdin = open((const char*)arg, O_RDONLY);
  if (fdin < 0) perror("Could not open file for reading: \n");

  int bytes = 0;
  char buf[1024];

  int fdout = open("/dev/null", O_WRONLY);
  if (fdout < 0) perror("Could not open /dev/null for writing: \n");

  /* Only copy when both descriptors are usable. */
  if (fdin >= 0 && fdout >= 0) {
    while ((bytes = read(fdin, buf, 1024)) > 0) {
      if (write(fdout, buf, bytes) < 0) {
        perror("Could not write to /dev/null: \n");
        break;
      }
    }
  }

  /* Release both descriptors; the input one used to be leaked. */
  if (fdout >= 0) close(fdout);
  if (fdin >= 0) close(fdin);

  /* Stop counting events */
  if (PAPI_stop_counters(values, NUM_EVENTS) != PAPI_OK) {
    /* values[] was not filled in, so there is nothing valid to print. */
    fprintf(stderr, "Error in PAPI_stop_counters\n");
    return(NULL);
  }

  if (!TESTS_QUIET) {
    for (e=0; e<NUM_EVENTS; e++)
      printf("Thread %#lx: %s: %lld\n", tid, names[e], values[e]);
  }
  return(NULL);
}
int main(int argc, char *argv[]) { double a[MAXVSIZE], b[MAXVSIZE], c[MAXVSIZE]; int i,n; long long before, after; if (PAPI_VER_CURRENT != PAPI_library_init(PAPI_VER_CURRENT)) ehandler("PAPI_library_init error."); const size_t EVENT_MAX = PAPI_num_counters(); printf("# Max counters = %zd\n", EVENT_MAX); if (PAPI_OK != PAPI_query_event(PAPI_TOT_INS)) ehandler("Cannot count PAPI_TOT_INS."); if (PAPI_OK != PAPI_query_event(PAPI_FP_OPS)) ehandler("Cannot count PAPI_FP_OPS."); size_t EVENT_COUNT = 2; int events[] = { PAPI_TOT_INS, PAPI_FP_OPS }; long long values[EVENT_COUNT]; printf("Enter vector size: "); scanf("%d",&n); for (i=0;i<n;i++) { a[i] = i; b[i] = n-i; } PAPI_start_counters(events, EVENT_COUNT); if(PAPI_OK != PAPI_read_counters(values, EVENT_COUNT)) ehandler("Problem reading counters."); loop(c,a,b,n); if(PAPI_OK != PAPI_read_counters(values, EVENT_COUNT)) ehandler("Problem reading counters."); printf("Number of instructions = %lld\n",values[0]); printf("Number of fp operations = %lld\n",values[1]); return 0; }
int main () { float t0, t1; int iter, i, j; int events[2] = {PAPI_L1_DCM, PAPI_FP_OPS }, ret; long_long values[2]; if (PAPI_num_counters() < 2) { fprintf(stderr, "No hardware counters here, or PAPI not supported.\n"); exit(1); } for (i = 0; i < MX; i++) { if ((ad[i] = malloc(sizeof(double)*MX)) == NULL) { fprintf(stderr,"malloc failed\n"); exit(1); } } for (j = 0; j < MX; j++) { for (i = 0; i < MX; i++) { ad[i][j] = 1.0/3.0; /* Initialize the data */ } } t0 = gettime(); if ((ret = PAPI_start_counters(events, 2)) != PAPI_OK) { fprintf(stderr, "PAPI failed to start counters: %s\n", PAPI_strerror(ret)); exit(1); } for (iter = 0; iter < NITER; iter++) { for (j = 0; j < MX; j++) { for (i = 0; i < MX; i++) { ad[i][j] += ad[i][j] * 3.0; } } } if ((ret = PAPI_read_counters(values, 2)) != PAPI_OK) { fprintf(stderr, "PAPI failed to read counters: %s\n", PAPI_strerror(ret)); exit(1); } t1 = gettime(); printf("Total software flops = %f\n",(float)TOT_FLOPS); printf("Total hardware flops = %lld\n",(float)values[1]); printf("MFlop/s = %f\n", (float)(TOT_FLOPS/MEGA)/(t1-t0)); printf("L1 data cache misses is %lld\n", values[0]); }
/*
 * JNI entry point for papi.Wrapper.startCounters(int[] events).
 *
 * Hands the Java int array to PAPI_start_counters and returns the PAPI
 * status code. Returns PAPI_EINVAL for a null or empty array and
 * PAPI_ENOMEM when the JVM cannot provide the array elements.
 */
JNIEXPORT jint JNICALL Java_papi_Wrapper_startCounters
		(JNIEnv *env, jobject UNUSED_ARG(self), jintArray eventsarr) {
	if (eventsarr == NULL) {
		return PAPI_EINVAL;
	}

	int events_count = (*env)->GetArrayLength(env, eventsarr);
	if (events_count == 0) {
		return PAPI_EINVAL;
	}

	jint *eventsj = (*env)->GetIntArrayElements(env, eventsarr, NULL);
	if (eventsj == NULL) {
		/* GetIntArrayElements returns NULL when it fails; never hand
		 * that pointer to PAPI. */
		return PAPI_ENOMEM;
	}

	int *events = (int *) eventsj;
	int rc = PAPI_start_counters(events, events_count);

	/* JNI_ABORT: the array was only read, nothing to copy back. */
	(*env)->ReleaseIntArrayElements(env, eventsarr, eventsj, JNI_ABORT);

	return rc;
}
int main() { //this will fail if some counters can't be accessed if (PAPI_start_counters(papi_events, n_papi_events) != PAPI_OK) { printf("failed to start papi\n"); return 1; } doWork(123); if (PAPI_read_counters(papi_values[0], n_papi_events) != PAPI_OK) { printf("failed to read countess\n"); return 1; } printf("counters' values: misses = %d, accesses = %d\n", papi_values[0][0], papi_values[0][1]); return 0; }
/*
 * Validation test for the PAPI_SYC_INS predefined event.
 *
 * Starts and immediately stops a counter for PAPI_SYC_INS and fails if
 * the resulting count is below 1. The test is skipped when the event is
 * not available and finally reported through test_unimplemented().
 */
int main(int argc, char **argv) {

   int events[1];
   long long counts[1];

   int retval,quiet;

   char test_string[]="Testing PAPI_SYC_INS predefined event...";

   quiet=test_quiet();

   retval = PAPI_library_init(PAPI_VER_CURRENT);
   if (retval != PAPI_VER_CURRENT) {
      if (!quiet) printf("Error! PAPI_library_init %d\n",retval);
      test_fail(test_string);
   }

   retval = PAPI_query_event(PAPI_SYC_INS);
   if (retval != PAPI_OK) {
      if (!quiet) printf("PAPI_SYC_INS not available\n");
      test_skip(test_string);
   }

   events[0]=PAPI_SYC_INS;
   /* Defined value even if a counter call below fails. */
   counts[0]=0;

   retval = PAPI_start_counters(events,1);
   if (retval != PAPI_OK) {
      if (!quiet) printf("Error! PAPI_start_counters %d\n",retval);
      test_fail(test_string);
   }

   retval = PAPI_stop_counters(counts,1);
   if (retval != PAPI_OK) {
      if (!quiet) printf("Error! PAPI_stop_counters %d\n",retval);
      test_fail(test_string);
   }

   if (counts[0]<1) {
      if (!quiet) printf("Error! Count too low\n");
      test_fail(test_string);
   }

   PAPI_shutdown();

   test_unimplemented(test_string);

   return 0;
}
static void start_sssp(FibHeap<size_t, size_t> *pq, vertex_t *graph) { #ifdef PAPI if (PAPI_OK != PAPI_start_counters(g_events, G_EVENT_COUNT)) { std::cout << ("Problem starting counters 1.\n"); } #endif while (!pq->empty()) { size_t distance; size_t node; pq->pop(distance, node); vertex_t *v = &graph[node]; size_t v_dist = v->distance; for (size_t i = 0; i < v->num_edges; i++) { const edge_t *e = &v->edges[i]; const size_t new_dist = v_dist + e->weight; vertex_t *w = &graph[e->target]; size_t w_dist = w->distance; if (new_dist < w_dist) { w->distance = new_dist; if (w->n == NULL) { w->n = pq->push(new_dist, e->target); } else { pq->decrease_key(w->n, new_dist); } } } } #ifdef PAPI if (PAPI_OK != PAPI_read_counters(g_values[0], G_EVENT_COUNT)) { std::cout << ("Problem reading counters 2.\n"); } #endif }
/*
 * Switches the PAPI high-level counters to the single event named by
 * `metric`: stops whatever is currently running, translates the name to
 * an event code, starts the counter and performs one initial read.
 *
 * metric - PAPI event name, e.g. as accepted by PAPI_event_name_to_code
 *
 * All failures are reported through papi_eprintf().
 */
void papi_set_events(char *metric)
{
	const size_t n = 1;
	int max;
	long_long *papi_tmp;
	int papi_events[1];
	int code = 0;

	max = PAPI_num_counters();
	/* A negative value is a PAPI error; comparing it as size_t would
	 * silently turn it into a huge count. */
	if (max < 0 || n > (size_t)max)
		papi_eprintf("Too many counters requested.\n");

	papi_tmp = malloc(sizeof(*papi_tmp) * n);
	if (papi_tmp == NULL) {
		papi_eprintf("Out of memory %s:%d.\n", __FILE__, __LINE__);
		return;
	}

	/* NOTE(review): PAPI_reset() expects an event set handle, not a
	 * counter count - confirm whether this call is intended. */
	PAPI_reset(max);
	/* Stop any counters left running by an earlier call; an error here
	 * just means none were running. */
	PAPI_stop_counters(papi_tmp, n);

	if (PAPI_event_name_to_code(metric, &code) != PAPI_OK)
		papi_eprintf("Unknown PAPI event %s.\n", metric);
	else if (code == 0)
		papi_eprintf("Unknown PAPI event %s.\n", metric);

	papi_events[0] = code;

	if (PAPI_start_counters(papi_events, n) != PAPI_OK)
		papi_eprintf("Problem starting counters %s:%d.\n", __FILE__, __LINE__);

	if (PAPI_read_counters(papi_tmp, n) != PAPI_OK)
		papi_eprintf("Problem reading counters %s:%d.\n", __FILE__, __LINE__);

	free(papi_tmp);
}
/*
 * Validation test for the PAPI_BR_PRC predefined event.
 *
 * Three measurement phases, each repeated num_runs times:
 *   1. PAPI_BR_PRC around branches_testcode(); the average is compared
 *      against `expected` via display_error() and the test fails when
 *      the reported error is outside +/- 1.0.
 *   2. PAPI_BR_CN around random_branches_testcode(); the average becomes
 *      the new `expected` value (total conditional branches).
 *   3. PAPI_BR_PRC around random_branches_testcode(); expected - average
 *      must lie between one quarter and three quarters of
 *      num_random_branches, otherwise the test fails.
 */
int main(int argc, char **argv) {

   int retval,quiet,result;

   int num_runs=100;
   long long high=0,low=0,average=0,expected=1000000;
   double error;
   int num_random_branches=500000;

   int i;
   int events[1];
   long long counts[1],total=0;

   char test_string[]="Testing PAPI_BR_PRC predefined event...";

   quiet=test_quiet();

   retval = PAPI_library_init(PAPI_VER_CURRENT);
   if (retval != PAPI_VER_CURRENT) {
      if (!quiet) printf("Error: PAPI_library_init %d\n", retval);
      test_fail(test_string);
   }

   retval = PAPI_query_event(PAPI_BR_PRC);
   if (retval != PAPI_OK) {
      if (!quiet) printf("PAPI_BR_PRC not supported %d\n", retval);
      test_skip(test_string);
   }

   if (!quiet) {
      printf("\n");
      printf("Testing a simple loop with %lld branches (%d times):\n",
             expected,num_runs);
      printf("Nearly all the branches should be predicted correctly.\n");
   }

   /* Phase 1: PAPI_BR_PRC around the simple branch loop. */
   events[0]=PAPI_BR_PRC;
   high=0;
   low=0;

   for(i=0;i<num_runs;i++) {

      PAPI_start_counters(events,1);

      result=branches_testcode();

      PAPI_stop_counters(counts,1);

      if (result==CODE_UNIMPLEMENTED) {
         if (!quiet) printf("\tNo test code for this architecture\n");
         test_skip(test_string);
      }

      /* Track the extremes; low==0 means "not set yet". */
      if (counts[0]>high) high=counts[0];
      if ((low==0) || (counts[0]<low)) low=counts[0];
      total+=counts[0];
   }

   average=total/num_runs;

   error=display_error(average,high,low,expected,quiet);

   if ((error > 1.0) || (error<-1.0)) {
      if (!quiet) printf("Instruction count off by more than 1%%\n");
      test_fail(test_string);
   }

   if (!quiet) printf("\n");

   /*******************/

   /* Phase 2: PAPI_BR_CN around the random-branch code. */
   high=0;
   low=0;
   total=0;

   events[0]=PAPI_BR_CN;

   for(i=0;i<num_runs;i++) {

      PAPI_start_counters(events,1);

      result=random_branches_testcode(num_random_branches,1);

      PAPI_stop_counters(counts,1);

      if (counts[0]>high) high=counts[0];
      if ((low==0) || (counts[0]<low)) low=counts[0];
      total+=counts[0];
   }

   average=total/num_runs;

   /* The measured conditional-branch average is the baseline for phase 3. */
   expected=average;

   if (!quiet) {
      printf("\nTesting a function that branches based on a random number\n");
      printf(" The loop has %lld conditional branches.\n",expected);
      printf(" %d are random branches; %d of those were taken\n",num_random_branches,result);
   }

   /* Phase 3: PAPI_BR_PRC around the same random-branch code. */
   high=0;
   low=0;
   total=0;

   events[0]=PAPI_BR_PRC;

   for(i=0;i<num_runs;i++) {

      PAPI_start_counters(events,1);

      result=random_branches_testcode(num_random_branches,1);

      PAPI_stop_counters(counts,1);

      if (counts[0]>high) high=counts[0];
      if ((low==0) || (counts[0]<low)) low=counts[0];
      total+=counts[0];
   }

   average=total/num_runs;

   if (!quiet) {
      printf("\nOut of %lld branches, %lld predicted correctly\n",expected,average);
      printf("Assuming a good random number generator and no freaky luck\n");
      printf("The TOTAL - CORRECT value is %lld\n",expected-average);
      printf("This value should be roughly between %d and %d\n",
             num_random_branches/4,(num_random_branches/4)*3);
   }

   /* expected-average is the number of branches not counted as correctly
      predicted; it has to fall inside the quarter/three-quarter window. */
   if ( (expected-average) < (num_random_branches/4)) {
      if (!quiet) printf("Correct predicts too low\n");
      test_fail(test_string);
   }

   if ( (expected-average) > (num_random_branches/4)*3) {
      if (!quiet) printf("Correct predicts too high\n");
      test_fail(test_string);
   }

   if (!quiet) printf("\n");

   PAPI_shutdown();

   test_pass(test_string);

   return 0;
}
int main() { int retval; int i,j; int EventSet = PAPI_NULL; long long totales[EVENT_COUNT], totalesPerm[EVENT_COUNT]; int events[] = {PAPI_L1_DCM, PAPI_L1_DCH, PAPI_L1_DCA, PAPI_L2_DCH, PAPI_L2_DCA}; long long values[EVENT_COUNT]; // Inicializamos la librería PAPI retval = PAPI_library_init(PAPI_VER_CURRENT); if(retval!=PAPI_VER_CURRENT){ fprintf(stderr, "PAPI library init error!\n"); exit(1); } //Comprobamos si los contadores están disponibles for(i=0; i<EVENT_COUNT; i++) { if (PAPI_OK != PAPI_query_event(events[i])) { printf("Cannot count counter %d", i); exit(0); } } //iniciamos los vectores de resultados totales for(i=0; i<EVENT_COUNT; i++) { totales[i]=0; totalesPerm[i]=0; } //iniciarMatrizB(); printf("\n -------- Prueba con bucle original ---------\n\n"); for(i=0; i<TEST_NUM; i++) { //Iniciamos la cuenta de eventos if (PAPI_start_counters(events, EVENT_COUNT) != PAPI_OK) { fprintf(stderr, "ERROR Starting counters!\n"); exit(1); } bucle(); //Leemos el valor de un contador: if (PAPI_stop_counters(values, EVENT_COUNT) != PAPI_OK) { fprintf(stderr, "ERROR Reading counters!\n"); exit(1); } for(j=0; j<EVENT_COUNT; j++) { totales[j]+=values[j]; } printf("Prueba %d:\n\tL1 -> Accesos: %lld Aciertos: %lld Fallos: %lld\n", i, values[2], values[1], values[0]); printf("\tL2 -> Accesos: %lld Aciertos: %lld\n", values[4], values[3]); } //Calculamos los valores medios: for(i=0; i<EVENT_COUNT; i++) { totales[i] = totales[i]/TEST_NUM; } printf("\nValores medios:\n"); printf("\tCaché L1:\n\t\tAccesos: %lld \n\t\tAciertos: %lld \n\t\tFallos: %lld \n\t\tPorcentaje de acierto: %lld\n", totales[2], totales[1], totales[0], 100*totales[1]/totales[2]); printf("\tCaché L2:\n\t\tAccesos: %lld \n\t\tAciertos: %lld \n\t\tPorcentaje de acierto: %lld\n", totales[4], totales[3], 100*totales[3]/(totales[4])); printf("\n -------- Prueba con bucle permutado --------- \n\n"); for(i=0; i<TEST_NUM; i++) { //Iniciamos la cuenta de eventos if (PAPI_start_counters(events, EVENT_COUNT) != PAPI_OK) { 
fprintf(stderr, "ERROR Starting counters!\n"); exit(1); } buclePermutado(); //Leemos el valor de un contador: if (PAPI_stop_counters(values, EVENT_COUNT) != PAPI_OK) { fprintf(stderr, "ERROR Reading counters!\n"); exit(1); } for(j=0; j<EVENT_COUNT; j++) { totalesPerm[j]+=values[j]; } printf("Prueba %d:\n\tL1 -> Accesos: %lld Aciertos: %lld Fallos: %lld\n", i, values[2], values[1], values[0]); printf("\tL2 -> Accesos: %lld Aciertos: %lld\n", values[4], values[3]); } //Calculamos los valores medios: for(i=0; i<EVENT_COUNT; i++) { totalesPerm[i] = totalesPerm[i]/TEST_NUM; } printf("\nValores medios:\n"); printf("\tCaché L1:\n\t\tAccesos: %lld \n\t\tAciertos: %lld \n\t\tFallos: %lld \n\t\tPorcentaje de acierto: %lld\n", totalesPerm[2], totalesPerm[1], totalesPerm[0], 100*totalesPerm[1]/totalesPerm[2]); printf("\tCaché L2:\n\t\tAccesos: %lld \n\t\tAciertos: %lld \n\t\tPorcentaje de acierto: %lld\n", totalesPerm[4], totalesPerm[3], 100*totalesPerm[3]/(+totalesPerm[4])); return 0; }
void* Thread(void *userData) { ThreadInfo *info = (ThreadInfo*) userData; Context *c = info->c; int index = info->index; int threadCount = c->threadCount; int64_t repetitionCount = c->repetitionCount; uint64_t me = 0x1 << index; uint64_t full = 0x0000000000000000; uint64_t copy; //thread local copy of the entry/exit barrier for (int i = 0; i < threadCount; ++i) { full |= 0x1 << i; } // set thread affinity cpu_set_t cpuset; CPU_ZERO(&cpuset); CPU_SET(index, &cpuset); assert(pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset) == 0); //DEBUG //pthread_getaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset); //printf("%i uses cpus: ", index); //for (int i = 0; i < threadCount; ++i) { // if (CPU_ISSET(i, &cpuset)) { // printf("%i, ", i); // } //} //printf("\n"); int threadToBeRecorded = -1; int papiEvents[3] = {0x8000003b, 0x80000000, 0x80000002}; long long papiStart[3] = {0, 0, 0}; long long papiEnd[3] = {0, 0, 0}; if (index == threadToBeRecorded) { int ret = PAPI_start_counters(papiEvents, 3); if (ret != 0) { printf("thread %i: PAPI_start_counters %i\n", index, ret); assert(0); } ret = PAPI_read_counters(papiStart, 3); if (ret != 0) { printf("thread %i: PAPI_read_counters %i\n", index, ret); assert(0); } } //unlink("a"); //FILE *log = fopen("a", "a"); for(int64_t repetition = 0; repetition < repetitionCount; repetition++){ if (c->left == 0) { /* *** if () { UNIFIED ENTRY *********************/ /* run to wall and wait busily */ do { copy = c->entry; //fprintf(log, "%i r %lli\n", prime, (long long) copy); //fflush(log); if ((copy & me) == 0) { copy |= me; c->entry = copy; //fprintf(log, "%i w %lli\n", prime, (long long) copy); //fflush(log); } }while (copy != full && c->left == 0); c->left = 1; c->exit = 0x0000000000000000; } else if (c->left == 1) { /* *** } else if () { UNIFIED ENTRY *******/ for (int i = 0; i < threadCount - 1; ++i) { if (c->successfulBarrierVisitsCount[i] != c->successfulBarrierVisitsCount[i+1]) { printf("thread %i and %i are not 
equal at %lli %lli\n", i, i+1, (long long)c->successfulBarrierVisitsCount[i], (long long)c->successfulBarrierVisitsCount[i+1]); ++c->outOfSyncCount; assert(0); } } /* wait busily until everyone has left the barrier */ do { copy = c->exit; if ((copy & me) == 0) { copy |= me; c->exit = copy; } }while (copy != full && c->left == 1); c->left = 0; c->entry = 0x0000000000000000; ++(c->successfulBarrierVisitsCount[index]); } /* *** } UNIFIED ENTRY *********************************************/ } if (index == threadToBeRecorded) { int ret = PAPI_stop_counters(papiEnd, 3); if (ret != 0) { printf("%i: PAPI_stop_counters %i\n", index, ret); assert(0); } printf("thread %i: papi counter 0: %lli - %lli = %lli\n", index, papiEnd[0], papiStart[0], papiEnd[0] - papiStart[0]); printf("thread %i: papi counter 1: %lli - %lli = %lli\n", index, papiEnd[1], papiStart[1], papiEnd[1] - papiStart[1]); printf("thread %i: papi counter 2: %lli - %lli = %lli\n", index, papiEnd[2], papiStart[2], papiEnd[2] - papiStart[2]); printf("\n"); } return NULL; }
/*
 * Worker thread for the concurrent single-source shortest path benchmark.
 *
 * data points to this thread's thread_data_t. After the per-thread setup
 * and the start barrier, the thread repeatedly removes a (distance, node)
 * pair from the queue selected by d->sl / d->pq / d->lin, skips entries
 * whose distance no longer matches nodes[node].dist, and relaxes the
 * node's outgoing edges with a compare-and-swap on the neighbour's
 * distance, re-inserting neighbours whose distance was improved.
 *
 * The loop ends once the queue has been reported empty more than
 * 20 * d->nb_threads times in a row. Always returns NULL.
 */
void* sssp(void *data) {
  thread_data_t *d = (thread_data_t *)data;

  /* Create transaction */
  set_cpu(the_cores[d->id]);
  /* Wait on barrier */
  ssalloc_init();
  PF_CORRECTION;

  seeds = seed_rand();

#ifdef PIN
  int id = d->id;
  // int cpu = 40*(id/40) + 4*(id%10) + (id%40)/10;
  int cpu = 4*(id%20) + id/20;
  // printf("Pinning %d to %d\n",id,cpu);
  pin(pthread_self(), cpu);
  // pin(pthread_self(), id);
#endif

#ifdef PAPI
  if (PAPI_OK != PAPI_start_counters(g_events, G_EVENT_COUNT))
  {
    printf("Problem starting counters 1.");
  }
#endif

  barrier_cross(d->barrier);

  // Begin SSSP

  // Number of consecutive times the queue was found empty.
  int fail = 0;
  // int radius = 0;
  while (1) {
    val_t node;
    slkey_t dist_node;
    // print_skiplist(d->set);
    // Fetch the next (distance, node) pair from whichever queue is in use.
    while (1) {
      if (d->sl) {
        if (spray_delete_min_key(d->set, &dist_node, &node, d)) break; // keep trying until get a node
      } else if (d->pq) {
        if (lotan_shavit_delete_min_key(d->set, &dist_node, &node, d)) break;
      } else if (d->lin) {
        node = (val_t) deletemin_key(d->linden_set, &dist_node, d);
        break;
      } else {
        printf("error: no queue selected\n");
        exit(1); // TODO: grace
      }
      if (dist_node == -1) { // flag that list is empty
        break;
      }
      dist_node = 0;
    }
    if (dist_node == -1) { // list is empty; TODO make sure threads don't quit early
      fail++;
      if (fail > 20*d->nb_threads) { // TODO: really need a better break condition...
        break;
      }
      continue;
    }
    fail = 0;
    if (dist_node != nodes[node].dist) continue; // dead node
    nodes[node].times_processed++;

    // Relax every outgoing edge of the node just removed.
    int i;
    for (i = 0;i < nodes[node].deg;i++) {
      int v = nodes[node].adj[i];
      int w = nodes[node].weights[i];
      slkey_t dist_v = nodes[v].dist;
      // printf("v=%d dist_v=%d\n", v, dist_v);
      if (dist_v == -1 || dist_node + w < dist_v) { // found better path to v
        // printf("attempting cas...\n");
        // printf("nodes[v].dist=%d dist_v=%d dist_node=%d\n", nodes[v].dist, dist_v, dist_node);
        int res = ATOMIC_CAS_MB(&nodes[v].dist, dist_v, dist_node+w);
        // printf("%d nodes[%d].dist=%d\n", res, v, nodes[v].dist);
        if (res) {
          if (d->pq || d->sl) {
            sl_add_val(d->set, dist_node+w, v, TRANSACTIONAL); // add to queue only if CAS is successful
          } else if (d->lin) {
            insert(d->linden_set, dist_node+w, v);
          }
          d->nb_add++;
          // if (dist_node+1 > radius) {
          //   radius = dist_node+1;
          //   printf("radius %d\n", radius);
          // }
        }
      }
    }
  }

  // End SSSP

#ifdef PAPI
  // Store this thread's counter values in its own slot of g_values.
  if (PAPI_OK != PAPI_read_counters(g_values[d->id], G_EVENT_COUNT))
  {
    printf("Problem reading counters 2.");
  }
#endif

  PF_PRINT;

  return NULL;
}
int main(int argc, char *argv[]) { int size, rank, world_rank, my_group; int num_lsms; // number of parallel LSMS instances int size_lsms; // number of atoms in a lsms instance int num_steps; // number of energy calculations int initial_steps; // number of steps before sampling starts int stepCount=0; // count the Monte Carlo steps executed double max_time; // maximum walltime for this run in seconds bool restrict_time = false; // was the maximum time specified? bool restrict_steps = false; // or the max. numer of steps? int align; // alignment of lsms_instances double magnetization; double energy_accumulator; // accumulates the enegy to calculate the mean int energies_accumulated; int new_peid,new_root; static int op,flag; double *evec,*r_values; evec=(double *)shmalloc(sizeof(double)*3*size_lsms); r_values=(double *)shmalloc(sizeof(double)*(R_VALUE_OFFSET+3*(size_lsms+1))); energy_accumulator=0.0; energies_accumulated=0; double walltime_0,walltime; double restartWriteFrequency=30.0*60.0; double nextWriteTime=restartWriteFrequency; MPI_Comm local_comm; int *lsms_rank0; MPI_Status status; char prefix[40]; char i_lsms_name[64]; char gWL_in_name[64], gWL_out_name[64]; char mode_name[64]; char energy_calculation_name[64]; char stupid[37]; char step_out_name[64]; char wl_step_out_name[128]; char *wl_stepf=NULL; bool step_out_flag=false; std::ofstream step_out_file; typedef enum {Constant, Random, WangLandau_1d, ExhaustiveIsing, WangLandau_2d} EvecGenerationMode; typedef enum {MagneticMoment, MagneticMomentZ, MagneticMomentX, MagneticMomentY} SecondDimension; EvecGenerationMode evec_generation_mode = Constant; SecondDimension second_dimension = MagneticMoment; double ev0[3]; bool return_moments_flag=true; // true-> return all magnetic moments from lsms run at each step. 
bool generator_needs_moment=false; typedef enum {OneStepEnergy, MultiStepEnergy, ScfEnergy} EnergyCalculationMode; EnergyCalculationMode energyCalculationMode = OneStepEnergy; int energyIndex=1; // index for the return value to use for the MC step (0: total energy, 1: band energy) ev0[0]=ev0[1]=0.0; ev0[2]=1.0; // size has to be align + size_lsms*num_lsms align=1; num_lsms=1; size_lsms=-1; my_group=-1; num_steps=1; initial_steps=0; sprintf(i_lsms_name,"i_lsms"); gWL_in_name[0]=gWL_out_name[0]=0; mode_name[0]=0; energy_calculation_name[0]=0; // check command line arguments for(int i=0; i<argc; i++) { if(!strcmp("-num_lsms",argv[i])) num_lsms=atoi(argv[++i]); if(!strcmp("-size_lsms",argv[i])) size_lsms=atoi(argv[++i]); if(!strcmp("-align",argv[i])) align=atoi(argv[++i]); if(!strcmp("-num_steps",argv[i])) {num_steps=atoi(argv[++i]); restrict_steps=true;} if(!strcmp("-initial_steps",argv[i])) initial_steps=atoi(argv[++i]); if(!strcmp("-walltime",argv[i])) {max_time=60.0*atof(argv[++i]); restrict_time=true;} if(!strcmp("-i",argv[i])) strncpy(i_lsms_name,argv[++i],64); if(!strcmp("-random_dir",argv[i])) {evec_generation_mode = Random;} if(!strcmp("-step_out",argv[i])) {strncpy(step_out_name,argv[++i],64); step_out_flag=true; return_moments_flag=true;} if(!strcmp("-wl_out", argv[i])) strncpy(gWL_out_name,argv[++i],64); if(!strcmp("-wl_in", argv[i])) strncpy(gWL_in_name,argv[++i],64); if(!strcmp("-mode", argv[i])) strncpy(mode_name,argv[++i],64); if(!strcmp("-energy_calculation",argv[i])) strncpy(energy_calculation_name,argv[++i],64); } if(!(restrict_steps || restrict_time)) restrict_steps=true; if(mode_name[0]!=0) { if(!strcmp("constant",mode_name)) evec_generation_mode = Constant; if(!strcmp("random",mode_name)) evec_generation_mode = Random; if(!strcmp("1d",mode_name)) evec_generation_mode = WangLandau_1d; if(!strcmp("ising",mode_name)) evec_generation_mode = ExhaustiveIsing; if(!strcmp("2d",mode_name)) evec_generation_mode = WangLandau_2d; if(!strcmp("2d-m",mode_name)) 
{evec_generation_mode = WangLandau_2d; second_dimension=MagneticMoment;} if(!strcmp("2d-x",mode_name)) {evec_generation_mode = WangLandau_2d; second_dimension=MagneticMomentX;} if(!strcmp("2d-y",mode_name)) {evec_generation_mode = WangLandau_2d; second_dimension=MagneticMomentY;} if(!strcmp("2d-z",mode_name)) {evec_generation_mode = WangLandau_2d; second_dimension=MagneticMomentZ;} } if(energy_calculation_name[0]!=0) { if(energy_calculation_name[0]=='o') { energyCalculationMode = OneStepEnergy; energyIndex=1; } if(energy_calculation_name[0]=='m') { energyCalculationMode = MultiStepEnergy; energyIndex=1; } if(energy_calculation_name[0]=='s') { energyCalculationMode = ScfEnergy; energyIndex=0; } } #ifdef USE_PAPI #define NUM_PAPI_EVENTS 4 int hw_counters = PAPI_num_counters(); if(hw_counters>NUM_PAPI_EVENTS) hw_counters=NUM_PAPI_EVENTS; int papi_events[NUM_PAPI_EVENTS]; // = {PAPI_TOT_INS,PAPI_TOT_CYC,PAPI_FP_OPS,PAPI_VEC_INS}; char *papi_event_name[] = {"PAPI_TOT_INS","PAPI_FP_OPS", "RETIRED_SSE_OPERATIONS:DOUBLE_ADD_SUB_OPS:DOUBLE_MUL_OPS:DOUBLE_DIV_OPS:OP_TYPE", "RETIRED_SSE_OPERATIONS:SINGLE_ADD_SUB_OPS:SINGLE_MUL_OPS:SINGLE_DIV_OPS:OP_TYPE"}; // "RETIRED_INSTRUCTIONS", // "RETIRED_MMX_AND_FP_INSTRUCTIONS:PACKED_SSE_AND_SSE2", // "RETIRED_SSE_OPERATIONS:DOUBLE_ADD_SUB_OPS:DOUBLE_MUL_OPS:DOUBLE_DIV_OPS:1", // "RETIRED_SSE_OPERATIONS:SINGLE_ADD_SUB_OPS:SINGLE_MUL_OPS:SINGLE_DIV_OPS:1" // get events from names: for(int i=0; i<NUM_PAPI_EVENTS; i++) { if(PAPI_event_name_to_code(papi_event_name[i],&papi_events[i]) != PAPI_OK) { // printline("Error in obtaining PAPI event code for: "+ttos(papi_event_name[i]), // std::cerr,parameters.myrankWorld); // printline("Skipping all following events", // std::cerr,parameters.myrankWorld); if(hw_counters>i) hw_counters=i; } } long long papi_values[NUM_PAPI_EVENTS+4]; // printline("PAPI: "+ttos(hw_counters)+" counters available",std::cout,parameters.myrankWorld); if(hw_counters>NUM_PAPI_EVENTS) hw_counters=NUM_PAPI_EVENTS; long 
long papi_real_cyc_0 = PAPI_get_real_cyc(); long long papi_real_usec_0 = PAPI_get_real_usec(); long long papi_virt_cyc_0 = PAPI_get_virt_cyc(); long long papi_virt_usec_0 = PAPI_get_virt_usec(); PAPI_start_counters(papi_events,hw_counters); #endif lsms_rank0=(int *)malloc(sizeof(int)*(num_lsms+1)); // initialize MPI: MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); world_rank=rank; MPI_Comm_size(MPI_COMM_WORLD, &size); walltime_0 = get_rtc(); #ifndef SVN_REV #define SVN_REV "unknown" #endif // make sure 'return_moments_flag' is set correctly switch(evec_generation_mode) { case Constant : break; case Random : break; case WangLandau_1d : return_moments_flag = true; generator_needs_moment = true; break; case ExhaustiveIsing : break; case WangLandau_2d : return_moments_flag = true; generator_needs_moment = true; break; default: std::cout<<" ERROR: UNKNOWN EVEC GENERATION MODE\n"; exit(1); } if(rank==0) { std::cout<<"LSMS_3"<<std::endl; std::cout<<" SVN revision "<<SVN_REV<<std::endl<<std::endl; #ifdef USE_PAPI std::cout<<" Using Papi counters"<<std::endl<<std::endl; #endif std::cout<<" Size of LSMS instances = "<<size_lsms<<" atoms\n"; std::cout<<" Number of LSMS instances = "<<num_lsms<<std::endl; std::cout<<" LSMS Energy calculated using "; switch(energyCalculationMode) { case OneStepEnergy: std::cout<<"oneStepEnergy [frozen potential band energy]"<<std::endl; break; case MultiStepEnergy: std::cout<<"multiStepEnergy [frozen potential band energy with converged Fermi energy]"<<std::endl; break; case ScfEnergy: std::cout<<"scfEnergy [self-consistent total energy]"<<std::endl; break; default: std::cout<<"UNKNOWN ENERGY CALCULATION METHOD"<<std::endl; exit(1); } if(restrict_steps) std::cout<<" Number of gWL steps = "<<num_steps<<std::endl; if(restrict_time) std::cout<<" Maximum walltime = "<<max_time<<"s\n"; std::cout<<" Processor alignment (process allocation quantization) = "<<align<<std::endl; switch(evec_generation_mode) { case Constant : std::cout<<" 
Constant moments direction along " <<ev0[0]<<" "<<ev0[1]<<" "<<ev0[2]<<std::endl; break; case Random : std::cout<<" Random distribution of moments (no Wang-Landau)"<<std::endl; break; case WangLandau_1d : std::cout<<" Wang-Landau for one continuous variable (energy)"<<std::endl; // return_moments_flag = true; // generator_needs_moment = true; break; case ExhaustiveIsing : std::cout<<" Exhaustive Ising sampling"<<std::endl; break; case WangLandau_2d : std::cout<<" Wang-Landau for two continuous variable (energy, "; switch(second_dimension) { case MagneticMoment : std::cout<<"magnitude of magnetization)"; break; case MagneticMomentX : std::cout<<"x component of magnetization)"; break; case MagneticMomentY : std::cout<<"y component of magnetization)"; break; case MagneticMomentZ : std::cout<<"z component of magnetization)"; break; } std::cout<<std::endl; // return_moments_flag = true; // generator_needs_moment = true; break; default: std::cout<<" ERROR: UNKNOWN EVEC GENERATION MODE\n"; exit(1); } if(step_out_flag) std::cout<<" Step output written to: "<<step_out_name<<std::endl; std::cout<<std::endl; if(step_out_flag && (evec_generation_mode==WangLandau_1d)) { // step_out_flag=false; snprintf(wl_step_out_name,127,"wl1d_%s",step_out_name); wl_stepf=wl_step_out_name; } if(step_out_flag) { step_out_file.open(step_out_name); step_out_file<<"#"; for(int i=0; i<argc; i++) step_out_file<<" "<<argv[i]; step_out_file<<std::endl<<size_lsms<<std::endl; } } if(generator_needs_moment) return_moments_flag=true; if(num_lsms==1) { SHMEM_activeset local_comm; local_comm.rank=shmem_my_pe(); local_comm.size=shmem_n_pes(); local_comm.start_pe=0; local_comm.logPE_stride=0; LSMS lsms_calc(local_comm,i_lsms_name,"1_"); if(rank==0) { std::cout<<"executing LSMS(C++) for "<<lsms_calc.numSpins()<<" atoms\n"; std::cout<<" LSMS version = "<<lsms_calc.version()<<std::endl; } if(energyCalculationMode==OneStepEnergy) std::cout<<"one step Energy = "<<lsms_calc.oneStepEnergy()<<std::endl; else 
if(energyCalculationMode==MultiStepEnergy) std::cout<<"multi-step Energy = "<<lsms_calc.multiStepEnergy()<<std::endl; else if(energyCalculationMode==ScfEnergy) std::cout<<"self-consistent Energy = "<<lsms_calc.scfEnergy()<<std::endl; else { printf("ERROR: Unknown energy calculation mode for lsms_calc in wl-lsms main!\n"); // MPI_Abort(MPI_COMM_WORLD,5); exit(5); } } else { // build the communicators //int color=MPI_UNDEFINED; //Assuming user passes a power of two while using "-align" int s = align; int comm_size=(size-align)/num_lsms; int world_rank; for(int i=0; i<num_lsms; i++) { if((world_rank>=s) && (world_rank<s+comm_size)) { my_group=i; //color=i; new_peid=world_rank-s; new_root=s; } lsms_rank0[i]=s; s+=comm_size; } if(world_rank==0){ //color=num_lsms; new_peid=0; comm_size=1; new_root=0; } //MPI_Comm_split(MPI_COMM_WORLD, color, 0, &local_comm); SHMEM_activeset local_comm; local_comm.rank=new_peid; local_comm.size=comm_size; local_comm.start_pe=new_root; local_comm.logPE_stride=0; std::cout<<"world_rank="<<world_rank<<" -> group="<<my_group<<std::endl; snprintf(prefix,38,"Group %4d: ",my_group); // now we get ready to do some calculations... 
if(my_group>=0) { double energy; double band_energy; int static i_values[10]; double static r_values[10]; static int op; //MPI_Comm_rank(local_comm, &rank); rank = local_comm.rank; snprintf(prefix,38,"%d_",my_group); // to use the ramdisk on jaguarpf: // snprintf(prefix,38,"/tmp/ompi/%d_",my_group); LSMS lsms_calc(local_comm,i_lsms_name,prefix); snprintf(prefix,38,"Group %4d: ",my_group); if(rank==0 && my_group==0) { std::cout<<prefix<<"executing LSMS(C++) for "<<lsms_calc.numSpins()<<" atoms\n"; std::cout<<prefix<<" LSMS version = "<<lsms_calc.version()<<std::endl; } // wait for commands from master bool finished=false; while(!finished) { if(rank==0) { //MPI_Recv(evec,3*size_lsms,MPI_DOUBLE,0,MPI_ANY_TAG,MPI_COMM_WORLD,&status); //op =status.MPI_TAG; if (lsms_rank0[0]==world_rank) shmem_barrier(0, lsms_rank0[0], 2, pSync1); } //MPI_Bcast(&op,1,MPI_INT,0,local_comm); shmem_broadcast32(&op, &op, 1, local_comm.start_pe, local_comm.start_pe, local_comm.logPE_stride, local_comm.size, pSync2); /* recognized opcodes: 5: calculate energy recognized energy calculation modes: OneStepEnergy : calclulate frozen potential band energy in one step (don't converge Ef) use only if the Fermi energy will not change due to MC steps! The only method available in LSMS_1.9 MultiStepEnergy : calculate frozen potential band energy after converging Fermi energy This should be the new default method. If the Fermi energy doesn't change multiStepEnergy only performs one step and should be equivalent to oneStepEnergy The tolerance for Ef convergence can be set with LSMS::setEfTol(Real). The default tolerance is set in the LSMS::LSMS constructor (currently 1.0e-6). The maximum number of steps is read from the LSMS input file 'nscf' parameter. ScfEnergy : this will calculate the selfconsistent total energy. The maximum number of steps is read from the LSMS input file 'nscf' parameter. NOT IMPLEMENTED YET!!! 
10: get number of sites */ if(op==5) { lsms_calc.setEvec(evec); if(energyCalculationMode==OneStepEnergy) energy=lsms_calc.oneStepEnergy(&band_energy); else if(energyCalculationMode==MultiStepEnergy) band_energy=energy=lsms_calc.multiStepEnergy(); else if(energyCalculationMode==ScfEnergy) energy=lsms_calc.scfEnergy(&band_energy); else { printf("ERROR: Unknown energy calculation mode for lsms_calc in wl-lsms main!\n"); //MPI_Abort(MPI_COMM_WORLD,5); exit(5); } r_values[0]=energy; r_values[1]=band_energy; if(return_moments_flag) { lsms_calc.getMag(&r_values[R_VALUE_OFFSET]); } if(rank==0) { if(return_moments_flag) { //MPI_Send(r_values,R_VALUE_OFFSET+3*size_lsms,MPI_DOUBLE,0,1005,MPI_COMM_WORLD); shmem_double_put(r_values, r_values, R_VALUE_OFFSET+3*size_lsms, 0); } else { //MPI_Send(r_values,R_VALUE_OFFSET,MPI_DOUBLE,0,1005,MPI_COMM_WORLD); shmem_double_put(r_values, r_values, R_VALUE_OFFSET, 0); } shmem_fence(); shmem_int_swap(&flag, world_rank, 0); } } else if(op==10) { i_values[0]=lsms_calc.numSpins(); //MPI_Send(i_values,10,MPI_INT,0,1010,MPI_COMM_WORLD); shmem_int_put(i_values, i_values, 10, 0); } else { // printf("world rank %d: recieved exit\n",world_rank); finished=true; } } shfree(evec); //shfree(r_values); } else if(world_rank==0) { int running; double **evecs; //double *r_values; //int i_values[10]; int *init_steps; int total_init_steps; bool accepted; char *wl_inf=NULL; char *wl_outf=NULL; if(gWL_in_name) wl_inf=gWL_in_name; if(gWL_out_name) wl_outf=gWL_out_name; EvecGenerator *generator; /* // get number of spins from first LSMS instance // temp r_values: r_values=(double *)malloc(sizeof(double)*10); MPI_Send(r_values,1,MPI_DOUBLE, lsms_rank0[0], 10, MPI_COMM_WORLD); free(r_values); MPI_Recv(i_values,10,MPI_INT,lsms_rank0[0],1010,MPI_COMM_WORLD,&status); if(i_values[0]!=size_lsms) { printf("Size specified for Wang-Landau and in LSMS input file don't match!\n"); size_lsms=i_values[0]; } */ evecs=(double **)shmalloc(sizeof(double *)*num_lsms); 
init_steps=(int *)shmalloc(sizeof(int)*num_lsms); for(int i=0; i<num_lsms; i++) { evecs[i]=(double *)shmalloc(sizeof(double)*3*size_lsms); init_steps[i]=initial_steps; } total_init_steps=num_lsms*initial_steps; // Initialize the correct evec generator switch(evec_generation_mode) { case Random : generator = new RandomEvecGenerator(size_lsms); break; case Constant: generator = new ConstantEvecGenerator(size_lsms, ev0, num_lsms); break; //case WangLandau_1d : generator = new WL1dEvecGenerator<std::mt19937>(size_lsms, num_lsms, // evecs, wl_inf, wl_outf, wl_stepf); case WangLandau_1d : generator = new WL1dEvecGenerator<boost::mt19937>(size_lsms, num_lsms, evecs, wl_inf, wl_outf, wl_stepf); break; case ExhaustiveIsing : generator = new ExhaustiveIsing1dEvecGenerator(size_lsms, num_lsms, evecs, wl_inf, wl_outf); break; //case WangLandau_2d : generator = new WL2dEvecGenerator<std::mt19937>(size_lsms, num_lsms, // evecs, wl_inf, wl_outf, wl_stepf); case WangLandau_2d : generator = new WL2dEvecGenerator<boost::mt19937>(size_lsms, num_lsms, evecs, wl_inf, wl_outf, wl_stepf); break; default: std::cerr<<"The code should never arrive here: UNKNOWN EVEC GENERATION MODE\n"; exit(1); } for(int i=0; i<num_lsms; i++) { generator->initializeEvec(i,evecs[i]); } std::cout<<"This is the master node\n"; // issue initial commands to all LSMS instances running=0; bool more_work=true; if(total_init_steps>0) { for(int i=0; i<num_lsms; i++) { std::cout<<"starting initial calculation in group "<<i<<std::endl; //MPI_Send(evecs[i], 3*size_lsms, MPI_DOUBLE, lsms_rank0[i], 5, MPI_COMM_WORLD); shmem_double_put(evec, evecs[i], 3*size_lsms, lsms_rank0[i]); shmem_int_p(&op, 5, lsms_rank0[i]); shmem_fence(); num_steps--; running++; stepCount++; if(restrict_steps) std::cout<<" "<<num_steps<<" steps remaining\n"; } shmem_barrier(0, lsms_rank0[0], 2, pSync1); // first deal with the initial steps: while(running>0) { //if(return_moments_flag) // 
MPI_Recv(r_values,R_VALUE_OFFSET+3*size_lsms,MPI_DOUBLE,MPI_ANY_SOURCE,MPI_ANY_TAG,MPI_COMM_WORLD,&status); //else // MPI_Recv(r_values,R_VALUE_OFFSET,MPI_DOUBLE,MPI_ANY_SOURCE,MPI_ANY_TAG,MPI_COMM_WORLD,&status); shmem_int_wait(&flag,-1); running--; // std::cout<<"received energy E_tot ="<<r_values[0]<<std::endl; // std::cout<<" band energy E_band="<<r_values[1]<<std::endl; if(total_init_steps>0) { //int r_group=(status.MPI_SOURCE-align)/comm_size; int r_group=(flag-align)/comm_size; std::cout<<"starting additional calculation in group "<<r_group<<std::endl; if(init_steps[r_group]>0) { more_work = !(generator->generateUnsampledEvec(r_group,evecs[r_group],r_values[energyIndex])); init_steps[r_group]--; total_init_steps--; } //MPI_Send(evecs[r_group], 3*size_lsms, MPI_DOUBLE, lsms_rank0[r_group], 5, MPI_COMM_WORLD); shmem_double_put(r_values, evecs[r_group], 3*size_lsms, lsms_rank0[r_group]); //TODO check this shmem_fence(); num_steps--; running++; stepCount++; if(restrict_steps && num_steps<=0) more_work=false; if(restrict_steps) std::cout<<" "<<num_steps<<" steps remaining\n"; walltime = get_rtc() - walltime_0; if(restrict_time && walltime>=max_time) more_work=false; if(restrict_time) std::cout<<" "<<max_time-walltime<<" seconds remaining\n"; } } } more_work=true; running=0; for(int i=0; i<num_lsms; i++) { std::cout<<"starting main calculation in group "<<i<<std::endl; //MPI_Send(evecs[i], 3*size_lsms, MPI_DOUBLE, lsms_rank0[i], 5, MPI_COMM_WORLD); shmem_double_put(evec, evecs[i], 3*size_lsms, lsms_rank0[i]); shmem_int_p(&op, 5, lsms_rank0[i]); shmem_fence(); num_steps--; running++; stepCount++; if(restrict_steps) std::cout<<" "<<num_steps<<" steps remaining\n"; } shmem_barrier(0, lsms_rank0[0], 2, pSync1); generator->startSampling(); // wait for results and issue new commands or wind down while(running>0) { //MPI_Recv(r_values,R_VALUE_OFFSET+3*size_lsms,MPI_DOUBLE,MPI_ANY_SOURCE,MPI_ANY_TAG,MPI_COMM_WORLD,&status); shmem_int_wait(&flag,-1); running--; 
std::cout<<"received energy E_tot ="<<r_values[0]<<std::endl; std::cout<<" band energy E_band="<<r_values[1]<<std::endl; // printf("from status.MPI_SOURCE=%d\n",status.MPI_SOURCE); energy_accumulator+=r_values[0]; energies_accumulated++; if(more_work) { int r_group=(status.MPI_SOURCE-align)/comm_size; std::cout<<"starting additional calculation in group "<<r_group<<std::endl; if(generator_needs_moment) { double m0,m1,m2; m0=0.0; m1=0.0; m2=0.0; for(int i=0; i<3*size_lsms; i+=3) { m0+=r_values[R_VALUE_OFFSET+i]; m1+=r_values[R_VALUE_OFFSET+i+1]; m2+=r_values[R_VALUE_OFFSET+i+2]; } switch(second_dimension) { case MagneticMoment : magnetization=std::sqrt(m0*m0+m1*m1+m2*m2); break; case MagneticMomentX : magnetization=m0; break; case MagneticMomentY : magnetization=m1; break; case MagneticMomentZ : magnetization=m2; break; } if(generator->generateEvec(r_group,evecs[r_group],r_values[energyIndex],magnetization, &accepted)) more_work=false; } else { if(generator->generateEvec(r_group,evecs[r_group],r_values[energyIndex], &accepted)) more_work=false; } //MPI_Send(evecs[r_group], 3*size_lsms, MPI_DOUBLE, lsms_rank0[r_group], 5, MPI_COMM_WORLD); shmem_double_put(r_values, evecs[r_group], 3*size_lsms, lsms_rank0[r_group]); //TODO check this shmem_fence(); num_steps--; running++; stepCount++; if(restrict_steps && num_steps<=0) more_work=false; if(restrict_steps) std::cout<<" "<<num_steps<<" steps remaining\n"; walltime = get_rtc() - walltime_0; if(restrict_time && walltime>=max_time) more_work=false; if(restrict_time) std::cout<<" "<<max_time-walltime<<" seconds remaining\n"; } else { // send an exit message to this instance of LSMS int r_group=(status.MPI_SOURCE-align)/comm_size; MPI_Send(evecs[r_group], 3*size_lsms, MPI_DOUBLE, lsms_rank0[r_group], 2, MPI_COMM_WORLD); } if(step_out_flag && accepted) { step_out_file<<"# iteration "<<energies_accumulated<<std::endl; step_out_file.precision(15); step_out_file<<energies_accumulated<<std::endl; step_out_file<<r_values[0]<<" 
"<<r_values[1]<<std::endl; for(int j=0; j<3*size_lsms; j+=3) { step_out_file<<r_values[j+R_VALUE_OFFSET]<<" "<<r_values[j+R_VALUE_OFFSET+1] <<" "<<r_values[j+R_VALUE_OFFSET+2]<<std::endl; } } // write restart file every restartWriteFrequency seconds if(walltime>nextWriteTime) { generator->writeState("WLrestart.jsn"); nextWriteTime+=restartWriteFrequency; } } generator->writeState("WLrestart.jsn"); /* if(evec_generation_mode==WangLandau_1d) (static_cast<WL1dEvecGenerator<std::mt19937> *>(generator))->writeState("WLrestart.state"); if(evec_generation_mode==ExhaustiveIsing) (static_cast<ExhaustiveIsing1dEvecGenerator *>(generator))->writeState("WLrestart.state"); */ for(int i=0; i<num_lsms; i++) free(evecs[i]); shfree(evecs); //shfree(r_values); } } if(world_rank==0) { if(step_out_flag) { step_out_file<<"# end\n-1\n" <<energy_accumulator/double(energies_accumulated)<<std::endl; step_out_file.close(); } std::cout<<"Finished all scheduled calculations. Freeing resources.\n"; std::cout<<"Energy mean = "<<energy_accumulator/double(energies_accumulated)<<"Ry\n"; } if(num_lsms>1) { // make sure averyone arrives here: MPI_Bcast(stupid,37,MPI_CHAR,0,MPI_COMM_WORLD); if(world_rank==0) { MPI_Comm_free(&local_comm); } else if(my_group>=0) { MPI_Comm_free(&local_comm); } } if(world_rank==0) { double walltime = get_rtc() - walltime_0; std::cout<<" WL-LSMS finished in "<<walltime<<" seconds.\n"; std::cout<<" Monte-Carlo steps / walltime = " <<double(stepCount)/walltime<<"/sec\n"; } #ifdef USE_PAPI PAPI_stop_counters(papi_values,hw_counters); papi_values[hw_counters ] = PAPI_get_real_cyc()-papi_real_cyc_0; papi_values[hw_counters+1] = PAPI_get_real_usec()-papi_real_usec_0; papi_values[hw_counters+2] = PAPI_get_virt_cyc()-papi_virt_cyc_0; papi_values[hw_counters+3] = PAPI_get_virt_usec()-papi_virt_usec_0; long long accumulated_counters[NUM_PAPI_EVENTS+4]; /* for(int i=0; i<hw_counters; i++) { printline(ttos(papi_event_name[i])+" = "+ttos(papi_values[i]), 
std::cout,parameters.myrankWorld); } printline("PAPI real cycles : "+ttos(papi_values[hw_counters]), std::cout,parameters.myrankWorld); printline("PAPI real usecs : "+ttos(papi_values[hw_counters+1]), std::cout,parameters.myrankWorld); printline("PAPI user cycles : "+ttos(papi_values[hw_counters+2]), std::cout,parameters.myrankWorld); printline("PAPI user usecs : "+ttos(papi_values[hw_counters+3]), std::cout,parameters.myrankWorld); */ //MPI_Reduce(papi_values,accumulated_counters,hw_counters+4, // MPI_LONG,MPI_SUM,0,MPI_COMM_WORLD); shmem_long_sum_to_all(accumulated_counters, papi_values, hw_counters+4, comm.pestart, comm.logPE_stride, comm.size, pWrk_i, pSync2); if(world_rank==0) { for(int i=0; i<hw_counters; i++) { std::cout<<"Accumulated: "<<(papi_event_name[i])<<" = "<<(accumulated_counters[i])<<"\n"; } std::cout<<"PAPI accumulated real cycles : "<<(accumulated_counters[hw_counters])<<"\n"; std::cout<<"PAPI accumulated user cycles : "<<(accumulated_counters[hw_counters+2])<<"\n"; double gflops_papi = ((double)accumulated_counters[1])/ (1000.0*(double)papi_values[hw_counters+1]); double gflops_hw_double = ((double)accumulated_counters[2])/ (1000.0*(double)papi_values[hw_counters+1]); double gflops_hw_single = ((double)accumulated_counters[3])/ (1000.0*(double)papi_values[hw_counters+1]); double gips = ((double)accumulated_counters[0])/(1000.0*(double)papi_values[hw_counters+1]); std::cout<<"PAPI_FP_OPS real GFLOP/s : "<<(gflops_papi)<<"\n"; std::cout<<"PAPI hw double real GFLOP/s : "<<(gflops_hw_double)<<"\n"; std::cout<<"PAPI hw single real GFLOP/s : "<<(gflops_hw_single)<<"\n"; std::cout<<"PAPI real GINST/s : "<<(gips)<<"\n"; } #endif //MPI_Finalize(); return 0; }
void* test(void *data) { int unext, last = -1; val_t val = 0; pval_t pval = 0; thread_data_t *d = (thread_data_t *)data; /* Create transaction */ TM_THREAD_ENTER(d->id); set_cpu(the_cores[d->id]); /* Wait on barrier */ ssalloc_init(); PF_CORRECTION; seeds = seed_rand(); #ifdef PIN int id = d->id; int cpu = 40*(id/40) + 4*(id%10) + (id%40)/10; // printf("Pinning %d to %d\n",id,cpu); pin(pthread_self(), cpu); // pin(pthread_self(), id); #endif #ifdef PAPI if (PAPI_OK != PAPI_start_counters(g_events, G_EVENT_COUNT)) { printf("Problem starting counters 1."); } #endif barrier_cross(d->barrier); /* Is the first op an update? */ unext = (rand_range_re(&d->seed, 100) - 1 < d->update); #ifdef DISTRIBUTION_EXPERIMENT while (1) #else while (*running) #endif { if (d->es) { // event simulator experiment if (d->lin) { if (!empty(d->linden_set)) { d->nb_remove++; pval_t pval = deletemin(d->linden_set, d); d->nb_removed++; // printf("%d %d\n", pval, deps[pval][0]); int i = 0; val_t dep; while ((dep = deps[pval][i]) != -1 && i < MAX_DEPS) { d->nb_add++; if (insert(d->linden_set, dep, dep)) { d->nb_added++; } i++; } } } else { if (d->set->head->next[0]->next[0] != NULL) {// set not empty d->nb_remove++; if (d->sl) { // spray list if (spray_delete_min(d->set, &val, d)) { d->nb_removed++; } else { continue; } } else if (d->pq) { // lotan_shavit pq if (lotan_shavit_delete_min(d->set, &val, d)) { d->nb_removed++; // continue; // TODO: maybe try remove this to simulate task handling (dependency checks still occur) } else { continue; } } // struct timespec ten_usec; // ten_usec.tv_sec = 0; // ten_usec.tv_nsec = 10000; // nanosleep(&ten_usec, NULL); // dependency handling int i = 0; val_t dep; while ((dep = deps[val][i]) != -1 && i < MAX_DEPS) { if (!sl_contains(d->set, dep, TRANSACTIONAL)) { // dependent has been removed, need to add it again if (sl_add(d->set, dep, TRANSACTIONAL)) { // check if insert actually succeeded (otherwise someone else did it first) d->nb_added++; } d->nb_add++; 
} i++; } } } } else { // not event simulator if (unext) { // update if (last < 0) { // add val = rand_range_re(&d->seed, d->range); if (d->lin) { pval = val; insert(d->linden_set, pval, pval); d->nb_added++; last = pval; } else { // not linden if (sl_add(d->set, val, TRANSACTIONAL)) { d->nb_added++; last = val; } } d->nb_add++; } else { // remove if (d->pq) { if (lotan_shavit_delete_min(d->set, &val, d)) { d->nb_removed++; if (d->first_remove == -1) { d->first_remove = val; } } last = -1; } else if (d->sl) { if (spray_delete_min(d->set, &val, d)) { d->nb_removed++; if (d->first_remove == -1) { d->first_remove = val; } last = -1; } } else if (d->lin) { if ((pval = deletemin(d->linden_set, d))) { d->nb_removed++; if (d->first_remove == -1) { d->first_remove = pval; } last = -1; } } else if (d->alternate) { // alternate mode (default) if (sl_remove(d->set, last, TRANSACTIONAL)) { d->nb_removed++; if (d->first_remove == -1) { d->first_remove = val; } } last = -1; } else { /* Random computation only in non-alternated cases */ val = rand_range_re(&d->seed, d->range); /* Remove one random value */ if (sl_remove_succ(d->set, val, TRANSACTIONAL)) { d->nb_removed++; if (d->first_remove == -1) { d->first_remove = val; } /* Repeat until successful, to avoid size variations */ last = -1; } } d->nb_remove++; } } else { // read if (d->alternate) { if (d->update == 0) { if (last < 0) { val = d->first; last = val; } else { // last >= 0 val = rand_range_re(&d->seed, d->range); last = -1; } } else { // update != 0 if (last < 0) { val = rand_range_re(&d->seed, d->range); //last = val; } else { val = last; } } } else val = rand_range_re(&d->seed, d->range); PF_START(2); if (sl_contains(d->set, val, TRANSACTIONAL)) d->nb_found++; PF_STOP(2); d->nb_contains++; } /* Is the next op an update? 
*/ if (d->effective) { // a failed remove/add is a read-only tx unext = ((100 * (d->nb_added + d->nb_removed)) < (d->update * (d->nb_add + d->nb_remove + d->nb_contains))); } else { // remove/add (even failed) is considered as an update unext = (rand_range_re(&d->seed, 100) - 1 < d->update); } } #ifdef DISTRIBUTION_EXPERIMENT if (d->first_remove != -1) { break; //only one run } #endif } #ifdef PAPI if (PAPI_OK != PAPI_read_counters(g_values[d->id], G_EVENT_COUNT)) { printf("Problem reading counters 2."); } #endif /* Free transaction */ TM_THREAD_EXIT(); PF_PRINT; return NULL; }
main(int argc, char *argv[]) { float **a,**b,**c; int n; int NB; int i,j; int x; //double t0,t1; struct timeval t0,t1; long mtime, seconds, useconds; // Using PAPI - from countloop.c if (PAPI_VER_CURRENT != PAPI_library_init(PAPI_VER_CURRENT)) ehandler("PAPI_library_init error."); const size_t EVENT_MAX = PAPI_num_counters(); // Suppressing output // printf("# Max counters = %zd\n", EVENT_MAX); if (PAPI_OK != PAPI_query_event(PAPI_TOT_INS)) ehandler("Cannot count PAPI_TOT_INS."); if (PAPI_OK != PAPI_query_event(PAPI_FP_OPS)) ehandler("Cannot count PAPI_FP_OPS."); if (PAPI_OK != PAPI_query_event(PAPI_L1_DCM)) ehandler("Cannot count PAPI_L1_DCM."); size_t EVENT_COUNT = 3; int events[] = { PAPI_TOT_INS, PAPI_FP_OPS, PAPI_L1_DCM }; long long values[EVENT_COUNT]; // Take size from args, not prompt // printf("Enter n: "); scanf("%d",&n); printf("n = %d\n",n); n = atoi(argv[1]); NB = atoi(argv[2]); a = matrix(1,n,1,n); for (i=1; i<=n; i++) for (j=1; j<=n; j++) a[i][j] = i+j; b = matrix(1,n,1,n); for (i=1; i<=n; i++) for (j=1; j<=n; j++) b[i][j] = i-j; //t0 = get_seconds(); gettimeofday(&t0, NULL); // Start PAPI PAPI_start_counters(events, EVENT_COUNT); if (PAPI_OK != PAPI_read_counters(values, EVENT_COUNT)) ehandler("Problem reading counters."); //for (x=0;x<1000;x++){ c = matrix_prod(n,n,n,n,a,b,NB); //} if (PAPI_OK != PAPI_read_counters(values, EVENT_COUNT)) ehandler("Problem reading counters."); //t1 = get_seconds(); gettimeofday(&t1, NULL); seconds = t1.tv_sec - t0.tv_sec; useconds = t1.tv_usec - t0.tv_usec; mtime = ((seconds) * 1000 + useconds/1000.0) + 0.5; //printf("Time for matrix_prod = %f sec\n",t1-t0); printf("%d\t%lld\t%lld\t%lld\t%ld\n", n, values[0], values[1], values[2], mtime); }
int main(int argc, char **argv) { int events[1],i; long long counts[1]; int retval,quiet; int l1_size,l2_size,l1_linesize,l2_entries; int arraysize; char test_string[]="Testing PAPI_L2_DCM predefined event..."; quiet=test_quiet(); retval = PAPI_library_init(PAPI_VER_CURRENT); if (retval != PAPI_VER_CURRENT) { if (!quiet) printf("Error! PAPI_library_init %d\n",retval); test_fail(test_string); } retval = PAPI_query_event(PAPI_L2_DCM); if (retval != PAPI_OK) { if (!quiet) printf("PAPI_L2_DCM not available\n"); test_skip(test_string); } events[0]=PAPI_L2_DCM; l1_size=get_cachesize(L1D_CACHE,quiet,test_string); l1_linesize=get_linesize(L1D_CACHE,quiet,test_string); l2_size=get_cachesize(L2_CACHE,quiet,test_string); l2_entries=get_entries(L2_CACHE,quiet,test_string); /*******************************************************************/ /* Test if the C compiler uses a sane number of data cache acceess */ /*******************************************************************/ arraysize=l2_size/sizeof(double); double *array; double aSumm = 0.0; if (!quiet) { printf("Allocating %ld bytes of memory (%d doubles)\n", arraysize*sizeof(double),arraysize); } array=calloc(arraysize,sizeof(double)); if (array==NULL) { if (!quiet) printf("Error! 
Can't allocate memory\n"); test_fail(test_string); } if (!quiet) printf("Write test:\n"); PAPI_start_counters(events,1); for(i=0; i<arraysize; i++) { array[i]=(double)i; } PAPI_stop_counters(counts,1); if (!quiet) { printf("\tL2 D misses: %lld\n",counts[0]); printf("\tShould be roughly (%d/(%d/%ld)): %ld\n", arraysize,l1_linesize,sizeof(double), arraysize/(l1_linesize/sizeof(double))); } PAPI_start_counters(events,1); for(i=0; i<arraysize; i++) { aSumm += array[i]; } PAPI_stop_counters(counts,1); if (!quiet) { printf("Read test (%lf):\n",aSumm); printf("\tL2 D misses: %lld\n",counts[0]); printf("\tShould be roughly (%d/(%d/%ld)): %ld\n", arraysize,l1_linesize,sizeof(double), arraysize/(l1_linesize/sizeof(double))); } PAPI_shutdown(); test_pass(test_string); return 0; }
/** * The main host function called from outside, as part of the API for a single node. */ unsigned int NumericFormFactorC::compute_form_factor(int rank, // #ifndef __SSE3__ real_vec_t &shape_def, // #else // real_t* shape_def, unsigned int num_triangles, // #endif complex_t* &ff, real_t* &qx, int nqx, real_t* &qy, int nqy, complex_t* &qz, int nqz, real_t* &rot, real_t& kernel_time, real_t& red_time, real_t& mem_time #ifdef FINDBLOCK , const int block_x, const int block_y, const int block_z, const int block_t #endif ) { double temp_mem_time = 0.0, total_mem_time = 0.0; #ifdef _OPENMP if(rank == 0) std::cout << "++ Number of OpenMP threads: " << omp_get_max_threads() << std::endl; #endif // #ifndef __SSE3__ unsigned int num_triangles = shape_def.size() / CPU_T_PROP_SIZE_; // #endif if(num_triangles < 1) return 0; // #ifdef INTEL_SB_AVX // unsigned int shape_padding = (32 - (num_triangles & 31)) & 31; // #elif defined __SSE3__ // unsigned int shape_padding = (16 - (num_triangles & 15)) & 15; // #endif //#ifndef FF_NUM_CPU_PADDING unsigned long int total_qpoints = nqx * nqy * nqz; unsigned long int host_mem_usage = ((unsigned long int) nqx + nqy) * sizeof(real_t) + nqz * sizeof(complex_t); //#else // padding to 16 bytes //const unsigned int PAD_LINE_ = 16; //unsigned int pad_x = 0; //if(nqx != 1) pad_x = (PAD_LINE_ - (nqx % PAD_LINE_)) % PAD_LINE_; //unsigned int pad_y = (PAD_LINE_ - (nqy % PAD_LINE_)) % PAD_LINE_; //unsigned int pad_z = (PAD_LINE_ - (nqz % PAD_LINE_)) % PAD_LINE_; //unsigned int pnqx = nqx + pad_x, pnqy = nqy + pad_y, pnqz = nqz + pad_z; //unsigned long int total_qpoints = pnqx * pnqy * pnqz; //unsigned long int host_mem_usage = ((unsigned long int) pnqx + pnqy) * sizeof(real_t) + // pnqz * sizeof(complex_t); //#endif // allocate memory for the final FF 3D matrix ff = new (std::nothrow) complex_t[total_qpoints]; // allocate and initialize to 0 memset(ff, 0, total_qpoints * sizeof(complex_t)); if(ff == NULL) { std::cerr << "Memory allocation failed 
for ff. Size = " << total_qpoints * sizeof(complex_t) << " b" << std::endl; return 0; } // if host_mem_usage += total_qpoints * sizeof(complex_t); //unsigned long int matrix_size = (unsigned long int) nqx * nqy * nqz * num_triangles; // do hyperblocking to use less memory unsigned int b_nqx = 0, b_nqy = 0, b_nqz = 0, b_num_triangles = 0; #ifndef FF_NUM_CPU_AUTOTUNE_HB compute_block_size(nqx, nqy, nqz, num_triangles, b_nqx, b_nqy, b_nqz, b_num_triangles #ifdef FINDBLOCK , block_x, block_y, block_z, block_t #endif ); #else std::cout << "-- Autotuning hyperblock size ... " << std::endl; double min_time_hb = 1000000.0; unsigned int min_b_nqx = 1, min_b_nqy = 1, min_b_nqz = 1, min_b_num_triangles = 1; woo::BoostChronoTimer at_kernel_timer, at_overhead_timer; at_overhead_timer.start(); complex_t* ff_temp; ff_temp = new (std::nothrow) complex_t[nqx * nqy * nqz]; for(int b_nqx_i = 1; b_nqx_i <= nqx; ++ b_nqx_i) { for(int b_nqy_i = 10; b_nqy_i <= nqy; b_nqy_i += 10) { for(int b_nqz_i = 10; b_nqz_i <= nqz; b_nqz_i += 10) { for(int b_nt_i = 10; b_nt_i <= num_triangles; b_nt_i += 10) { at_kernel_timer.start(); // compute the number of sub-blocks, along each of the 4 dimensions unsigned int nb_x = (unsigned int) ceil((float) nqx / b_nqx_i); unsigned int nb_y = (unsigned int) ceil((float) nqy / b_nqy_i); unsigned int nb_z = (unsigned int) ceil((float) nqz / b_nqz_i); unsigned int nb_t = (unsigned int) ceil((float) num_triangles / b_nt_i); unsigned int num_blocks = nb_x * nb_y * nb_z * nb_t; form_factor_kernel_fused_nqx1(qx, qy, qz, shape_def, b_nqx_i, b_nqy_i, b_nqz_i, b_nt_i, b_nqx_i, b_nqy_i, b_nqz_i, b_nt_i, nqx, nqy, nqz, num_triangles, 0, 0, 0, 0, rot, ff); at_kernel_timer.stop(); double curr_time = at_kernel_timer.elapsed_msec(); double tot_time = curr_time * num_blocks; std::cout << "## " << b_nqx_i << " x " << b_nqy_i << " x " << b_nqz_i << " x " << b_nt_i << "\t" << num_blocks << "\t:\t" << curr_time << "\t" << tot_time << std::endl; if(tot_time < min_time_hb) { 
min_time_hb = tot_time; min_b_nqx = b_nqx_i; min_b_nqy = b_nqy_i; min_b_nqz = b_nqz_i; min_b_num_triangles = b_nt_i; } // if } // for } // for } // for } // for delete[] ff_temp; at_overhead_timer.stop(); b_nqx = min_b_nqx; b_nqy = min_b_nqy; b_nqz = min_b_nqz; b_num_triangles = min_b_num_triangles; if(rank == 0) { std::cout << "## HBlock Autotuner overhead: " << at_overhead_timer.elapsed_msec() << " ms." << std::endl; } // if #endif unsigned long int blocked_3d_matrix_size = (unsigned long int) b_nqx * b_nqy * b_nqz; //size_t estimated_host_mem_need = host_mem_usage + blocked_matrix_size * sizeof(complex_t); //if(rank == 0) { // std::cout << "++ Estimated host memory need: " << (float) estimated_host_mem_need / 1024 / 1024 // << " MB" << std::endl; //} // if #ifndef FF_NUM_CPU_FUSED unsigned long int blocked_matrix_size = (unsigned long int) blocked_3d_matrix_size * b_num_triangles; host_mem_usage += blocked_matrix_size * sizeof(complex_t); complex_t *fq_buffer = new (std::nothrow) complex_t[blocked_matrix_size](); if(fq_buffer == NULL) { std::cerr << "Memory allocation failed for fq_buffer. 
blocked_matrix_size = " << blocked_matrix_size << std::endl << "Host memory usage = " << (float) host_mem_usage / 1024 / 1024 << " MB" << std::endl; delete[] ff; return 0; } // if #endif if(rank == 0) { std::cout << "++ Host memory usage: " << (float) host_mem_usage / 1024 / 1024 << " MB" << std::endl << std::flush; } // if // compute the number of sub-blocks, along each of the 4 dimensions // formulate loops over each dimension, to go over each sub block unsigned int nb_x = (unsigned int) ceil((float) nqx / b_nqx); unsigned int nb_y = (unsigned int) ceil((float) nqy / b_nqy); unsigned int nb_z = (unsigned int) ceil((float) nqz / b_nqz); unsigned int nb_t = (unsigned int) ceil((float) num_triangles / b_num_triangles); unsigned int curr_b_nqx = b_nqx, curr_b_nqy = b_nqy, curr_b_nqz = b_nqz; unsigned int curr_b_num_triangles = b_num_triangles; unsigned int num_blocks = nb_x * nb_y * nb_z * nb_t; #ifdef TIME_DETAIL_2 if(rank == 0) { std::cout << "++ Hyperblock size: " << b_nqx << " x " << b_nqy << " x " << b_nqz << " x " << b_num_triangles << std::endl; std::cout << "++ Number of decomposed Hblocks: " << num_blocks << " [" << nb_x << " x " << nb_y << " x " << nb_z << " x " << nb_t << "]" << std::endl; } // if #endif // TIME_DETAIL_2 unsigned int block_num = 0; #ifdef PROFILE_PAPI long long int papi_total_cycles = 0, papi_total_inst = 0, papi_total_flop = 0; double overall_ipc = 0.0; #endif if(rank == 0) std::cout << "-- Computing form factor on CPU ... 
" << std::flush; woo::BoostChronoTimer kernel_timer; kernel_timer.start(); // compute for each hyperblock curr_b_nqx = b_nqx; for(unsigned int ib_x = 0; ib_x < nb_x; ++ ib_x) { if(ib_x == nb_x - 1) curr_b_nqx = nqx - b_nqx * ib_x; curr_b_nqy = b_nqy; for(unsigned int ib_y = 0; ib_y < nb_y; ++ ib_y) { if(ib_y == nb_y - 1) curr_b_nqy = nqy - b_nqy * ib_y; curr_b_nqz = b_nqz; for(unsigned int ib_z = 0; ib_z < nb_z; ++ ib_z) { if(ib_z == nb_z - 1) curr_b_nqz = nqz - b_nqz * ib_z; curr_b_num_triangles = b_num_triangles; for(unsigned int ib_t = 0; ib_t < nb_t; ++ ib_t) { if(ib_t == nb_t - 1) curr_b_num_triangles = num_triangles - b_num_triangles * ib_t; #ifdef PROFILE_PAPI // PAPI_L1_DCM 0x80000000 No Level 1 data cache misses // PAPI_L1_ICM 0x80000001 No Level 1 instruction cache misses // PAPI_L2_DCM 0x80000002 No Level 2 data cache misses // PAPI_L2_ICM 0x80000003 No Level 2 instruction cache misses // PAPI_L1_TCM 0x80000006 Yes Level 1 cache misses // PAPI_L2_TCM 0x80000007 No Level 2 cache misses // PAPI_FPU_IDL 0x80000012 No Cycles floating point units are idle // PAPI_TLB_DM 0x80000014 No Data translation lookaside buffer misses // PAPI_TLB_IM 0x80000015 No Instruction translation lookaside buffer misses // PAPI_TLB_TL 0x80000016 Yes Total translation lookaside buffer misses // PAPI_STL_ICY 0x80000025 No Cycles with no instruction issue // PAPI_HW_INT 0x80000029 No Hardware interrupts // PAPI_BR_TKN 0x8000002c No Conditional branch instructions taken // PAPI_BR_MSP 0x8000002e No Conditional branch instructions mispredicted // PAPI_TOT_INS 0x80000032 No Instructions completed // PAPI_FP_INS 0x80000034 No Floating point instructions // PAPI_BR_INS 0x80000037 No Branch instructions // PAPI_VEC_INS 0x80000038 No Vector/SIMD instructions (could include integer) // PAPI_RES_STL 0x80000039 No Cycles stalled on any resource // PAPI_TOT_CYC 0x8000003b No Total cycles // PAPI_L1_DCH 0x8000003e Yes Level 1 data cache hits // PAPI_L2_DCH 0x8000003f Yes Level 2 data cache hits 
// PAPI_L1_DCA 0x80000040 No Level 1 data cache accesses // PAPI_L2_DCA 0x80000041 No Level 2 data cache accesses // PAPI_L1_ICH 0x80000049 Yes Level 1 instruction cache hits // PAPI_L2_ICH 0x8000004a No Level 2 instruction cache hits // PAPI_L1_ICA 0x8000004c No Level 1 instruction cache accesses // PAPI_L2_ICA 0x8000004d No Level 2 instruction cache accesses // PAPI_L1_ICR 0x8000004f No Level 1 instruction cache reads // PAPI_L1_TCH 0x80000055 Yes Level 1 total cache hits // PAPI_L2_TCH 0x80000056 Yes Level 2 total cache hits // PAPI_L1_TCA 0x80000058 Yes Level 1 total cache accesses // PAPI_L2_TCA 0x80000059 No Level 2 total cache accesses // PAPI_FML_INS 0x80000061 No Floating point multiply instructions // PAPI_FAD_INS 0x80000062 No Floating point add instructions // (Also includes subtract instructions) // PAPI_FDV_INS 0x80000063 No Floating point divide instructions // (Counts both divide and square root instructions) // PAPI_FSQ_INS 0x80000064 No Floating point square root instructions // (Counts both divide and square root instructions) // PAPI_FP_OPS 0x80000066 No Floating point operations // PAPI_SP_OPS 0x80000067 No Floating point operations; optimized to count // scaled single precision vector operations // PAPI_DP_OPS 0x80000068 No Floating point operations; optimized to count // scaled double precision vector operations int papi_events[3] = { PAPI_TOT_CYC, PAPI_TOT_INS, PAPI_FP_OPS }; //int papi_events[3] = { PAPI_FML_INS, PAPI_FAD_INS, PAPI_FDV_INS }; //int papi_events[3] = { PAPI_FP_OPS, PAPI_SP_OPS, PAPI_DP_OPS }; long long papi_counter_values[3]; PAPI_start_counters(papi_events, 3); #endif // call the main kernel #ifndef FF_NUM_CPU_FUSED // DO NOT USE THIS form_factor_kernel(qx, qy, qz, shape_def, curr_b_nqx, curr_b_nqy, curr_b_nqz, curr_b_num_triangles, b_nqx, b_nqy, b_nqz, b_num_triangles, ib_x, ib_y, ib_z, ib_t, fq_buffer); #else if(nqx == 1) { form_factor_kernel_fused_nqx1(qx, qy, qz, shape_def, //form_factor_kernel_fused_nqx1_unroll4(qx, qy, 
qz, shape_def, curr_b_nqx, curr_b_nqy, curr_b_nqz, curr_b_num_triangles, b_nqx, b_nqy, b_nqz, b_num_triangles, nqx, nqy, nqz, num_triangles, ib_x, ib_y, ib_z, ib_t, rot, ff); } else { // #ifdef __SSE3__ // if(rank == 0) // std::cout << "uh-oh: no SSE3 version!" << std::endl; // #else form_factor_kernel_fused_unroll4(qx, qy, qz, shape_def, curr_b_nqx, curr_b_nqy, curr_b_nqz, curr_b_num_triangles, b_nqx, b_nqy, b_nqz, b_num_triangles, nqx, nqy, nqz, num_triangles, ib_x, ib_y, ib_z, ib_t, rot, ff); // #endif // __SSE3__ } // if-else #endif #ifndef FF_NUM_CPU_FUSED // DO NOT USE THIS // call the reduction kernel reduction_kernel(curr_b_nqx, curr_b_nqy, curr_b_nqz, curr_b_num_triangles, blocked_matrix_size, b_nqx, b_nqy, b_nqz, num_triangles, nqx, nqy, nqz, ib_x, ib_y, ib_z, ib_t, fq_buffer, ff); #endif #ifdef PROFILE_PAPI PAPI_stop_counters(papi_counter_values, 3); papi_total_cycles += papi_counter_values[0]; papi_total_inst += papi_counter_values[1]; papi_total_flop += papi_counter_values[2]; #endif } // for ib_t } // for ib_z } // for ib_y } // for ib_x kernel_timer.stop(); kernel_time = kernel_timer.elapsed_msec(); #ifndef FF_NUM_CPU_FUSED delete[] fq_buffer; #endif if(rank == 0) std::cout << "done." << std::endl; #ifdef PROFILE_PAPI if(rank == 0) { std::cout << "++ PAPI_TOT_CYC: " << papi_total_cycles << std::endl; std::cout << "++ PAPI_TOT_INS: " << papi_total_inst << std::endl; std::cout << "++ PAPI_FP_OPS: " << papi_total_flop << std::endl; std::cout << "++ IPC: " << (double) papi_total_inst / papi_total_cycles << std::endl; } // if #endif return num_triangles; } // NumericFormFactorC::compute_form_factor()
main(int argc, char *argv[]) { float **a,**b,**c; int n,n1,n2; int i,j; //double t0,t1; struct timeval t0,t1; long mtime, seconds, useconds; // Using PAPI - from countloop.c if (PAPI_VER_CURRENT != PAPI_library_init(PAPI_VER_CURRENT)) ehandler("PAPI_library_init error."); const size_t EVENT_MAX = PAPI_num_counters(); // Suppressing output // printf("# Max counters = %zd\n", EVENT_MAX); if (PAPI_OK != PAPI_query_event(PAPI_TOT_INS)) ehandler("Cannot count PAPI_TOT_INS."); if (PAPI_OK != PAPI_query_event(PAPI_FP_OPS)) ehandler("Cannot count PAPI_FP_OPS."); if (PAPI_OK != PAPI_query_event(PAPI_L1_DCM)) ehandler("Cannot count PAPI_L1_DCM."); size_t EVENT_COUNT = 3; int events[] = { PAPI_TOT_INS, PAPI_FP_OPS, PAPI_L1_DCM }; long long values[EVENT_COUNT]; // Take size from args, not prompt // printf("Enter n: "); scanf("%d",&n); printf("n = %d\n",n); n = atoi(argv[1]); //printf("Enter n1: "); scanf("%d",&n1); printf("n1 = %d\n",n1); //printf("Enter n2: "); scanf("%d",&n2); printf("n2 = %d\n",n2); // To conform to the other matrix functions n1 = floor(sqrt(n)); n2 = n1; n = n1*n2; //printf("n = %d X %d = %d\n",n1,n2,n); a = matrix(1,n,1,n); for (i=1;i<=n;i++) for (j=1;j<=n;j++) a[i][j] = i+j; b = matrix(1,n,1,n); for (i=1;i<=n;i++) for (j=1;j<=n;j++) b[i][j] = i-j; //#ifdef PRINT //print_matrix(a,1,n,1,n); //printf("\n"); */ //print_matrix(b,1,n,1,n); //printf("\n"); */ //#endif //t0 = get_seconds(); //c = matrix_prod(n,n,n,n,a,b); //t1 = get_seconds(); //printf("Time for matrix_prod = %f sec\n",t1-t0); //t0 = get_seconds(); gettimeofday(&t0, NULL); // Start PAPI PAPI_start_counters(events, EVENT_COUNT); if (PAPI_OK != PAPI_read_counters(values, EVENT_COUNT)) ehandler("Problem reading counters."); c = block_prod(n1,n1,n1,n2,n2,n2,a,b); if (PAPI_OK != PAPI_read_counters(values, EVENT_COUNT)) ehandler("Problem reading counters."); //t1 = get_seconds(); //printf("Time for block_prod = %f sec\n",t1-t0); gettimeofday(&t1, NULL); seconds = t1.tv_sec - t0.tv_sec; useconds = 
t1.tv_usec - t0.tv_usec; mtime = ((seconds) * 1000 + useconds/1000.0) + 0.5; //printf("Time for matrix_prod = %f sec\n",t1-t0); printf("%d\t%lld\t%lld\t%lld\t%ld\n", n, values[0], values[1], values[2], mtime); }
int run_nothing(void *_p, unsigned long long *data, int *data_len) { struct elim_params *p = (struct elim_params *)_p; mzd_t *A = mzd_init(p->m, p->n); if(p->r != 0) { mzd_t *L, *U; L = mzd_init(p->m, p->m); U = mzd_init(p->m, p->n); mzd_randomize(U); mzd_randomize(L); for (rci_t i = 0; i < p->m; ++i) { for (rci_t j = i + 1; j < p->m; j+=m4ri_radix) { int const length = MIN(m4ri_radix, p->m - j); mzd_clear_bits(L, i, j, length); } mzd_write_bit(L,i,i, 1); for (rci_t j = 0; j < i && j <p->n; j+=m4ri_radix) { int const length = MIN(m4ri_radix, i - j); mzd_clear_bits(U, i, j, length); } if(i < p->r) { mzd_write_bit(U, i, i, 1); } else { for (rci_t j = i; j < p->n; j+=m4ri_radix) { int const length = MIN(m4ri_radix, p->n - j); mzd_clear_bits(U, i, j, length); } } } mzd_mul(A,L,U,0); mzd_free(L); mzd_free(U); } else { mzd_randomize(A); } #ifndef HAVE_LIBPAPI *data_len = 2; #else *data_len = MIN(papi_array_len + 1, *data_len); #endif int papi_res; #ifndef HAVE_LIBPAPI data[0] = walltime(0); data[1] = cpucycles(); #else int array_len = *data_len - 1; unsigned long long t0 = PAPI_get_virt_usec(); papi_res = PAPI_start_counters((int*)papi_events, array_len); if(papi_res) m4ri_die(""); #endif #ifndef HAVE_LIBPAPI data[1] = cpucycles() - data[1]; data[0] = walltime(data[0]); #else PAPI_stop_counters((long long*)&data[1], array_len); t0 = PAPI_get_virt_usec() - t0; data[0] = t0; for (int nv = 0; nv <= array_len; ++nv) { if (data[nv] < loop_calibration[nv]) loop_calibration[nv] = data[nv]; } #endif mzd_free(A); return (0); }
void M3_profile( int sectionID, const char *sectionName, int operationFlag ) { static char *staticTitleString = NULL; static char **staticProfileName = NULL; static int64_t *staticNumCalls = NULL; static double *staticTotalTime = NULL; static double *staticStartTime = NULL; #ifdef USE_PAPI static int64_t *staticFlopCount = NULL; static int64_t *staticFlipCount = NULL; static int64_t *staticFlopCounter = NULL; static int64_t *staticFlipCounter = NULL; #endif static double staticInitTime = 0; static char staticInitDate[256]={0}; static int staticProfileLevel = -1; #ifdef USE_PAPI #define M3_NUM_PAPI_EVENTS 2 int papiEvents[M3_NUM_PAPI_EVENTS] = {PAPI_FP_OPS, PAPI_FP_INS}; static long long int papiCounters[M3_NUM_PAPI_EVENTS] = {0}; #endif double finalTime; int64_t *agInt64 = NULL; double *agDouble = NULL; int64_t i, j; long int k; int myRank = -1; int numProc = 1; FILE *outFile; char *tempPtr, fileName[256], tempString[256]; char myHostname[256] = {0}; double mpiTic; double mpiToc; struct timeval tic; struct timezone tz; time_t tt; long int pid; char pcontrolID[16] = {0}; if( staticProfileLevel == -1 ) { /* Look for environment variable. 
*/ tempPtr = getenv("M3_PROFILE_LEVEL"); if( tempPtr ) staticProfileLevel = atoi( tempPtr ); else staticProfileLevel = M3_PROFILE_LEVEL; } if( staticProfileLevel == 0 ) return; #ifdef USE_MPI MPI_Comm_rank(MPI_COMM_WORLD, &myRank ); MPI_Comm_size(MPI_COMM_WORLD, &numProc ); #endif sprintf(fileName, "M3_Profile(): profile ID out of range, must be between 0 and %i", M3_PROFILE_MAX_SECTIONS - 1); assert(sectionID >= 0 && sectionID < M3_PROFILE_MAX_SECTIONS); switch( operationFlag ) { case M3_PROFILE_INIT: assert(staticProfileName == NULL && staticNumCalls == NULL && staticTotalTime == NULL && staticStartTime == NULL); if( sectionName && strlen(sectionName) ) { staticTitleString = (char *)calloc( 4*(strlen(sectionName)/4 +1 ), sizeof(char) ); assert(staticTitleString != NULL); strcpy(staticTitleString, sectionName ); } staticProfileName = (char **)calloc( M3_PROFILE_MAX_SECTIONS, sizeof(char*) ); staticNumCalls = (int64_t *)calloc( M3_PROFILE_MAX_SECTIONS, sizeof(int64_t) ); staticTotalTime = (double *)calloc( M3_PROFILE_MAX_SECTIONS, sizeof(double) ); staticStartTime = (double *)calloc( M3_PROFILE_MAX_SECTIONS, sizeof(double) ); assert(staticProfileName && staticNumCalls && staticTotalTime && staticStartTime); #ifdef USE_PAPI staticFlopCount = (int64_t *)calloc( M3_PROFILE_MAX_SECTIONS, sizeof(int64_t)); staticFlipCount = (int64_t *)calloc( M3_PROFILE_MAX_SECTIONS, sizeof(int64_t)); staticFlopCounter = (int64_t *)calloc( M3_PROFILE_MAX_SECTIONS, sizeof(int64_t)); staticFlipCounter = (int64_t *)calloc( M3_PROFILE_MAX_SECTIONS, sizeof(int64_t)); assert(staticFlopCount && staticFlipCount); assert(staticFlopCounter && staticFlipCounter); #endif gettimeofday(&tic, &tz); #ifdef USE_MPI staticInitTime = MPI_Wtime( ); #else staticInitTime = tic.tv_sec + tic.tv_usec*1e-6; #endif tt = tic.tv_sec; ctime_r(&tt, staticInitDate ); #ifdef USE_PAPI PAPI_start_counters(papiEvents, M3_NUM_PAPI_EVENTS); #endif #ifdef USE_MPI if (myRank == 0) { mkdir( "m3_profile", S_IRWXU ); } #else 
mkdir( "m3_profile", S_IRWXU ); #endif break; case M3_PROFILE_FINALIZE: /* Check to see if it was initialized */ if( staticProfileName == NULL || staticNumCalls == NULL || staticTotalTime == NULL ) { /* fprintf(stderr, "WARNING: M3_Profile, finalized without initializing\n"); */ break; } myHostname[255] = 0; gethostname(myHostname, 255); pid = (long int)getpid(); for( j = 0; j < 2; j++ ) { #ifdef USE_MPI if( j == 1 ) { /* Get aggregate statistics */ if( myRank == 0 ) { agInt64 = (int64_t*)calloc(M3_PROFILE_MAX_SECTIONS, sizeof(int64_t)); agDouble = (double*)calloc(M3_PROFILE_MAX_SECTIONS, sizeof(double)); assert( agInt64 && agDouble ); } MPI_Reduce( staticNumCalls, agInt64, M3_PROFILE_MAX_SECTIONS, MPI_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD ); if( myRank == 0 ) memcpy( staticNumCalls, agInt64 , sizeof(int64_t)*M3_PROFILE_MAX_SECTIONS ); MPI_Reduce( staticTotalTime, agDouble, M3_PROFILE_MAX_SECTIONS, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD ); if( myRank == 0 ) memcpy( staticTotalTime, agDouble , sizeof(double)*M3_PROFILE_MAX_SECTIONS ); #ifdef USE_PAPI MPI_Reduce( staticFlopCount, agInt64, M3_PROFILE_MAX_SECTIONS, MPI_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD ); if( myRank == 0 ) memcpy( staticFlopCount, agInt64, sizeof(int64_t)*M3_PROFILE_MAX_SECTIONS ); MPI_Reduce( staticFlipCount, agInt64, M3_PROFILE_MAX_SECTIONS, MPI_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD ); if( myRank == 0 ) memcpy( staticFlipCount, agInt64, sizeof(int64_t)*M3_PROFILE_MAX_SECTIONS ); #endif if( myRank == 0 ) { free(agInt64); free(agDouble); } else break; } #else /* If not using mpi, don't need to collect aggregate statistics */ if( j == 1 ) break; #endif k = 60*lrint(staticInitTime/60); /* m3_profile_title_date.proc */ if( staticTitleString ) { tempPtr = strchr( staticTitleString, ' '); if(tempPtr) *tempPtr = '\0'; sprintf( fileName, "m3_profile/m3_profile_%s_%li_%s_%li", staticTitleString, k, myHostname, pid); if(tempPtr) *tempPtr = ' '; } else { sprintf( fileName, "m3_profile/m3_profile_%li", k ); } 
#ifdef USE_MPI if( j == 0 ) sprintf( tempString, ".%i", myRank ); else strcpy( tempString, ".all"); strcat( fileName, tempString ); #endif if( ( staticProfileLevel == 2 ) || ( staticProfileLevel == 1 && j == 0 && numProc == 1 ) || ( staticProfileLevel == 1 && j == 1 ) ) { /* Open the output file. */ outFile = fopen( fileName, "w"); assert(outFile != NULL); /* Write a title */ if( staticTitleString ) fprintf(outFile, "M3_Profile: %s\n\n", staticTitleString ); else fprintf(outFile, "M3_Profile\n\n" ); /* Write the init date, and the run time. */ #ifdef USE_MPI fprintf(outFile, "Number of processors: %i\n", numProc ); finalTime = MPI_Wtime(); #else gettimeofday(&tic, &tz ); finalTime = tic.tv_sec + tic.tv_usec*1e-6; #endif fprintf( outFile, "Start date %s\n", staticInitDate ); fprintf( outFile, "Run time in seconds: %e\n\n", finalTime - staticInitTime ); if( j == 1 ) fprintf(outFile, "Aggregate statistics\n\n"); for( i = 0; i < M3_PROFILE_MAX_SECTIONS; i++ ) { if( staticNumCalls[i] ) { fprintf(outFile, "-----------------------------\n"); fprintf(outFile, " Profile ID number: %lli\n", i); if( staticProfileName[i] ) fprintf(outFile, " %s\n", staticProfileName[i] ); fprintf(outFile, " Total number of calls: %lli\n", staticNumCalls[i]); fprintf(outFile, " Total time (seconds): %e\n", staticTotalTime[i]); fprintf(outFile, " Mean time per call (seconds): %e\n", staticTotalTime[i]/staticNumCalls[i]); fprintf(outFile, " Mean time per task (seconds): %e\n", staticTotalTime[i]/numProc ); fprintf(outFile, " Percent of wall clock %.6f %%\n", staticTotalTime[i]/numProc/(finalTime - staticInitTime)*100 ); #ifdef USE_PAPI fprintf(outFile, " Flop count: %lli\n", staticFlopCount[i]); fprintf(outFile, " Flop rate: %.6e\n", staticFlopCount[i]/staticTotalTime[i]); fprintf(outFile, " Flip count: %lli\n", staticFlipCount[i]); fprintf(outFile, " Flip rate: %.6e\n", staticFlipCount[i]/staticTotalTime[i]); #endif fprintf(outFile, "\n\n"); } } fclose(outFile); } } /* Free up static memory */ 
if( staticTitleString ) { free(staticTitleString); staticTitleString = NULL; } if( staticProfileName ) { for( i = 0; i < M3_PROFILE_MAX_SECTIONS; i++ ) if( staticProfileName[i] ) free( staticProfileName[i] ); free(staticProfileName); staticProfileName = NULL; } if( staticNumCalls ) { free( staticNumCalls ); staticNumCalls = NULL; } if( staticTotalTime ) { free( staticTotalTime ); staticTotalTime = NULL; } if( staticStartTime ) { free(staticStartTime ); staticStartTime = NULL; } #ifdef USE_PAPI if( staticFlopCount ); { free(staticFlopCount); staticFlopCount = NULL; } if( staticFlipCount ); { free(staticFlipCount); staticFlipCount = NULL; } #endif break; case M3_PROFILE_START: if( staticProfileName == NULL || staticNumCalls == NULL || staticTotalTime == NULL ) { /* fprintf(stderr, "WARNING: M3_Profile, called without initializing\n"); */ break; } if( staticProfileName[sectionID] == NULL ) { staticProfileName[sectionID] = (char*)calloc(4*(strlen(sectionName)/4 + 1), sizeof(char)); assert(staticProfileName[sectionID] != NULL); strcpy(staticProfileName[sectionID], sectionName); } #ifdef USE_MPI #ifndef USE_PAPI sprintf( pcontrolID, "%i", sectionID); MPI_Pcontrol( 1, pcontrolID ); #endif #endif #ifdef USE_MPI staticStartTime[sectionID] = MPI_Wtime(); #else gettimeofday(&tic, &tz); staticStartTime[sectionID] = tic.tv_sec + tic.tv_usec*1e-6; #endif #ifdef USE_PAPI PAPI_accum_counters(papiCounters, M3_NUM_PAPI_EVENTS ); staticFlopCounter[sectionID] = papiCounters[0]; staticFlipCounter[sectionID] = papiCounters[1]; #endif break; case M3_PROFILE_STOP: if( staticProfileName == NULL || staticNumCalls == NULL || staticTotalTime == NULL ) { /* fprintf(stderr, "WARNING: M3_Profile, called without initializing\n"); */ break; } #ifdef USE_MPI #ifndef USE_PAPI sprintf( pcontrolID, "%i", sectionID); MPI_Pcontrol( -1, pcontrolID ); #endif #endif staticNumCalls[sectionID]++; #ifdef USE_MPI staticTotalTime[sectionID] += MPI_Wtime() - staticStartTime[sectionID]; #else gettimeofday(&tic, 
&tz); staticTotalTime[sectionID] += (tic.tv_sec + tic.tv_usec*1e-6) - staticStartTime[sectionID]; #endif #ifdef USE_PAPI PAPI_accum_counters(papiCounters, M3_NUM_PAPI_EVENTS ); staticFlopCount[sectionID] += papiCounters[0] - staticFlopCounter[sectionID]; staticFlipCount[sectionID] += papiCounters[1] - staticFlipCounter[sectionID]; #endif break; } }
int run(void *_p, unsigned long long *data, int *data_len) { struct elim_params *p = (struct elim_params *)_p; #ifndef HAVE_LIBPAPI *data_len = 2; #else *data_len = MIN(papi_array_len + 1, *data_len); #endif int papi_res; mzd_t *A = mzd_init(p->m, p->n); if(p->r != 0) { mzd_t *L, *U; L = mzd_init(p->m, p->m); U = mzd_init(p->m, p->n); mzd_randomize(U); mzd_randomize(L); for (rci_t i = 0; i < p->m; ++i) { for (rci_t j = i + 1; j < p->m; j+=m4ri_radix) { int const length = MIN(m4ri_radix, p->m - j); mzd_clear_bits(L, i, j, length); } mzd_write_bit(L,i,i, 1); for (rci_t j = 0; j < i && j < p->n; j+=m4ri_radix) { int const length = MIN(m4ri_radix, i - j); mzd_clear_bits(U, i, j, length); } if(i < p->r) { mzd_write_bit(U, i, i, 1); } else { for (rci_t j = i; j < p->n; j+=m4ri_radix) { int const length = MIN(m4ri_radix, p->n - i); mzd_clear_bits(U, i, j, length); } } } mzd_mul(A,L,U,0); mzd_free(L); mzd_free(U); } else { mzd_randomize(A); } mzp_t *P = mzp_init(A->nrows); mzp_t *Q = mzp_init(A->ncols); #ifndef HAVE_LIBPAPI data[0] = walltime(0); data[1] = cpucycles(); #else int array_len = *data_len - 1; unsigned long long t0 = PAPI_get_virt_usec(); papi_res = PAPI_start_counters((int*)papi_events, array_len); if (papi_res) m4ri_die(""); #endif if(strcmp(p->algorithm, "m4ri") == 0) p->r = mzd_echelonize_m4ri(A, 0, 0); else if(strcmp(p->algorithm, "ple") == 0) p->r = mzd_ple(A, P, Q, 0); else if(strcmp(p->algorithm, "mmpf") == 0) p->r = _mzd_ple_russian(A, P, Q, 0); else m4ri_die("unknown algorithm %s",p->algorithm); #ifndef HAVE_LIBPAPI data[1] = cpucycles() - data[1]; data[0] = walltime(data[0]); #else mzp_free(P); mzp_free(Q); PAPI_stop_counters((long long*)&data[1], array_len); t0 = PAPI_get_virt_usec() - t0; data[0] = t0; for (int nv = 0; nv <= array_len; ++nv) { data[nv] -= loop_calibration[nv]; } #endif mzd_free(A); return 0; }