/** Log performance counters into "pstats.dat" */ static void log_counters( FILE *out, const char *phase, long long *pTic, long long values[] ) { long long toc = PAPI_get_real_usec(); #ifdef CACHE_PROFILE if( PAPI_read_counters( values, 4 ) != PAPI_OK ) { handle_error( 1 ); } fprintf( out, "%s PAPI_L2_TCM %lld\n", phase, values[0] ); fprintf( out, "%s PAPI_L2_TCA %lld\n", phase, values[2] ); fprintf( out, "%s L2MissRate %.4lf%\n", phase, ( double )values[0] / ( double )values[2] ); fprintf( out, "%s PAPI_L3_TCM %lld\n", phase, values[1] ); fprintf( out, "%s PAPI_L3_TCA %lld\n", phase, values[3] ); fprintf( out, "%s L3MissRate %.4lf%\n", phase, ( double )values[1] / ( double )values[3] ); #else if( PAPI_read_counters( values, 1 ) != PAPI_OK ) { handle_error( 1 ); } fprintf( out, "%s PAPI_FP_OPS %lld\n", phase, values[0] ); #endif fprintf( out, "%s RealTime %.4lfs\n", phase, ( toc - *pTic ) * 1e-6 ); *pTic = PAPI_get_real_usec(); }
int main(int argc, char *argv[]) { double a[MAXVSIZE], b[MAXVSIZE], c[MAXVSIZE]; int i,n; long long before, after; if (PAPI_VER_CURRENT != PAPI_library_init(PAPI_VER_CURRENT)) ehandler("PAPI_library_init error."); const size_t EVENT_MAX = PAPI_num_counters(); printf("# Max counters = %zd\n", EVENT_MAX); if (PAPI_OK != PAPI_query_event(PAPI_TOT_INS)) ehandler("Cannot count PAPI_TOT_INS."); if (PAPI_OK != PAPI_query_event(PAPI_FP_OPS)) ehandler("Cannot count PAPI_FP_OPS."); size_t EVENT_COUNT = 2; int events[] = { PAPI_TOT_INS, PAPI_FP_OPS }; long long values[EVENT_COUNT]; printf("Enter vector size: "); scanf("%d",&n); for (i=0;i<n;i++) { a[i] = i; b[i] = n-i; } PAPI_start_counters(events, EVENT_COUNT); if(PAPI_OK != PAPI_read_counters(values, EVENT_COUNT)) ehandler("Problem reading counters."); loop(c,a,b,n); if(PAPI_OK != PAPI_read_counters(values, EVENT_COUNT)) ehandler("Problem reading counters."); printf("Number of instructions = %lld\n",values[0]); printf("Number of fp operations = %lld\n",values[1]); return 0; }
/**
 * Read (and, per the original note, reset) the single active PAPI counter.
 *
 * @return the counter value converted to double; 0.0 when the read did not
 *         store a value.
 */
double papi_generic_call()
{
  /* Zero-initialized so a failed read cannot return an indeterminate value. */
  long_long tmp[1] = { 0 };

  /* Read and reset the counters.
   * The disabled conditional affects the reading of the performance
   * counters, but might be good for debugging.
   * NOTE: PAPI_accum_counters does not work properly.
   */
#if 0
  if (PAPI_read_counters(tmp, 1) != PAPI_OK)
    papi_eprintf("Problem reading counters %s:%d.\n", __FILE__, __LINE__);
#else
  PAPI_read_counters(tmp, 1);
#endif

  return ((double) tmp[0]);
}
int main(int argc, char **argv) { int retval; retval = PAPI_library_init(PAPI_VER_CURRENT); if (retval != PAPI_VER_CURRENT) { fprintf(stderr,"Error! PAPI_library_init %d\n", retval); } retval = PAPI_query_event(PAPI_TOT_INS); if (retval != PAPI_OK) { fprintf(stderr,"PAPI_TOT_INS not supported\n"); exit(1); } int i; int events[1],result; long long counts[1]; long long total=0,average,max=0,min=0x7ffffffffffffffULL; events[0]=PAPI_TOT_INS; PAPI_start_counters(events,1); for(i=0;i<NUM_RUNS;i++) { result=instructions_million(); PAPI_read_counters(counts,1); results[i]=counts[0]; } PAPI_stop_counters(counts,1); PAPI_shutdown(); for(i=0;i<NUM_RUNS;i++) { total+=results[i]; if (results[i]>max) max=results[i]; if (results[i]<min) min=results[i]; } average=total/NUM_RUNS; printf("Average=%lld max=%lld min=%lld\n",average,max,min); (void) result; return 0; }
int main (int argc, char *argv[]) { int i, count; int *array = (int*) malloc (SIZE * sizeof(int)); uint64_t start, end; int events[3] = { PAPI_L1_DCM, PAPI_L2_DCM, PAPI_L3_DCM }; long long misses[3]; int papilevels = 3; if (PAPI_library_init(PAPI_VER_CURRENT) != PAPI_VER_CURRENT) { exit(1); } //Initialization count = 0; srand(time(NULL)); init_time(); for (i = 0; i < SIZE; i++) array[i] = rand(); // Uncomment this line to qsort(array, SIZE, sizeof (int), compare_ints); //Measurement while (PAPI_start_counters(events, papilevels) != PAPI_OK) { papilevels--; } start = get_time(); /* * É possível, em um vetor ordenado, fazer a contagem * em tempo O(lg(n)) em vez de O(n) utilizando busca * binária. Embora isto diminua sensivelmente o tempo * este não é o intuito deste exercício. * */ for (i = 0; i < SIZE; i++) if (array[i] < RAND_MAX / 2) count++; end = get_time(); uint64_t exec_time = diff_time(start, end); if (PAPI_read_counters(misses, papilevels) != PAPI_OK) { fprintf(stderr, "Erro em PAPI_read_counters\n"); exit(1); } printf("Time: %" PRIu64 " Count %d\n", exec_time, count); for (i = 0; i < papilevels; i++) { printf("Cache misses (L%d): %lld\n", i+1, misses[i]); } free(array); return 0; }
/**
 * Read the PAPI counters into the global `values` and append the L1 miss
 * and access counts plus the miss rate (in percent) to `res_file`.
 *
 * @param phase  prefix used for every key written
 * @return 0 on success, -1 when the counters could not be read
 */
int test_measure(char* phase)
{
	double rate;

	if ( PAPI_read_counters( values, NUM_EVENTS ) != PAPI_OK )
		return -1;

	/* values[0] = misses, values[1] = accesses; report 0 when nothing was accessed. */
	rate = ( values[1] != 0 ) ? (double)values[0] / (double)values[1] : 0.0;

	fprintf(res_file, "%s_PAPI_L1_TCM=%lld\n", phase, values[0]);
	fprintf(res_file, "%s_PAPI_L1_TCA=%lld\n", phase, values[1]);
	fprintf(res_file, "%s_L1MissRate=%f%%\n", phase, (rate * 100.0));
	return 0;
}
/**
 * Read `n` PAPI counters into a scratch buffer and discard the values.
 *
 * @param n  number of counters to read
 */
inline void papi_reset(size_t n)
{
  long_long *papi_tmp;

  papi_tmp = malloc(sizeof(*papi_tmp) * n);
  if (papi_tmp == NULL) {
    /* Never hand a null buffer to PAPI. */
    papi_eprintf("Could not allocate counter buffer %s:%d.\n", __FILE__, __LINE__);
    return;
  }

  if (PAPI_read_counters(papi_tmp, n) != PAPI_OK)
    papi_eprintf("Problem reading counters %s:%d.\n", __FILE__, __LINE__);

  free(papi_tmp);
}
int main () { float t0, t1; int iter, i, j; int events[2] = {PAPI_L1_DCM, PAPI_FP_OPS }, ret; long_long values[2]; if (PAPI_num_counters() < 2) { fprintf(stderr, "No hardware counters here, or PAPI not supported.\n"); exit(1); } for (i = 0; i < MX; i++) { if ((ad[i] = malloc(sizeof(double)*MX)) == NULL) { fprintf(stderr,"malloc failed\n"); exit(1); } } for (j = 0; j < MX; j++) { for (i = 0; i < MX; i++) { ad[i][j] = 1.0/3.0; /* Initialize the data */ } } t0 = gettime(); if ((ret = PAPI_start_counters(events, 2)) != PAPI_OK) { fprintf(stderr, "PAPI failed to start counters: %s\n", PAPI_strerror(ret)); exit(1); } for (iter = 0; iter < NITER; iter++) { for (j = 0; j < MX; j++) { for (i = 0; i < MX; i++) { ad[i][j] += ad[i][j] * 3.0; } } } if ((ret = PAPI_read_counters(values, 2)) != PAPI_OK) { fprintf(stderr, "PAPI failed to read counters: %s\n", PAPI_strerror(ret)); exit(1); } t1 = gettime(); printf("Total software flops = %f\n",(float)TOT_FLOPS); printf("Total hardware flops = %lld\n",(float)values[1]); printf("MFlop/s = %f\n", (float)(TOT_FLOPS/MEGA)/(t1-t0)); printf("L1 data cache misses is %lld\n", values[0]); }
int main() { //this will fail if some counters can't be accessed if (PAPI_start_counters(papi_events, n_papi_events) != PAPI_OK) { printf("failed to start papi\n"); return 1; } doWork(123); if (PAPI_read_counters(papi_values[0], n_papi_events) != PAPI_OK) { printf("failed to read countess\n"); return 1; } printf("counters' values: misses = %d, accesses = %d\n", papi_values[0][0], papi_values[0][1]); return 0; }
static void start_sssp(FibHeap<size_t, size_t> *pq, vertex_t *graph) { #ifdef PAPI if (PAPI_OK != PAPI_start_counters(g_events, G_EVENT_COUNT)) { std::cout << ("Problem starting counters 1.\n"); } #endif while (!pq->empty()) { size_t distance; size_t node; pq->pop(distance, node); vertex_t *v = &graph[node]; size_t v_dist = v->distance; for (size_t i = 0; i < v->num_edges; i++) { const edge_t *e = &v->edges[i]; const size_t new_dist = v_dist + e->weight; vertex_t *w = &graph[e->target]; size_t w_dist = w->distance; if (new_dist < w_dist) { w->distance = new_dist; if (w->n == NULL) { w->n = pq->push(new_dist, e->target); } else { pq->decrease_key(w->n, new_dist); } } } } #ifdef PAPI if (PAPI_OK != PAPI_read_counters(g_values[0], G_EVENT_COUNT)) { std::cout << ("Problem reading counters 2.\n"); } #endif }
/**
 * Print the current value of every counter, stop counting, and release the
 * global `values` buffer.
 *
 * Exits the process with status 1 when reading or stopping fails.
 *
 * @param events      event codes, printed next to their counter values
 * @param NUM_EVENTS  number of entries in `events` and `values`
 */
void my_papi_stop(int *events, int NUM_EVENTS)
{
  int j;

  /* Read the counters */
  if (PAPI_read_counters(values, NUM_EVENTS) != PAPI_OK) {
    fprintf(stderr, "PAPI_read_counters - FAILED\n");
    exit(1);
  }

  for (j = 0; j < NUM_EVENTS; j++) {
    printf("GG: %d : %lld\n", events[j], values[j]);
  }

  /* Stop counting events */
  if (PAPI_stop_counters(values, NUM_EVENTS) != PAPI_OK) {
    fprintf(stderr, "PAPI_stoped_counters - FAILED\n");
    exit(1);
  }

  /* free(NULL) is a no-op; clear the global so it cannot be reused or freed twice. */
  free(values);
  values = NULL;
}
void papi_set_events(char *metric) { const size_t n = 1; int max; long_long *papi_tmp; int papi_events[1]; int code; max = PAPI_num_counters(); if (n > max) papi_eprintf("Too many counters requested.\n"); papi_tmp = malloc(sizeof(*papi_tmp) * n); PAPI_reset(max); PAPI_stop_counters(papi_tmp, n); if (PAPI_event_name_to_code(metric, &code) != PAPI_OK) papi_eprintf("Unknown PAPI event %s.\n", metric); if (code == 0) papi_eprintf("Unknown PAPI event %s.\n", metric); papi_events[0] = code; PAPI_start_counters(papi_events, n); if (PAPI_read_counters(papi_tmp, n) != PAPI_OK) papi_eprintf("Problem reading counters %s:%d.\n", __FILE__, __LINE__); free(papi_tmp); }
void* test(void *data) { int unext, last = -1; val_t val = 0; pval_t pval = 0; thread_data_t *d = (thread_data_t *)data; /* Create transaction */ TM_THREAD_ENTER(d->id); set_cpu(the_cores[d->id]); /* Wait on barrier */ ssalloc_init(); PF_CORRECTION; seeds = seed_rand(); #ifdef PIN int id = d->id; int cpu = 40*(id/40) + 4*(id%10) + (id%40)/10; // printf("Pinning %d to %d\n",id,cpu); pin(pthread_self(), cpu); // pin(pthread_self(), id); #endif #ifdef PAPI if (PAPI_OK != PAPI_start_counters(g_events, G_EVENT_COUNT)) { printf("Problem starting counters 1."); } #endif barrier_cross(d->barrier); /* Is the first op an update? */ unext = (rand_range_re(&d->seed, 100) - 1 < d->update); #ifdef DISTRIBUTION_EXPERIMENT while (1) #else while (*running) #endif { if (d->es) { // event simulator experiment if (d->lin) { if (!empty(d->linden_set)) { d->nb_remove++; pval_t pval = deletemin(d->linden_set, d); d->nb_removed++; // printf("%d %d\n", pval, deps[pval][0]); int i = 0; val_t dep; while ((dep = deps[pval][i]) != -1 && i < MAX_DEPS) { d->nb_add++; if (insert(d->linden_set, dep, dep)) { d->nb_added++; } i++; } } } else { if (d->set->head->next[0]->next[0] != NULL) {// set not empty d->nb_remove++; if (d->sl) { // spray list if (spray_delete_min(d->set, &val, d)) { d->nb_removed++; } else { continue; } } else if (d->pq) { // lotan_shavit pq if (lotan_shavit_delete_min(d->set, &val, d)) { d->nb_removed++; // continue; // TODO: maybe try remove this to simulate task handling (dependency checks still occur) } else { continue; } } // struct timespec ten_usec; // ten_usec.tv_sec = 0; // ten_usec.tv_nsec = 10000; // nanosleep(&ten_usec, NULL); // dependency handling int i = 0; val_t dep; while ((dep = deps[val][i]) != -1 && i < MAX_DEPS) { if (!sl_contains(d->set, dep, TRANSACTIONAL)) { // dependent has been removed, need to add it again if (sl_add(d->set, dep, TRANSACTIONAL)) { // check if insert actually succeeded (otherwise someone else did it first) d->nb_added++; } d->nb_add++; 
} i++; } } } } else { // not event simulator if (unext) { // update if (last < 0) { // add val = rand_range_re(&d->seed, d->range); if (d->lin) { pval = val; insert(d->linden_set, pval, pval); d->nb_added++; last = pval; } else { // not linden if (sl_add(d->set, val, TRANSACTIONAL)) { d->nb_added++; last = val; } } d->nb_add++; } else { // remove if (d->pq) { if (lotan_shavit_delete_min(d->set, &val, d)) { d->nb_removed++; if (d->first_remove == -1) { d->first_remove = val; } } last = -1; } else if (d->sl) { if (spray_delete_min(d->set, &val, d)) { d->nb_removed++; if (d->first_remove == -1) { d->first_remove = val; } last = -1; } } else if (d->lin) { if ((pval = deletemin(d->linden_set, d))) { d->nb_removed++; if (d->first_remove == -1) { d->first_remove = pval; } last = -1; } } else if (d->alternate) { // alternate mode (default) if (sl_remove(d->set, last, TRANSACTIONAL)) { d->nb_removed++; if (d->first_remove == -1) { d->first_remove = val; } } last = -1; } else { /* Random computation only in non-alternated cases */ val = rand_range_re(&d->seed, d->range); /* Remove one random value */ if (sl_remove_succ(d->set, val, TRANSACTIONAL)) { d->nb_removed++; if (d->first_remove == -1) { d->first_remove = val; } /* Repeat until successful, to avoid size variations */ last = -1; } } d->nb_remove++; } } else { // read if (d->alternate) { if (d->update == 0) { if (last < 0) { val = d->first; last = val; } else { // last >= 0 val = rand_range_re(&d->seed, d->range); last = -1; } } else { // update != 0 if (last < 0) { val = rand_range_re(&d->seed, d->range); //last = val; } else { val = last; } } } else val = rand_range_re(&d->seed, d->range); PF_START(2); if (sl_contains(d->set, val, TRANSACTIONAL)) d->nb_found++; PF_STOP(2); d->nb_contains++; } /* Is the next op an update? 
*/ if (d->effective) { // a failed remove/add is a read-only tx unext = ((100 * (d->nb_added + d->nb_removed)) < (d->update * (d->nb_add + d->nb_remove + d->nb_contains))); } else { // remove/add (even failed) is considered as an update unext = (rand_range_re(&d->seed, 100) - 1 < d->update); } } #ifdef DISTRIBUTION_EXPERIMENT if (d->first_remove != -1) { break; //only one run } #endif } #ifdef PAPI if (PAPI_OK != PAPI_read_counters(g_values[d->id], G_EVENT_COUNT)) { printf("Problem reading counters 2."); } #endif /* Free transaction */ TM_THREAD_EXIT(); PF_PRINT; return NULL; }
main(int argc, char *argv[]) { float **a,**b,**c; int n,n1,n2; int i,j; //double t0,t1; struct timeval t0,t1; long mtime, seconds, useconds; // Using PAPI - from countloop.c if (PAPI_VER_CURRENT != PAPI_library_init(PAPI_VER_CURRENT)) ehandler("PAPI_library_init error."); const size_t EVENT_MAX = PAPI_num_counters(); // Suppressing output // printf("# Max counters = %zd\n", EVENT_MAX); if (PAPI_OK != PAPI_query_event(PAPI_TOT_INS)) ehandler("Cannot count PAPI_TOT_INS."); if (PAPI_OK != PAPI_query_event(PAPI_FP_OPS)) ehandler("Cannot count PAPI_FP_OPS."); if (PAPI_OK != PAPI_query_event(PAPI_L1_DCM)) ehandler("Cannot count PAPI_L1_DCM."); size_t EVENT_COUNT = 3; int events[] = { PAPI_TOT_INS, PAPI_FP_OPS, PAPI_L1_DCM }; long long values[EVENT_COUNT]; // Take size from args, not prompt // printf("Enter n: "); scanf("%d",&n); printf("n = %d\n",n); n = atoi(argv[1]); //printf("Enter n1: "); scanf("%d",&n1); printf("n1 = %d\n",n1); //printf("Enter n2: "); scanf("%d",&n2); printf("n2 = %d\n",n2); // To conform to the other matrix functions n1 = floor(sqrt(n)); n2 = n1; n = n1*n2; //printf("n = %d X %d = %d\n",n1,n2,n); a = matrix(1,n,1,n); for (i=1;i<=n;i++) for (j=1;j<=n;j++) a[i][j] = i+j; b = matrix(1,n,1,n); for (i=1;i<=n;i++) for (j=1;j<=n;j++) b[i][j] = i-j; //#ifdef PRINT //print_matrix(a,1,n,1,n); //printf("\n"); */ //print_matrix(b,1,n,1,n); //printf("\n"); */ //#endif //t0 = get_seconds(); //c = matrix_prod(n,n,n,n,a,b); //t1 = get_seconds(); //printf("Time for matrix_prod = %f sec\n",t1-t0); //t0 = get_seconds(); gettimeofday(&t0, NULL); // Start PAPI PAPI_start_counters(events, EVENT_COUNT); if (PAPI_OK != PAPI_read_counters(values, EVENT_COUNT)) ehandler("Problem reading counters."); c = block_prod(n1,n1,n1,n2,n2,n2,a,b); if (PAPI_OK != PAPI_read_counters(values, EVENT_COUNT)) ehandler("Problem reading counters."); //t1 = get_seconds(); //printf("Time for block_prod = %f sec\n",t1-t0); gettimeofday(&t1, NULL); seconds = t1.tv_sec - t0.tv_sec; useconds = 
t1.tv_usec - t0.tv_usec; mtime = ((seconds) * 1000 + useconds/1000.0) + 0.5; //printf("Time for matrix_prod = %f sec\n",t1-t0); printf("%d\t%lld\t%lld\t%lld\t%ld\n", n, values[0], values[1], values[2], mtime); }
void* Thread(void *userData) { ThreadInfo *info = (ThreadInfo*) userData; Context *c = info->c; int index = info->index; int threadCount = c->threadCount; int64_t repetitionCount = c->repetitionCount; uint64_t me = 0x1 << index; uint64_t full = 0x0000000000000000; uint64_t copy; //thread local copy of the entry/exit barrier for (int i = 0; i < threadCount; ++i) { full |= 0x1 << i; } // set thread affinity cpu_set_t cpuset; CPU_ZERO(&cpuset); CPU_SET(index, &cpuset); assert(pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset) == 0); //DEBUG //pthread_getaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset); //printf("%i uses cpus: ", index); //for (int i = 0; i < threadCount; ++i) { // if (CPU_ISSET(i, &cpuset)) { // printf("%i, ", i); // } //} //printf("\n"); int threadToBeRecorded = -1; int papiEvents[3] = {0x8000003b, 0x80000000, 0x80000002}; long long papiStart[3] = {0, 0, 0}; long long papiEnd[3] = {0, 0, 0}; if (index == threadToBeRecorded) { int ret = PAPI_start_counters(papiEvents, 3); if (ret != 0) { printf("thread %i: PAPI_start_counters %i\n", index, ret); assert(0); } ret = PAPI_read_counters(papiStart, 3); if (ret != 0) { printf("thread %i: PAPI_read_counters %i\n", index, ret); assert(0); } } //unlink("a"); //FILE *log = fopen("a", "a"); for(int64_t repetition = 0; repetition < repetitionCount; repetition++){ if (c->left == 0) { /* *** if () { UNIFIED ENTRY *********************/ /* run to wall and wait busily */ do { copy = c->entry; //fprintf(log, "%i r %lli\n", prime, (long long) copy); //fflush(log); if ((copy & me) == 0) { copy |= me; c->entry = copy; //fprintf(log, "%i w %lli\n", prime, (long long) copy); //fflush(log); } }while (copy != full && c->left == 0); c->left = 1; c->exit = 0x0000000000000000; } else if (c->left == 1) { /* *** } else if () { UNIFIED ENTRY *******/ for (int i = 0; i < threadCount - 1; ++i) { if (c->successfulBarrierVisitsCount[i] != c->successfulBarrierVisitsCount[i+1]) { printf("thread %i and %i are not 
equal at %lli %lli\n", i, i+1, (long long)c->successfulBarrierVisitsCount[i], (long long)c->successfulBarrierVisitsCount[i+1]); ++c->outOfSyncCount; assert(0); } } /* wait busily until everyone has left the barrier */ do { copy = c->exit; if ((copy & me) == 0) { copy |= me; c->exit = copy; } }while (copy != full && c->left == 1); c->left = 0; c->entry = 0x0000000000000000; ++(c->successfulBarrierVisitsCount[index]); } /* *** } UNIFIED ENTRY *********************************************/ } if (index == threadToBeRecorded) { int ret = PAPI_stop_counters(papiEnd, 3); if (ret != 0) { printf("%i: PAPI_stop_counters %i\n", index, ret); assert(0); } printf("thread %i: papi counter 0: %lli - %lli = %lli\n", index, papiEnd[0], papiStart[0], papiEnd[0] - papiStart[0]); printf("thread %i: papi counter 1: %lli - %lli = %lli\n", index, papiEnd[1], papiStart[1], papiEnd[1] - papiStart[1]); printf("thread %i: papi counter 2: %lli - %lli = %lli\n", index, papiEnd[2], papiStart[2], papiEnd[2] - papiStart[2]); printf("\n"); } return NULL; }
int main() { int level, numlevels; /* Variaveis das medicoes reais (hwloc) */ hwloc_uint64_t real_cache[3]; unsigned real_line[3]; hwloc_topology_t topology; hwloc_obj_t obj; /* Variaveis das medicoes estimadas (PAPI) */ int events[3] = { PAPI_L1_DCM, PAPI_L2_DCM, PAPI_L3_DCM }; long long misses[3]; int papilevels = 3; /* Inicializa hwloc */ if (hwloc_topology_init(&topology)) { fprintf(stderr, "Erro em hwloc_topology_init\n"); exit(1); } hwloc_topology_load(topology); /* Pega tamanhos reais de cache e line */ level = 0; for (obj = hwloc_get_obj_by_type(topology, HWLOC_OBJ_PU, 0); obj; obj = obj->parent) { if (obj->type == HWLOC_OBJ_CACHE) { real_cache[level] = obj->attr->cache.size; real_line[level] = obj->attr->cache.linesize; level++; } } numlevels = level; /* Comeca a contar eventos */ while (PAPI_start_counters(events, papilevels) != PAPI_OK) { papilevels--; } /* TODO: alguma coisa para estimar */ /* Le os contadores */ if (PAPI_read_counters(misses, papilevels) != PAPI_OK) { fprintf(stderr, "Erro em PAPI_read_counters\n"); exit(1); } for (level = 0; level < papilevels; level++) { printf("L%d cache miss: %lld\n", level+1, misses[level]); } printf("\n"); /* Imprime tabela */ printf("Nível Linha Real Cache Real Linha Est. Cache Est.\n"); printf("----- ---------- ---------- ---------- ----------\n"); for (level = 0; level < 3; level++) { printf("%5d ", level+1); if (level < numlevels) { printf("%10u ", real_line[level]); printf("%9lluK ", (unsigned long long) real_cache[level] / 1024); if (level < papilevels) { /* TODO: imprimir valores estimados no lugar dos reais */ printf("%10u ", real_line[level]); printf("%9lluK", (unsigned long long) real_cache[level] / 1024); } else { printf(" N/D N/D"); } } else { printf(" N/D N/D N/D N/D"); } printf("\n"); } return 0; }
int main(int argc, char *argv[]) { double *collideField = NULL; double *streamField = NULL; char problem[100]; char pgmInput[1000]; int *flagField = NULL; clock_t begin, end; double time_spent; struct timeval time_start, time_end; int xlength[3], timesteps, timestepsPerPlotting; double tau, bddParams[7]; long long counters[3]; int PAPI_events[] = { PAPI_TOT_CYC, PAPI_L2_DCM, PAPI_L2_DCA }; PAPI_library_init(PAPI_VER_CURRENT); #ifdef DEBUG double * exactCollideField; // used for debugging #endif if(readParameters(xlength, &tau, bddParams, ×teps, ×tepsPerPlotting, problem, pgmInput, argc, argv) == 0) { begin = clock(); gettimeofday(&time_start, NULL); collideField = (double*) malloc((size_t) sizeof(double) * PARAMQ * (xlength[0] + 2)*(xlength[1] + 2)*(xlength[2] + 2)); streamField = (double*) malloc((size_t) sizeof(double) * PARAMQ * (xlength[0] + 2)*(xlength[1] + 2)*(xlength[2] + 2)); flagField = (int *) malloc((size_t) sizeof (int) * (xlength[0] + 2)*(xlength[1] + 2)*(xlength[2] + 2)); initialiseFields(collideField, streamField, flagField, xlength, problem, pgmInput); /** debugging code */ // /* output the flagField */ // char szFileName2[80]; // FILE *fp2 = NULL; // sprintf( szFileName2, "Testdata/%s/flagField.dat", problem); // fp2 = fopen(szFileName2,"w"); // for (int i = 0; i < (xlength[0] + 2) * (xlength[1] + 2) * (xlength[2] + 2); i++) // fprintf(fp2, "%d\n", flagField[i]); /** debugging code end */ printf("Progress: "); PAPI_start_counters( PAPI_events, 3 ); for(int t = 0; t < timesteps; t++) { double *swap = NULL; #ifdef _AVX_ doStreamingAndCollisionAVX(collideField, streamField, flagField, xlength, tau); #else doStreamingAndCollision(collideField, streamField, flagField, xlength, tau); #endif // _AVX_ swap = collideField; collideField = streamField; streamField = swap; treatBoundary(collideField, flagField, bddParams, xlength); if (t % timestepsPerPlotting == 0) { // writeVtkOutput(collideField, flagField, "./Paraview/output", (unsigned int) t / 
timestepsPerPlotting, xlength); /** debugging code */ // /* create reference files */ // FILE *fp = NULL; // char szFileName[80]; // sprintf( szFileName, "Testdata/%s/%i.dat", problem, t / timestepsPerPlotting ); // fp = fopen(szFileName,"w"); // for (int i = 0; i < PARAMQ * (xlength[0] + 2) * (xlength[1] + 2) * (xlength[2] + 2); i++) // fprintf(fp, "%0.7f\n", collideField[i]); /* check correctness */ #ifdef DEBUG exactCollideField = (double *) malloc ( ( size_t ) sizeof(double) * PARAMQ * (xlength[0] + 2) * (xlength[1] + 2) * (xlength[2] + 2)); FILE *fp = NULL; unsigned int line = 0; int noOfReadEntries; int error = 0; char szFileName[80]; sprintf( szFileName, "Testdata/%s/%i.dat", problem, t / timestepsPerPlotting ); fp = fopen(szFileName,"r"); if (fp != NULL) { for (line = 0; line < PARAMQ * (xlength[0] + 2) * (xlength[1] + 2) * (xlength[2] + 2); line++) { noOfReadEntries = fscanf(fp,"%lf",&exactCollideField[line]); if (noOfReadEntries != 1) continue; } } fclose(fp); for (int i = 0; i < PARAMQ; i++) for (int z = 1; z <= xlength[2]; z++) for (int y = 1; y <= xlength[1]; y++) for(int x = 1; x <= xlength[0]; x++) if (flagField[z * (xlength[0] + 2) * (xlength[1] + 2) + y * (xlength[0] + 2) + x] == FLUID) if (fabs(collideField[z * (xlength[0] + 2) * (xlength[1] + 2) + y * (xlength[0] + 2 ) + x + (xlength[0] + 2) * (xlength[1] + 2) * (xlength[2] + 2) * i] - exactCollideField[PARAMQ * (z * (xlength[0] + 2) * (xlength[1] + 2) + y * (xlength[0] + 2) + x) + i]) > 1e-4) error = 1; if (error) printf("ERROR: Different collideField in timestep %d\n", t); free(exactCollideField); #endif // DEBUG /** end of debugging code */ } PAPI_read_counters( counters, 3 ); int pct = ((float) t / timesteps) * 100; printf("\b\b\b%02d%%", pct); fflush(stdout); } printf("\b\b\b\b100%%\n"); end = clock(); gettimeofday(&time_end, NULL); time_spent = (double) (end - begin) / CLOCKS_PER_SEC; printf("Running time: %.2fs\n", time_spent); printf("Running time (Wall clock): %.2fs\n", ( (double) (( 
time_end.tv_sec - time_start.tv_sec) * 1000000u + time_end.tv_usec - time_start.tv_usec) )/ 1e6); printf("MLUPS: %.3f\n", ((double) (xlength[0] + 2) * (xlength[1] + 2) * (xlength[2] + 2) * timesteps) / (1000000.0 * ((time_end.tv_sec - time_start.tv_sec) * 1000000u + time_end.tv_usec - time_start.tv_usec) / 1e6)); printf("%lld L2 cache misses (%.3lf%% misses) in %lld cycles\n", counters[1], (double)counters[1] / (double)counters[2] * 100, counters[0] ); free(collideField); free(streamField); free(flagField); } return 0; }
int main(int argc, char *argv[]) { if (argc < 4) { printf("Usage: %s data_type(text or bin) input_file output_file\n", argv[0]); return EXIT_FAILURE; } char *file_type = argv[1]; char *file_in = argv[2]; char *file_out = argv[3]; char *str1 = "SU.vtk"; char *str2 = "VAR.vtk"; char *str3 = "CGUP.vtk"; char *file_perf = "pstats.dat"; int status = 0; /** internal cells start and end index*/ int nintci, nintcf; /** external cells start and end index. The external cells are only ghost cells. They are accessed only through internal cells*/ int nextci, nextcf; /** link cell-to-cell array. Stores topology information*/ int **lcc; /** red-black colouring of the cells*/ int *nboard; /** boundary coefficients for each volume cell */ double *bs, *be, *bn, *bw, *bl, *bh, *bp, *su; /**parameter used for volmesh and reading binary input file */ int* nodeCnt; int*** points; int*** elems; /**Measured Performance and Papi parameters*/ long long *values_i = (long long *) calloc(sizeof(long long), 4); long long *values_c = (long long *) calloc(sizeof(long long), 4); long long *values_o = (long long *) calloc(sizeof(long long), 4); double *mflops = (double *) calloc(sizeof(double), 3); double *L1mira = (double *) calloc(sizeof(double), 3); double *Lmirate = (double *) calloc(sizeof(double), 3); double *util = (double *) calloc(sizeof(double), 3); long long *et = (long long *) calloc(sizeof(long long), 3); long long start_cycles, start_usec,end_cycles_1, end_usec_1, end_cycles_2, end_cycles_3, end_usec_2, end_usec_3; /**In cluster mpp_inter L1 and L2 events can not computed at the same time, so set into two groups*/ int Events[NUM_EVENTS]={PAPI_L2_TCM,PAPI_L2_TCA,PAPI_FP_INS,PAPI_TOT_CYC}; // int Events[NUM_EVENTS]={PAPI_L1_TCM,PAPI_L1_TCA,PAPI_FP_INS,PAPI_TOT_CYC}; /**start HW counters and execution time recorder*/ if ( PAPI_start_counters( Events, NUM_EVENTS ) != PAPI_OK ) printf("Fail to start PAPI counter\n"); start_cycles = PAPI_get_real_cyc(); // Gets the starting time in clock 
cycles start_usec = PAPI_get_real_usec(); // Gets the starting time in microseconds /* initialization */ // read-in the input file int f_status; if (strcmp(file_type,"text") == 0) { f_status = read_formatted(file_in, &nintci, &nintcf, &nextci, &nextcf, &lcc, &bs, &be, &bn, &bw, &bl, &bh, &bp, &su, &nboard); } else if (strcmp(file_type,"bin") == 0) { f_status = read_formatted_bin(file_in, &nintci, &nintcf, &nextci, &nextcf, &lcc, &bs, &be, &bn, &bw, &bl, &bh, &bp, &su,&nboard); } else { printf ("Input file format is nor correct\n"); return EXIT_FAILURE; } if (f_status != 0){ printf("failed to initialize data!\n"); return EXIT_FAILURE; } // allocate arrays used in gccg int nomax = 3; /** the reference residual*/ double resref = 0.0; /** the ratio between the reference and the current residual*/ double ratio; /** array storing residuals */ double* resvec = (double *) calloc(sizeof(double), (nintcf + 1)); /** the variation vector -> keeps the result in the end */ double* var = (double *) calloc(sizeof(double), (nextcf + 1)); /** the computation vectors */ double* direc1 = (double *) calloc(sizeof(double), (nextcf + 1)); double* direc2 = (double *) calloc(sizeof(double), (nextcf + 1)); /** additional vectors */ double* cgup = (double *) calloc(sizeof(double), (nextcf + 1)); double* oc = (double *) calloc(sizeof(double), (nintcf + 1)); double* cnorm = (double *) calloc(sizeof(double), (nintcf + 1)); double* adxor1 = (double *) calloc(sizeof(double), (nintcf + 1)); double* adxor2 = (double *) calloc(sizeof(double), (nintcf + 1)); double* dxor1 = (double *) calloc(sizeof(double), (nintcf + 1)); double* dxor2 = (double *) calloc(sizeof(double), (nintcf + 1)); /**store volume information*/ int nc=0; // initialize the reference residual for ( nc = nintci; nc <= nintcf; nc++) { resvec[nc] = su[nc]; resref = resref + resvec[nc] * resvec[nc]; } resref = sqrt(resref); if (resref < 1.0e-15){ printf("i/o - error: residue sum less than 1.e-15 - %lf\n", resref); return EXIT_FAILURE; 
} // initialize the arrays for (nc = 0; nc <= 10; nc++){ oc[nc] = 0.0; cnorm[nc] = 1.0; } for (nc = nintci; nc <= nintcf; nc++){ cgup[nc] = 0.0; var[nc] = 0.0; } for (nc = nextci; nc <= nextcf; nc++){ var[nc] = 0.0; cgup[nc] = 0.0; direc1[nc] = 0.0; bs[nc] = 0.0; be[nc] = 0.0; bn[nc] = 0.0; bw[nc] = 0.0; bl[nc] = 0.0; bh[nc] = 0.0; } for (nc = nintci; nc <= nintcf; nc++){ cgup[nc] = 1.0 / bp[nc]; } int if1 = 0; int if2 = 0; int iter = 1; int nor = 1; int nor1 = nor - 1; /* finished initalization */ /*read PAPI HW counters and caculate performance of input phase*/ if ( PAPI_read_counters( values_i, NUM_EVENTS ) != PAPI_OK ){ printf("fail to stop papi counter"); } Lmirate[0] = (double) values_i[0] / values_i[1]; end_usec_1 = PAPI_get_real_usec(); mflops[0] = (double) values_i[2] / (end_usec_1-start_usec); util[0] = mflops[0] / PEAKPER; /* start computation loop */ while (iter < 10000){ /* start phase 1 */ // update the old values of direc for (nc = nintci; nc <= nintcf; nc++){ direc1[nc] = direc1[nc] + resvec[nc] * cgup[nc]; } // compute new guess (approximation) for direc for (nc = nintci; nc <= nintcf; nc++){ direc2[nc] = bp[nc] * direc1[nc] - bs[nc] * direc1[lcc[0][nc]] - bw[nc] * direc1[lcc[3][nc]] - bl[nc] * direc1[lcc[4][nc]] - bn[nc] * direc1[lcc[2][nc]] - be[nc] * direc1[lcc[1][nc]] - bh[nc] * direc1[lcc[5][nc]]; } /* end phase 1 */ /* start phase 2 */ // execute normalization steps double oc1, oc2, occ; if (nor1 == 1){ oc1 = 0; occ = 0; for (nc = nintci; nc <= nintcf; nc++){ occ = occ + adxor1[nc] * direc2[nc]; } oc1 = occ / cnorm[1]; for (nc = nintci; nc <= nintcf; nc++){ direc2[nc] = direc2[nc] - oc1 * adxor1[nc]; direc1[nc] = direc1[nc] - oc1 * dxor1[nc]; } if1++; }else if (nor1 == 2){ oc1 = 0; occ = 0; for (nc = nintci; nc <= nintcf; nc++){ occ = occ + adxor1[nc] * direc2[nc]; } oc1 = occ / cnorm[1]; oc2 = 0; occ = 0; for (nc = nintci; nc <= nintcf; nc++){ occ = occ + adxor2[nc] * direc2[nc]; } oc2 = occ / cnorm[2]; for (nc = nintci; nc <= nintcf; nc++){ 
direc2[nc] = direc2[nc] - oc1 * adxor1[nc] - oc2 * adxor2[nc]; direc1[nc] = direc1[nc] - oc1 * dxor1[nc] - oc2 * dxor2[nc]; } if2++; } cnorm[nor] = 0; double omega = 0; // compute the new residual for (nc = nintci; nc <= nintcf; nc++){ cnorm[nor] = cnorm[nor] + direc2[nc] * direc2[nc]; omega = omega + resvec[nc] * direc2[nc]; } omega = omega / cnorm[nor]; double resnew = 0.0; for (nc = nintci; nc <= nintcf; nc++){ var[nc] = var[nc] + omega * direc1[nc]; resvec[nc] = resvec[nc] - omega * direc2[nc]; resnew = resnew + resvec[nc] * resvec[nc]; } resnew = sqrt(resnew); ratio = resnew / resref; // exit on no improvements of residual if (ratio <= 1.0e-10){ break; } iter++; // prepare additional arrays for the next iteration step if (nor == nomax){ nor = 1; }else{ if (nor == 1){ for (nc = nintci; nc <= nintcf; nc++){ dxor1[nc] = direc1[nc]; adxor1[nc] = direc2[nc]; } } else if (nor == 2){ for (nc = nintci; nc <= nintcf; nc++){ dxor2[nc] = direc1[nc]; adxor2[nc] = direc2[nc]; } } nor++; } nor1 = nor - 1; }/* end phase 2 */ /* finished computation loop */ /*read PAPI HW counters and caculate performance of computation phase*/ end_cycles_2 = PAPI_get_real_cyc(); // Gets the ending time in clock cycles end_usec_2 = PAPI_get_real_usec(); // Gets the ending time in microseconds if ( PAPI_read_counters( values_c, NUM_EVENTS ) != PAPI_OK ){ printf("fail to read papi counter"); } Lmirate[1] = (double) values_c[0]/values_c[1]; mflops[1] = (double) values_c[2] / ( end_usec_2-end_usec_1 ); util[1] = mflops[1] / PEAKPER; /* write output file */ if ( write_result(file_in, file_out, nintci, nintcf, var, iter, ratio) != 0 ) printf("error when trying to write to file %s\n", file_out); //transfer volume to mesh if (vol2mesh(nintci, nintcf, lcc, &nodeCnt, &points, &elems) != 0 ){ printf("error when trying to converge topology to volume"); } //write output to vtk file if (write_result_vtk(str1, nintci, nintcf, nodeCnt, points, elems, su) != 0){ printf("error when write SU to vtk file"); } if 
(write_result_vtk(str2, nintci, nintcf, nodeCnt, points, elems, var) != 0){ printf("error when write VAR to vtk file"); } if (write_result_vtk(str3, nintci, nintcf, nodeCnt, points, elems, cgup) != 0){ printf("error when write CGUP to vtk file"); } /*read PAPI HW counters and caculate performance of output phase*/ if ( PAPI_stop_counters( values_o, NUM_EVENTS ) != PAPI_OK ){ printf("fail to stop papi counter"); } Lmirate[2] = (double) values_o[0]/values_o[1]; end_cycles_3 = PAPI_get_real_cyc(); // Gets the ending time in clock cycles end_usec_3 = PAPI_get_real_usec(); // Gets the ending time in microseconds mflops[2] = (double) (values_o[2])/(end_usec_3-end_usec_2); util[2] = mflops[2] / PEAKPER; /** Write all measured performance to pstats.dat*/ et[0] = end_usec_1-start_usec; et[1] = end_usec_2-end_usec_1; et[2] = end_usec_3-end_usec_2; if (write_result_dat(file_perf, values_i,values_c, values_o,Lmirate, et, mflops, util) != 0 ){ printf("error when write measured performance to data file"); } /* Free all the dynamically allocated memory */ free(direc2); free(direc1); free(dxor2); free(dxor1); free(adxor2); free(adxor1); free(cnorm); free(oc); free(var); free(cgup); free(resvec); free(su); free(bp); free(bh); free(bl); free(bw); free(bn); free(be); free(bs); printf("Simulation completed successfully!\n"); return EXIT_SUCCESS; }
void* sssp(void *data) { thread_data_t *d = (thread_data_t *)data; /* Create transaction */ set_cpu(the_cores[d->id]); /* Wait on barrier */ ssalloc_init(); PF_CORRECTION; seeds = seed_rand(); #ifdef PIN int id = d->id; // int cpu = 40*(id/40) + 4*(id%10) + (id%40)/10; int cpu = 4*(id%20) + id/20; // printf("Pinning %d to %d\n",id,cpu); pin(pthread_self(), cpu); // pin(pthread_self(), id); #endif #ifdef PAPI if (PAPI_OK != PAPI_start_counters(g_events, G_EVENT_COUNT)) { printf("Problem starting counters 1."); } #endif barrier_cross(d->barrier); // Begin SSSP int fail = 0; // int radius = 0; while (1) { val_t node; slkey_t dist_node; // print_skiplist(d->set); while (1) { if (d->sl) { if (spray_delete_min_key(d->set, &dist_node, &node, d)) break; // keep trying until get a node } else if (d->pq) { if (lotan_shavit_delete_min_key(d->set, &dist_node, &node, d)) break; } else if (d->lin) { node = (val_t) deletemin_key(d->linden_set, &dist_node, d); break; } else { printf("error: no queue selected\n"); exit(1); // TODO: grace } if (dist_node == -1) { // flag that list is empty break; } dist_node = 0; } if (dist_node == -1) { // list is empty; TODO make sure threads don't quit early fail++; if (fail > 20*d->nb_threads) { // TODO: really need a better break condition... 
break; } continue; } fail = 0; if (dist_node != nodes[node].dist) continue; // dead node nodes[node].times_processed++; int i; for (i = 0;i < nodes[node].deg;i++) { int v = nodes[node].adj[i]; int w = nodes[node].weights[i]; slkey_t dist_v = nodes[v].dist; // printf("v=%d dist_v=%d\n", v, dist_v); if (dist_v == -1 || dist_node + w < dist_v) { // found better path to v // printf("attempting cas...\n"); // printf("nodes[v].dist=%d dist_v=%d dist_node=%d\n", nodes[v].dist, dist_v, dist_node); int res = ATOMIC_CAS_MB(&nodes[v].dist, dist_v, dist_node+w); // printf("%d nodes[%d].dist=%d\n", res, v, nodes[v].dist); if (res) { if (d->pq || d->sl) { sl_add_val(d->set, dist_node+w, v, TRANSACTIONAL); // add to queue only if CAS is successful } else if (d->lin) { insert(d->linden_set, dist_node+w, v); } d->nb_add++; // if (dist_node+1 > radius) { // radius = dist_node+1; // printf("radius %d\n", radius); // } } } } } // End SSSP #ifdef PAPI if (PAPI_OK != PAPI_read_counters(g_values[d->id], G_EVENT_COUNT)) { printf("Problem reading counters 2."); } #endif PF_PRINT; return NULL; }
main(int argc, char *argv[]) { float **a,**b,**c; int n; int NB; int i,j; int x; //double t0,t1; struct timeval t0,t1; long mtime, seconds, useconds; // Using PAPI - from countloop.c if (PAPI_VER_CURRENT != PAPI_library_init(PAPI_VER_CURRENT)) ehandler("PAPI_library_init error."); const size_t EVENT_MAX = PAPI_num_counters(); // Suppressing output // printf("# Max counters = %zd\n", EVENT_MAX); if (PAPI_OK != PAPI_query_event(PAPI_TOT_INS)) ehandler("Cannot count PAPI_TOT_INS."); if (PAPI_OK != PAPI_query_event(PAPI_FP_OPS)) ehandler("Cannot count PAPI_FP_OPS."); if (PAPI_OK != PAPI_query_event(PAPI_L1_DCM)) ehandler("Cannot count PAPI_L1_DCM."); size_t EVENT_COUNT = 3; int events[] = { PAPI_TOT_INS, PAPI_FP_OPS, PAPI_L1_DCM }; long long values[EVENT_COUNT]; // Take size from args, not prompt // printf("Enter n: "); scanf("%d",&n); printf("n = %d\n",n); n = atoi(argv[1]); NB = atoi(argv[2]); a = matrix(1,n,1,n); for (i=1; i<=n; i++) for (j=1; j<=n; j++) a[i][j] = i+j; b = matrix(1,n,1,n); for (i=1; i<=n; i++) for (j=1; j<=n; j++) b[i][j] = i-j; //t0 = get_seconds(); gettimeofday(&t0, NULL); // Start PAPI PAPI_start_counters(events, EVENT_COUNT); if (PAPI_OK != PAPI_read_counters(values, EVENT_COUNT)) ehandler("Problem reading counters."); //for (x=0;x<1000;x++){ c = matrix_prod(n,n,n,n,a,b,NB); //} if (PAPI_OK != PAPI_read_counters(values, EVENT_COUNT)) ehandler("Problem reading counters."); //t1 = get_seconds(); gettimeofday(&t1, NULL); seconds = t1.tv_sec - t0.tv_sec; useconds = t1.tv_usec - t0.tv_usec; mtime = ((seconds) * 1000 + useconds/1000.0) + 0.5; //printf("Time for matrix_prod = %f sec\n",t1-t0); printf("%d\t%lld\t%lld\t%lld\t%ld\n", n, values[0], values[1], values[2], mtime); }