static void *do_spmv_thread_main(void *arg) { spm_mt_thread_t *spm_mt_thread = (spm_mt_thread_t *) arg; SPMV_NAME(_fn_t) *spmv_mt_fn = spm_mt_thread->spmv_fn; setaffinity_oncpu(spm_mt_thread->cpu); int i; tsc_t total_tsc, thread_tsc; tsc_init(&total_tsc); tsc_init(&thread_tsc); tsc_start(&total_tsc); for (i = 0; i < loops_nr; i++) { pthread_barrier_wait(&barrier); tsc_start(&thread_tsc); spmv_mt_fn(spm_mt_thread->spm, spm_mt_thread->data, y); tsc_pause(&thread_tsc); pthread_barrier_wait(&barrier); } tsc_pause(&total_tsc); spm_mt_thread->secs = tsc_getsecs(&thread_tsc); secs = tsc_getsecs(&total_tsc); tsc_shut(&thread_tsc); tsc_shut(&total_tsc); return (void *) 0; }
/**
 * @brief Timer handler procedure
 *
 * This is not a realistic workload and it is a demonstration code only.
 *
 * It runs a few thousand iterations; every iteration picks two random
 * offsets in the shared timer buffer and performs a short strided
 * read-modify-write between them.
 *
 * @param sig UNUSED
 * @param si UNUSED
 * @param uc UNUSED
 */
static void timer_handler(int sig, siginfo_t *si, void *uc)
{
	const int num_iterations = 5000;
	const size_t stride = 5;
	int *data = (int *) timer_data_ptr;
	const size_t nints = timer_data_size / sizeof(int);
	int iter;

	(void) (sig);
	(void) (si);
	(void) (uc);

	tsc_start(&timer_prof);
	/* START - "latency sensitive" code */
	for (iter = 0; iter < num_iterations; iter++) {
		const int dst = timer_rand() % (nints - stride);
		const int src = timer_rand() % (nints - stride);
		size_t k;
		for (k = 0; k < stride; k++)
			data[dst + k] = 2 * data[src + k] + data[dst + k];
	}
	/* END - "latency sensitive" code */
	tsc_end(&timer_prof, 1);
}
int main(int argc, char **argv) { pid_t pid; int status; char **new_argv; prfcnt_t prfcnt; tsc_t timer; cpu_set_t cpu_set; int err; if ( argc < 2){ printf("Usage: %s <cmd> (args)\n", argv[0]); exit(1); } new_argv = &argv[1]; /* * CPU affinity is inherited across a fork() */ CPU_ZERO(&cpu_set); CPU_SET(0,&cpu_set); err = sched_setaffinity(getpid(), sizeof(cpu_set_t), &cpu_set); if (err){ perror("sched_setaffinity"); exit(1); } if ( (pid = fork()) < 0){ perror("fork"); exit(1); } tsc_init(&timer); prfcnt_init(&prfcnt,0,PRFCNT_FL_T0|PRFCNT_FL_T1); /* * FIXME: Is this efficient enough ? Could it be done better ? */ if (pid) { prfcnt_start(&prfcnt); tsc_start(&timer); wait(&status); tsc_pause(&timer); prfcnt_pause(&prfcnt); } else { execv(argv[1],new_argv); perror("execv"); exit(1); } tsc_report(&timer); prfcnt_report(&prfcnt); return 0; }
/*
 * Micro-benchmark: measure the per-call overhead of function_call() by
 * invoking it a fixed number of times, timed simultaneously with a
 * clock (nanoseconds) and the TSC (cycles).
 */
int main(void)
{
	const long spins = 1000000000;
	struct timespec ts;
	uint64_t tsc;

	clock_start(&ts);
	tsc_start(&tsc);

	volatile int sink = 0;	/* volatile keeps the loop body from being elided */
	for (long i = 0; i < spins; i++)
		sink = function_call(i);

	const long long unsigned ns = clock_end(&ts);
	const long long unsigned ticks = tsc_end(&tsc);
	(void) sink;

	printf("%ld spins in %lluns (%.1fns/spin, %.1f clocks/spin)\n",
	       spins, ns, (ns / (double) spins), ticks / (double) spins);
	return 0;
}
/*
 * Benchmark thread body for the pointer-swapping SpMV variant: each
 * iteration computes y = A*x behind barriers, then swaps the x and y
 * vectors so the next iteration consumes the previous result.
 * Optionally wraps the loop in hardware performance counters
 * (SPMV_PRFCNT). Total loop time is published via the shared `secs`.
 */
static void *do_spmv_thread_main_swap(void *arg)
{
	spm_mt_thread_t *spm_mt_thread;
#ifdef SPMV_PRFCNT
	prfcnt_t *prfcnt;
#endif
	SPMV_NAME(_fn_t) *spmv_mt_fn;
	tsc_t tsc;

	spm_mt_thread = arg;
	spmv_mt_fn = spm_mt_thread->spmv_fn;
#ifdef SPMV_PRFCNT
	/* When profiling, the per-thread data slot carries the counter state. */
	prfcnt = (prfcnt_t *) spm_mt_thread->data;
#endif
	setaffinity_oncpu(spm_mt_thread->cpu);
	/* Fill the input vector with random values in [-1000, 1000]. */
	VECTOR_NAME(_init_rand_range)(x, (ELEM_TYPE) -1000, (ELEM_TYPE) 1000);
	// Assert this is a square matrix and swap is ok.
	assert(x->size == y->size);
	tsc_init(&tsc);
	tsc_start(&tsc);
#ifdef SPMV_PRFCNT
	prfcnt_init(prfcnt, spm_mt_thread->cpu, PRFCNT_FL_T0 | PRFCNT_FL_T1);
	prfcnt_start(prfcnt);
#endif
	int i;
	for (i = 0; i < loops_nr; i++) {
		pthread_barrier_wait(&barrier);
		spmv_mt_fn(spm_mt_thread->spm, x, y);
		pthread_barrier_wait(&barrier);
		/* NOTE(review): x and y appear to be shared globals, so EVERY
		 * thread executes this swap after the second barrier — with N
		 * threads the pointers get swapped N times per iteration, and
		 * the swaps race with each other. Confirm SWAP is intended to
		 * run per-thread here (or that it is idempotent/thread-safe). */
		SWAP(x, y);
	}
	tsc_pause(&tsc);
#ifdef SPMV_PRFCNT
	prfcnt_pause(prfcnt);
#endif
	/* `secs` is shared across threads; last writer wins. */
	secs = tsc_getsecs(&tsc);
	tsc_shut(&tsc);

	return NULL;
}
/*
 * OpenMP reduction benchmark: builds a random int array with a known
 * checksum, sums it with a `parallel for reduction(+)`, times the sum,
 * and aborts if the parallel result disagrees with the checksum.
 *
 * argv[1] (optional): number of ints; non-positive or missing -> 100000.
 */
int main(int argc, const char *argv[])
{
	unsigned nthreads = 1;	/* FIX: defined even if no parallel region runs */
	size_t nints = 0;
	int sum1, sum2;
	int *arr;

	if (argc > 1) {
		/* FIX: atol() can return a negative value, which would wrap to
		 * a huge size_t; treat non-positive input as "use the default". */
		long requested = atol(argv[1]);
		if (requested > 0)
			nints = (size_t) requested;
	}
	if (nints == 0)
		nints = 100000;

	#pragma omp parallel
	#pragma omp master
	nthreads = omp_get_num_threads();

	printf("Number of threads: %u\n", nthreads);
	/* FIX: %zu is the correct conversion for size_t (%lu is undefined
	 * behavior on platforms where size_t != unsigned long). */
	printf("number of ints: %zu\n", nints);

	/* sum1 receives the reference checksum computed at fill time. */
	arr = arr_int_mkrand(nints, &sum1);

	sum2 = 0;
	tsc_t t;
	tsc_init(&t);
	tsc_start(&t);
	#pragma omp parallel for reduction(+:sum2)
	for (size_t i = 0; i < nints; i++) {
		sum2 += sum_op(arr[i]);
	}
	tsc_pause(&t);
	tsc_report("sum_OMP", &t);

	if (sum1 != sum2) {
		fprintf(stderr, "Error in sum: %d vs %d\n", sum1, sum2);
		abort();
	}

	printf("DONE\n");
	return 0;
}
/*
 * Benchmark thread body for the symmetric SpMV kernel with a map-based
 * reduction phase. Each iteration is a three-barrier protocol:
 *   1. reset this thread's slice of the temp vector (via its map),
 *   2. run the symmetric kernel (which scatters partial results to temp),
 *   3. fold the temp partials back into y via the map.
 * All three barriers are required so no thread reads/writes temp or y
 * while another thread is in a different phase.
 */
static void *do_spmv_thread_main(void *arg)
{
	spm_mt_thread_t *spm_mt_thread = arg;
	SPMV_NAME(_sym_fn_t) *spmv_mt_sym_fn = spm_mt_thread->spmv_fn;
	setaffinity_oncpu(spm_mt_thread->cpu);

	tsc_t tsc;
	tsc_init(&tsc);
	tsc_start(&tsc);

	// Switch Reduction Phase
	int i/*, j, start, end*/;
	/*
	start = 0;
	end = n / ncpus;
	*/
	for (i = 0; i < nloops; i++) {
		// Switch Reduction Phase: zero this thread's temp entries before
		// the kernel scatters new partial results into them.
		VECTOR_NAME(_init_from_map)(temp, 0, spm_mt_thread->map);
		pthread_barrier_wait(&barrier);
		spmv_mt_sym_fn(spm_mt_thread->spm, spm_mt_thread->data, y, y);
		pthread_barrier_wait(&barrier);
		// Switch Reduction Phase.
		/*
		for (j = 0; j < ncpus; j++)
			VECTOR_NAME(_add_part)(y, temp[j], y, start, end);
		*/
		// Map-based reduction replaces the commented-out per-cpu loop above.
		VECTOR_NAME(_add_from_map)(y, temp, y, spm_mt_thread->map);
		pthread_barrier_wait(&barrier);
	}
	tsc_pause(&tsc);
	/* NOTE(review): `secs` is a shared global; every thread overwrites it
	 * (last writer wins). */
	secs = tsc_getsecs(&tsc);
	tsc_shut(&tsc);

	return NULL;
}
int main(int argc, const char *argv[]) { if (argc < 4) { fprintf(stderr, "Usage: %s <array_size> <block_size> <accesses>\n", argv[0]); exit(1); } unsigned int asize = atol(argv[1]); unsigned int bsize = atol(argv[2]); unsigned int accesses = atol(argv[3]); unsigned int seed = time(NULL); tsc_t tc; /* normal pointers */ srand(seed); printf("CoPy\n"); unsigned int *p, *p_copy; unsigned int sum_copy = 0; p = xmalloc(asize*sizeof(unsigned int)); for (unsigned int i=0; i<asize; i++) p[i] = i; tsc_init(&tc); tsc_start(&tc); p_copy = xmalloc(asize*sizeof(unsigned int)); memcpy(p_copy, p, asize*sizeof(unsigned int)); for (unsigned int j=0; j<accesses; j++) { unsigned int idx = rand() % asize; p_copy[idx] = 0; } #ifdef DO_SUMS for (unsigned int j=0; j<asize; j++) { sum_copy += p_copy[j]; } #endif tsc_pause(&tc); tsc_report(&tc); /* versioned pointers */ tsc_t t; srand(seed); printf("VerSions\n"); unsigned int sum_versions = 0; sla_t *sla = sla_init(10, .5, 16, time(NULL)); sla->def_nitems = bsize; for (unsigned int i=0; i<asize; i++) sla_append(sla, i); tsc_init(&t); tsc_start(&t); versla_t *versla = versla_init(sla); ver_t *v1 = versla_newver(versla, versla->vo.ver_base); for (unsigned int j=0; j<accesses; j++) { unsigned int idx = rand() % asize; versla_set(versla, idx, 0, v1); } #ifdef DO_SUMS for (unsigned int j=0; j<asize; j++) { unsigned int x = versla_get(versla, j, v1); sum_versions += x; } #endif tsc_pause(&t); tsc_report(&t); printf("\ntC/tV=%lf\n", (double)tsc_getticks(&tc)/(double)tsc_getticks(&t)); for (unsigned int j=0; j<asize; j++) { unsigned int x0 = p_copy[j]; unsigned int x1 = versla_get(versla, j, v1); if (x0 != x1) { fprintf(stderr, "copy:%d and versions:%d differ for j=%d\n", x0, x1, j); } } assert(sum_versions == sum_copy); return 0; }