void CTimer::Stop(void) { #ifdef _WIN32 QueryPerformanceCounter(&m_t1); #endif m_c1 = ippGetCpuClocks(); return; } // CTimer::Stop()
void CTimer::Start(void) { #ifdef _WIN32 QueryPerformanceCounter(&m_t0); #endif #ifdef linux m_t0 = clock(); #endif m_c0 = ippGetCpuClocks(); return; } // CTimer::Start()
/* Stamps the end of a measurement: stores the current ippGetCpuClocks()
 * reading in m->p_end. `m` is dereferenced unconditionally. */
static void measure_end(MeasureIt *m)
{
  (*m).p_end = ippGetCpuClocks();
}
// START FUNC DECL int srt_uniform_I8( long long *X, /* input */ long long nR, long long *Y, /* output */ int shift, char *mode /* "asc" or "dsc" */ ) // STOP FUNC DECL { int status = 0; long long **t_cnt = NULL, *cnt = NULL, *bin_offset = NULL; int nT = 0; long long t[16]; t[0] = ippGetCpuClocks(); if ( shift < 0 ) { go_BYE(-1); } if ( X == NULL ) { go_BYE(-1); } if ( mode == NULL ) { go_BYE(-1); } if ( nR < 16*1048576 ) { fprintf(stderr, "Too few elements to sort \n"); go_BYE(-1); } if ( ( strcmp(mode, "asc") != 0 ) && ( strcmp(mode, "dsc") != 0 ) ) { go_BYE(-1); } nT = NUM_THREADS; long long block_size = nR / nT; cnt = malloc(nT * sizeof(long long)); return_if_malloc_failed(cnt); assign_const_I8(cnt, nT, 0); bin_offset = malloc(nT * sizeof(long long)); return_if_malloc_failed(bin_offset); t_cnt = malloc(nT * sizeof(long long *)); return_if_malloc_failed(t_cnt); for ( int i = 0; i < nT; i++ ) { t_cnt[i] = malloc(nT * sizeof(long long)); return_if_malloc_failed(t_cnt[i]); assign_const_I8(t_cnt[i], nT, 0); } t[1] = ippGetCpuClocks(); // Figure out how many elements in each bin cilkfor ( int tid = 0; tid < nT; tid++ ) { long long lb = tid * block_size; long long ub = lb + block_size; if ( tid == (nT-1) ) { ub = nR; } long long *l_cnt = t_cnt[tid]; for ( long long i = lb; i < ub; i++ ) { unsigned long long inval = (unsigned long long) X[i]; int bin = inval >> shift; if ( bin >= NUM_THREADS ) { status = -1; continue; } l_cnt[bin]++; } } t[2] = ippGetCpuClocks(); // Sum up over local counts for ( int i = 0; i < nT; i++ ) { long long tempsum = 0; for ( int j = 0; j < nT; j++ ) { tempsum += t_cnt[j][i]; } cnt[i] = tempsum; } // Quick debugging check long long chk_nX = 0; for ( int i = 0; i < nT; i++ ) { chk_nX += cnt[i]; } // Create cumulative counts bin_offset[0] = 0; for ( int i = 1; i < nT; i++ ) { bin_offset[i] = bin_offset[i-1] + cnt[i-1]; } /* for ( int i = 1; i < nT; i++ ) { fprintf(stderr, "%d --> %lld \n", i, cnt[i]); } */ t[3] = ippGetCpuClocks(); if ( chk_nX 
!= nR ) { go_BYE(-1); } // Now let us get elements to their correct bins for ( int i = 0; i < nR; i++ ) { unsigned long long inval = (unsigned long long) X[i]; int bin = inval >> shift; // if ( bin >= NUM_THREADS ) { status = -1; continue; } Y[bin_offset[bin]] = X[i]; bin_offset[bin]++; } cBYE(status); t[4] = ippGetCpuClocks(); // Re-Create cumulative counts bin_offset[0] = 0; for ( int i = 1; i < nT; i++ ) { bin_offset[i] = bin_offset[i-1] + cnt[i-1]; } // ------------------------------------------------------------ // Now we can sort individual bins in parallel cilkfor ( int tid = 0; tid < nT; tid++ ) { long long *lY = (long long *)Y; lY += bin_offset[tid]; if ( cnt[tid] == 0 ) { continue; } if ( strcmp(mode, "asc") == 0 ) { qsort_asc_I8(lY, cnt[tid], sizeof(long long), NULL); } else if ( strcmp(mode, "dsc") == 0 ) { qsort_dsc_I8(lY, cnt[tid], sizeof(long long), NULL); } else { status = -1; continue; } } cBYE(status); t[5] = ippGetCpuClocks(); /* for ( int i = 1; i < 5; i++ ) { fprintf(stderr, "%d --> %lld \n", i, t[i] - t[i-1]); } */ // ------------------------------------------------------------ BYE: if ( t_cnt != NULL ) { for ( int i = 0; i < nT; i++ ) { free_if_non_null(t_cnt[i]); } free_if_non_null(t_cnt); } free_if_non_null(cnt); free_if_non_null(bin_offset); return(status); }
/* Stamps the start of a measurement: stores the current ippGetCpuClocks()
 * reading in m->p_start. `m` is dereferenced unconditionally. */
static void measure_start(MeasureIt *m)
{
  (*m).p_start = ippGetCpuClocks();
}