int monitor(int pid, struct timeval *jointime) { int ret, status = -1; uint64_t tsc, ovhd, start, duration, t; struct rusage usage; int sense = 1; pthread_barrier_wait(&barrier); start = _rdtsc(); for (;;) { // We're not measuring the bootstrap core. Discourage users from // using the last core. //readcounters(0); t = _rdtsc(); printcounters(counters, t - start); if (!gbl.server) { if (waitpid(pid, &status, WNOHANG)) { gettimeofday(jointime, 0); done = 1; break; } } start = _rdtsc(); timeout(); } close_output(); return status; }
/*
 * Benchmark driver for t3(): fills a 512x512 float matrix with
 * 0.1/(i+j+1), times one t3() call with the TSC, then prints the elapsed
 * cycle count divided by 1e9 and a checksum computed from rows 0..3 of the
 * matrix after the call.
 */
int main() {
    float A[512][512] __attribute__ ((aligned(16)));

    for (int i = 0; i < 512; i++) {
        for (int j = 0; j < 512; j++) {
            A[i][j] = 0.1/(i+j+1);
        }
    }

    unsigned long long start_c, end_c, diff_c;
    start_c = _rdtsc();
    t3(A);
    end_c = _rdtsc();
    diff_c = end_c - start_c;
    float giga_cycle = diff_c / 1000000000.0;

    // Checksum: sum of (even column - following odd column) over rows 0..3.
    float ret = 0.;
    for (int i = 0; i < 4; i++) {
        for (int j = 0; j < 512; j += 2) {
            ret += A[i][j] - A[i][j+1];
        }
    }

    printf("t3 took %f giga cycles and the result is: %f\n", giga_cycle, ret);
}
/*
 * Benchmark driver for t6(): allocates two 16-byte aligned float arrays of
 * LEN6 elements, initializes them, times one t6(A, D) call with the TSC and
 * prints the elapsed cycle count divided by 1e9 plus the sum of A.
 *
 * Returns 1 if either allocation fails, 0 otherwise.
 */
int main(){
    float *A = (float *) memalign(16, LEN6 * sizeof(float));
    float *D = (float *) memalign(16, LEN6 * sizeof(float));

    // memalign() returns NULL on failure; do not write through it.
    if (A == NULL || D == NULL) {
        fprintf(stderr, "t6: memalign failed\n");
        free(A);
        free(D);
        return 1;
    }

    for (int i = 0; i < LEN6; i++) {
        A[i] = (float)(i)/(float)LEN6;
        D[i] = (float)(i+3)/(float)LEN6;
    }

    unsigned long long start_c, end_c, diff_c;
    start_c = _rdtsc();
    t6(A, D);
    end_c = _rdtsc();
    diff_c = end_c - start_c;
    float giga_cycle = diff_c / 1000000000.0;

    float ttt = (float)0.;
#pragma novector
    for (int i = 0; i < LEN6; i++) {
        ttt += A[i];
    }

    printf("t6 took\t %.2f and the result is %f\n", giga_cycle, ttt);

    free(A);
    free(D);
    return 0;
}
/*
 * Benchmark driver for t5(): allocates five 16-byte aligned float arrays of
 * LEN5 elements, initializes them, times one t5(A, B, C, D, E) call with the
 * TSC and prints the elapsed cycle count divided by 1e9 plus the sum of A.
 *
 * Returns 1 if any allocation fails, 0 otherwise.
 */
int main(){
    float *A = (float *) memalign(16, LEN5 * sizeof(float));
    float *B = (float *) memalign(16, LEN5 * sizeof(float));
    float *C = (float *) memalign(16, LEN5 * sizeof(float));
    float *D = (float *) memalign(16, LEN5 * sizeof(float));
    float *E = (float *) memalign(16, LEN5 * sizeof(float));

    // memalign() returns NULL on failure; free(NULL) is a no-op, so every
    // pointer can be released unconditionally.
    if (A == NULL || B == NULL || C == NULL || D == NULL || E == NULL) {
        fprintf(stderr, "t5: memalign failed\n");
        free(A);
        free(B);
        free(C);
        free(D);
        free(E);
        return 1;
    }

    for (int i = 0; i < LEN5; i++) {
        A[i] = (float)(i)/(float)LEN5;
        B[i] = (float)(i+1)/(float)LEN5;
        C[i] = (float)(i+2)/(float)LEN5;
        D[i] = (float)(i+3)/(float)LEN5;
        E[i] = (float)(i+4)/(float)LEN5;
    }

    unsigned long long start_c, end_c, diff_c;
    start_c = _rdtsc();
    t5(A, B, C, D, E);
    end_c = _rdtsc();
    diff_c = end_c - start_c;
    float giga_cycle = diff_c / 1000000000.0;

    float ttt = (float)0.;
#pragma novector
    for (int i = 0; i < LEN5; i++)
        ttt += A[i];

    printf("t5 took\t %.2f and the result is %f\n", giga_cycle, ttt);

    free(A);
    free(B);
    free(C);
    free(D);
    free(E);
    return 0;
}
/*
 * Builds one `struct metrics` sample from the per-cpu counter snapshots in
 * `ctrs` and from the pmu_rdctr() counters, then passes it to sample().
 *
 * ctrs     - per-cpu counter blocks, indexed by cpu number.
 * duration - length of the sampling interval; compared against the
 *            clocks1 delta below, so presumably in TSC cycles (the value is
 *            stored unchanged in s.duration).
 *
 * Side effects: updates lastctr[], sevents[], prevnreads and prevnwrites.
 */
void printcounters(struct counter *ctrs, uint64_t duration) {
    struct metrics s = {0};
    s.timestamp = _rdtsc();
    s.duration = duration;

    // We skip the last core
    // (the loop covers cpus 1 .. gbl.ncpus-4; cpu 0 and the final three cpu
    // numbers are never read.  NOTE(review): presumably those four hardware
    // threads make up the skipped core - confirm against the cpu numbering.)
    int corethreads =0;
    for (int cpu = 1; cpu < gbl.ncpus-3; ++cpu) {
        double delta[NEVENTS];
        // volatile because another thread is changing it.
        volatile struct counter *p = &ctrs[cpu];
        for (int i = 0; i < NEVENTS; ++i) {
            // Each event's counts are read with a single 512-bit load and
            // viewed as eight 64-bit values through the union.
            union { __m512d c; uint64_t values[8]; } t;
            t.c = _mm512_load_pd((void *)&p->counts[i][0]);
            delta[i] = perf_scale_delta(t.values, lastctr[cpu].counts[i]);
            // Remember this snapshot as the baseline for the next call.
            _mm512_storenrngo_pd((void *)&lastctr[cpu].counts[i][0], t.c);
            // Negative deltas are clamped to zero before being accumulated.
            if (delta[i] < 0) delta[i] = 0;
            sevents[i] += delta[i];
        }
        // A hardware thread counts as active when its clocks1 delta exceeds
        // half of the interval.
        if (2*delta[clocks1] > duration) {
            s.nthreads += 1;
            corethreads += 1;
        }
        if ((cpu % 4) == 0) // Last thread on this core
        {
            // A core counts as active if any of its threads was active.
            if (corethreads) s.ncores += 1;
            corethreads = 0;
        }
        s.vpu_ea += delta[vpu_ea];
        s.instrs += delta[instrs];
        s.vinstrs += delta[vpu_ie];
    }

    // Sum the read (index 0) and write (index 1) counters over every
    // (box, j) pair; the difference to the previous totals is scaled by 64.
    // NOTE(review): 64 is presumably the bytes-per-access (cache line) size.
    uint64_t nreads = 0, nwrites = 0;
    for (int i = 0; i < NGBOXES; ++i)
        for (int j = 0; j < 2; ++j) {
            nreads += pmu_rdctr(i, j, 0);
            nwrites += pmu_rdctr(i, j, 1);
        }
    s.rbytes = (nreads - prevnreads) * 64;
    s.wbytes = (nwrites - prevnwrites)* 64;
    prevnreads = nreads;
    prevnwrites = nwrites;

    sample(&s);
}
//-------------------------------------------------------------------------------------- // FUNCTION: RCCE_wtime //-------------------------------------------------------------------------------------- // clean up at end of library usage (memory unmapping) //-------------------------------------------------------------------------------------- double RCCE_wtime(void) { #ifdef SCC return ( ((double)_rdtsc())/(RC_REFCLOCKGHZ*1.e9)); #else return (omp_get_wtime()); #endif }
// Gets the raw frecuency of the cpu double* getFrequency(int precision, int tcks) { // Precision gives us the amount of times it'll be approximated. // Tcks gives us the amount of ticks to try to get the frecuency // 2 is the minimum value. int it = 0, n = precision, c = 0; unsigned long counter = 0; // We check limits... if (tcks > 18) tcks = 18; if (tcks < 2) tcks = 2; // Startup variables double cpuFreqs; cpuFreq = 0; // Iterate for elements. for (it = 0; it < n; it++) { ticks = 0; int oldticks = 1; // Waiting for a tick change helps us solve // Any redundancy in the numbers // "Syncing" the counter with the ticks is really helpful while (ticks != oldticks) ; counter = _rdtsc(); // Wait for another tick change, one is usually enough. while (ticks < tcks) ; counter = _rdtsc() - counter; // Normalizes to Mhz cpuFreqs = counter / ((ticks - 1) * 54925.40115); cpuFreq += cpuFreqs; if (!fix_flag) { fix_flag++; cpuFreq = 0; it--; } } // Average if needed. cpuFreq /= n; return &cpuFreq; }
/*
 * Busy-waits for one clk_sys_udelay(400) period and reports how it looked
 * on both clocks: the RTC counter values before and after the delay are
 * written to *rtc_start and *rtc_end, and the TSC cycles elapsed since the
 * delay began are returned.  Comparing the two is meant to expose the
 * correlation between TSC and RTC time.
 *
 * A measurement in which the RTC value went backwards is repeated, unless
 * the top four bits of the start value were all set.
 *
 * `count` is not used.
 */
static uint64_t spin_loop(unsigned int count, unsigned int *rtc_start,
			  unsigned int *rtc_end)
{
	uint64_t tsc_at_start;

	for (;;) {
		*rtc_start = QM_RTC[QM_RTC_0].rtc_ccvr;
		tsc_at_start = _rdtsc();
		clk_sys_udelay(400);
		*rtc_end = QM_RTC[QM_RTC_0].rtc_ccvr;

		/* Normal case: the RTC did not run backwards. */
		if (*rtc_end >= *rtc_start) {
			break;
		}
		/* Backwards, but the start value had its top nibble set: accept. */
		if ((*rtc_start & 0xF0000000) == 0xF0000000) {
			break;
		}
	}

	return _rdtsc() - tsc_at_start;
}
/*
 * Builds one `struct metrics` sample from the per-cpu counter snapshots in
 * `ctrs` and from the byte counters reported by `pcm`, then passes it to
 * sample().
 *
 * ctrs     - per-cpu counter blocks, indexed by cpu number.
 * duration - length of the sampling interval; compared against the clocks
 *            delta below, so presumably in the same unit as that counter
 *            (the value is stored unchanged in s.duration).
 *
 * Side effects: updates lastctr[], sevents[], threadspercore[],
 * lastBytesRead and lastBytesWritten.
 */
void printcounters(struct counter *ctrs, uint64_t duration) {
    struct metrics s = {0};
    uint64_t thisBytesWritten = pcm->bytesWritten();
    uint64_t thisBytesRead = pcm->bytesRead();

    memset(threadspercore, 0, gbl.ncores * sizeof(int));
    s.timestamp = _rdtsc();
    s.duration = duration;

    for (int cpu = 0; cpu < gbl.ncpus; ++cpu) {
        double delta[NEVENTS];
        // volatile because another thread is changing it.
        volatile struct counter *p = &ctrs[cpu];
        for (int i = 0; i < NEVENTS; ++i) {
            // Each event's counts are read with a single 256-bit load and
            // viewed as four 64-bit values through the union.
            union { __m256d c; uint64_t values[4]; } t;
            t.c = _mm256_load_pd((const double *)&p->counts[i][0]);
            delta[i] = perf_scale_delta(t.values, lastctr[cpu].counts[i]);
            // Remember this snapshot as the baseline for the next call.
            _mm256_store_pd((double *)&lastctr[cpu].counts[i][0], t.c);
            // Negative deltas are clamped to zero before being accumulated.
            if (delta[i] < 0) delta[i] = 0;
            sevents[i] += delta[i];
        }
        //printf("clocks %g duration %lu\n", delta[clocks], duration);
        // A cpu counts as an active thread when its clocks delta exceeds
        // half of the interval; its core is looked up through pcm.
        if (2*delta[clocks] > duration) {
            int thiscore = pcm->getSocketId(cpu) * gbl.corespersocket + pcm->getCoreId(cpu);
            ++s.nthreads;
            ++threadspercore[thiscore];
        }
        s.dsimd += delta[simd_dp];
        s.dsse += delta[sse_dp];
        s.dscalar += delta[scalar_dp];
        s.ssimd += delta[simd_sp];
        s.ssse += delta[sse_sp];
        s.sscalar += delta[scalar_sp];
        s.instrs += delta[instrs];
    }

    // Bytes moved since the previous call.
    s.rbytes = thisBytesRead - lastBytesRead;
    s.wbytes = thisBytesWritten - lastBytesWritten;
    lastBytesRead = thisBytesRead;
    lastBytesWritten = thisBytesWritten;

    // A core is active if at least one of its threads was active.
    for (int i = 0; i < gbl.ncores; ++i)
        if (threadspercore[i]) ++s.ncores;

    sample(&s);
}
/*
 * Starts a timing interval: allocates a 64-bit slot on the heap and stores
 * the current TSC value in it.  Allocation and time stamp are taken inside
 * an OpenMP critical section.
 *
 * Returns the slot, to be handed to _perf_end() (which frees it), or NULL
 * if the allocation failed.
 */
unsigned __int64* _perf_start(void)
{
    unsigned __int64 *stime;

#pragma omp critical
    {
        stime = malloc(sizeof(*stime));
        /* Only stamp the slot if the allocation succeeded. */
        if (stime != NULL) {
            *stime = _rdtsc();
        }
    }

    return stime;
}
/*
 * Ends a timing interval: replaces the start stamp in *stime with the TSC
 * cycles elapsed since then, adds that value to perfsum[index], increments
 * perfcount[index] (both inside an OpenMP critical section) and frees the
 * slot.
 *
 * stime - slot holding the start TSC value; freed by this call.  A NULL
 *         slot is ignored and nothing is recorded.
 * index - position in perfsum[] / perfcount[] to update.
 */
void _perf_end(unsigned __int64 *stime, int index)
{
    if (stime == NULL) {
        return;
    }

    *stime = _rdtsc() - *stime;

#pragma omp critical
    {
        perfsum[index] += (double)(*stime);
        perfcount[index]++;
    }

    free(stime);
}
static void rdtsc_calibrate(){ uint64 t1, t2; int32 i; ShowStatus("Calibrating Timer Source, please wait... "); RDTSC_CLOCK = 0; for(i = 0; i < 5; i++){ t1 = _rdtsc(); usleep(1000000); //1000 MS t2 = _rdtsc(); RDTSC_CLOCK += (t2 - t1) / 1000; } RDTSC_CLOCK /= 5; RDTSC_BEGINTICK = _rdtsc(); ShowMessage(" done. (Frequency: %u Mhz)\n", (uint32)(RDTSC_CLOCK/1000) ); }
void * driver0(void * arg) { int i,j,k, iter_count =0; uint64_t line_count=0, init_tsc, end_tsc; size_t * read_pntr; read_pntr = array; // pin core affinity if(pin_cpu(pid, cpu_read) == -1) { err(1,"cannot set cpu read affinity"); } else{ printf(" read thread pinned to core %d to run\n",cpu_read); } fprintf(stderr,"total_lines = %ld\n",total_lines); read_sum_tsc = 0; while(line_count < total_lines) { i = 0; while(exchange_flag == 0) { i++; } init_tsc = _rdtsc(); // if(iter_count < 10)fprintf(stderr,"reader calling kernel\n"); read_pntr = read_buf(seg_size, read_pntr); // if(iter_count < 10)fprintf(stderr,"reader returned from kernel\n"); end_tsc = _rdtsc(); read_sum_tsc += (end_tsc - init_tsc); line_count += seg_size; iter_count++; exchange_flag = 0; } fprintf(stderr," from read thread, line_count = %ld, TSC sum = %lu, latency = %g\n", line_count, read_sum_tsc,(double)read_sum_tsc/(double)line_count); pthread_exit(NULL); }
/*
 * Benchmark driver for t1(): allocates two 16-byte aligned arrays of 1024
 * floats, fills them with 1/(i+1) and 2/(i+1), times one t1(A, B) call with
 * the TSC and prints the elapsed cycle count divided by 1e9 plus the sum of
 * A after the call.
 *
 * Returns 1 if either allocation fails, 0 otherwise.
 */
int main() {
    float *A = (float *) _mm_malloc(1024 * sizeof(float), 16);
    float *B = (float *) _mm_malloc(1024 * sizeof(float), 16);

    // Do not write through a failed allocation.
    if (A == NULL || B == NULL) {
        fprintf(stderr, "t1: _mm_malloc failed\n");
        if (A != NULL) _mm_free(A);
        if (B != NULL) _mm_free(B);
        return 1;
    }

    for (int i = 0; i < 1024; i++) {
        A[i] = 1. / (i+1);
        B[i] = 2. / (i+1);
    }

    unsigned long long start_c, end_c, diff_c;
    start_c = _rdtsc();
    t1(A, B);
    end_c = _rdtsc();
    diff_c = end_c - start_c;
    float giga_cycle = diff_c / 1000000000.0;

    float ret = 0;
    for (int i = 0; i < 1024; i++) {
        ret += A[i];
    }

    printf("t1 took %f giga cycles and the result is: %f\n", giga_cycle, ret);

    // Memory from _mm_malloc() must be released with _mm_free().
    _mm_free(A);
    _mm_free(B);
    return 0;
}
void * reader(void * ev) { int tid = (int) ev; __int64 tsc; while (1) { // !done) { __int64 value; value = GET(); tsc = _rdtsc(); printf("%d: Got delta %f, size=%d\n",tid, ((double) (tsc - value)) / (double) 3000000000.0,GETSIZE()); usleep(read_sleep); } printf("Thread exiting\n"); return(NULL); }
/**
 * Claims the next slot of the global ring buffer and stamps it.
 *
 * The slot at the current g_ringIdx is taken by advancing g_ringIdx
 * (modulo kMaxRBEntries) with atomic_cas(); the attempt is repeated until
 * the compare-and-swap succeeds.  The claimed entry then receives the low
 * 32 bits of the TSC, the given type and the low 32 bits of pthread_self().
 *
 * @param t  type tag stored in the entry.
 * @return   pointer to the claimed entry inside g_ring.
 */
RingBufferEntry* allocEntry(RingBufferType t) {
  ASSERT(Util::isPowerOfTwo(kMaxRBEntries));

  RingBufferEntry* entry;
  for (;;) {
    int claimed = g_ringIdx;
    int next = (claimed + 1) % kMaxRBEntries;
    entry = &g_ring[claimed];
    if (atomic_cas(&g_ringIdx, claimed, next)) {
      break;
    }
  }

  entry->m_ts = uint32_t(_rdtsc());
  entry->m_type = t;
  entry->m_threadId = (uint32_t)((int64)pthread_self() & 0xFFFFFFFF);
  return entry;
}
/*
 * Returns a millisecond time stamp, truncated to unsigned int.
 *
 * The source depends on the build configuration:
 *  - PLATFORM_WINDOWS: GetTickCount().
 *  - ENABLE_RDTSC:     TSC ticks since RDTSC_BEGINTICK divided by RDTSC_CLOCK.
 *  - POSIX monotonic clock available (or FreeBSD >= 5.1.0):
 *                      clock_gettime(CLOCK_MONOTONIC).
 *  - otherwise:        gettimeofday().
 */
unsigned int getnowtime()
{
	unsigned int now_ms;

#if defined PLATFORM_WINDOWS
	now_ms = GetTickCount();
#elif defined(ENABLE_RDTSC)
	now_ms = (unsigned int)((_rdtsc() - RDTSC_BEGINTICK) / RDTSC_CLOCK);
#elif (defined(_POSIX_TIMERS) && _POSIX_TIMERS > 0 && defined(_POSIX_MONOTONIC_CLOCK) /* posix compliant */) || (defined(__FreeBSD_cc_version) && __FreeBSD_cc_version >= 500005 /* FreeBSD >= 5.1.0 */)
	{
		struct timespec ts;
		clock_gettime(CLOCK_MONOTONIC, &ts);
		now_ms = (unsigned int)(ts.tv_sec * 1000 + ts.tv_nsec / 1000000);
	}
#else
	{
		struct timeval tv;
		gettimeofday(&tv, NULL);
		now_ms = (unsigned int)(tv.tv_sec * 1000 + tv.tv_usec / 1000);
	}
#endif

	return now_ms;
}
/*
 * Returns a millisecond time stamp as a 64-bit value.
 *
 * The source depends on the build configuration:
 *  - PLATFORM_WINDOWS: time(NULL) * 1000 plus the millisecond field of
 *                      GetLocalTime().
 *  - ENABLE_RDTSC:     TSC ticks since RDTSC_BEGINTICK divided by RDTSC_CLOCK.
 *  - POSIX monotonic clock available (or FreeBSD >= 5.1.0):
 *                      clock_gettime(CLOCK_MONOTONIC).
 *  - otherwise:        gettimeofday().
 *
 * All arithmetic is carried out in 64 bits, so the result is neither cut
 * down to 32 bits nor subject to overflow of a 32-bit time_t.
 */
int64_t ox_getnowtime(void)
{
#if defined PLATFORM_WINDOWS
	int64_t second = time(NULL);
	SYSTEMTIME sys;
	GetLocalTime(&sys);
	return second*1000 + sys.wMilliseconds;
#elif defined(ENABLE_RDTSC)
	return (int64_t)((_rdtsc() - RDTSC_BEGINTICK) / RDTSC_CLOCK);
#elif (defined(_POSIX_TIMERS) && _POSIX_TIMERS > 0 && defined(_POSIX_MONOTONIC_CLOCK) /* posix compliant */) || (defined(__FreeBSD_cc_version) && __FreeBSD_cc_version >= 500005 /* FreeBSD >= 5.1.0 */)
	struct timespec tval;
	clock_gettime(CLOCK_MONOTONIC, &tval);
	return (int64_t)tval.tv_sec * 1000 + tval.tv_nsec / 1000000;
#else
	struct timeval tval;
	gettimeofday(&tval, NULL);
	return (int64_t)tval.tv_sec * 1000 + tval.tv_usec / 1000;
#endif
}
void * writer(void * ev) { int tid = (int) ev; int i,j; sleep (1); for (i = 0; i < iter; i++) { __int64 val; if ((rand() % sleep_frac) == 0) { usleep(write_sleep); } val = _rdtsc(); // rand(); printf("%d:%d Put size=%d\n",tid, i, GETSIZE()); PUT(val); } cond_begin; num_write --; if (num_write == 0) { cond_event_tm_signal(&done_cond); } cond_end; return(NULL); }
void setup_output() { char hdr[1024]; hdr[sizeof(hdr)-1] = 0; hdr[0] = 0; lastBytesRead = pcm->bytesRead(); lastBytesWritten = pcm->bytesWritten(); threadspercore = new int[gbl.ncores]; labels[fThreads].max = gbl.ncpus; labels[fInst].max = gbl.ncores * 4 * gbl.hz; labels[fFlopsSP].max = gbl.ncores * 8 * 2 * gbl.hz; labels[fFlopsDP].max = gbl.ncores * 4 * 2 * gbl.hz; hdrLabels[0] = "Time"; hdrLabels[1] = "Threads"; hdrIndexes[0] - fTime; hdrIndexes[1] = fThreads; int j = 2; for (int i = 0; i < nfields; ++i) if (i != fTime && i != fThreads) { hdrLabels[j] = labels[i].name; hdrIndexes[j] = i; ++j; } bool first = true; for (int i = 0; i < nfields; ++i) { if (first) first = false; else strncat(hdr, ",", sizeof(hdr)-1); strncat(hdr, hdrLabels[i], sizeof(hdr)-1); } strncat(hdr, "\n", sizeof(hdr)-1); if (gbl.server) { setup_server(); } else if (gbl.outfile) { outfile = fopen(gbl.outfile, "w"); if (outfile == NULL) err(1, "create %s", gbl.outfile); } gbl.hdr[0] = 0; gbl.hdr[sizeof(gbl.hdr)-1] = 0; if (gbl.server) { for (int i = 0; i < nfields; ++i) if (i != fTime) snprintf(gbl.hdr+strlen(gbl.hdr), sizeof(gbl.hdr)-1, "%s=%g,%s,%g\n", labels[i].name, labels[i].max, labels[i].units, labels[i].factor); strncat(gbl.hdr, hdr, sizeof(gbl.hdr)-1); } else if (outfile) { for (int i = 0; i < nfields; ++i) if (i != fTime) fprintf(outfile, "%s=%g,%s,%g\n", labels[i].name, labels[i].max, labels[i].units, labels[i].factor); fprintf(outfile, hdr); } starttsc = _rdtsc(); }
/*
 * Monte Carlo European option pricing benchmark (single precision).
 *
 * Prints the build configuration, then runs one OpenMP parallel region in
 * which every thread
 *   1. allocates and randomly initializes its own OPT_PER_THREAD options,
 *   2. prices each option from nblocks blocks of RAND_BLOCK_LENGTH Gaussian
 *      samples drawn from its own VSL stream,
 *   3. compares the results against BlackScholesBodyCPU() and accumulates
 *      the error statistics through the region's reductions,
 *   4. releases its stream and buffers.
 * Thread 0 records the wall time and TSC around step 2 (between two
 * barriers).  After the region the error norms and throughput figures are
 * printed.  Always returns 0.
 */
int main(int argc, char* argv[])
{
    double sTime, eTime;
    double sum_delta = 0.0;
    double sum_ref = 0.0;
    double max_delta = 0.0;
    double sumReserve = 0.0;

    printf("Monte Carlo European Option Pricing Single Precision\n\n");
    printf("Compiler Version = %d\n", __INTEL_COMPILER/100);
    printf("Release Update = %d\n", __INTEL_COMPILER_UPDATE);
    printf("Build Time = %s %s\n", __DATE__, __TIME__);
    printf("Path Length = %d\n", RAND_N);
    printf("Number of Options = %d\n", OPT_N);
    printf("Block Size = %d\n", RAND_BLOCK_LENGTH);
    printf("Worker Threads = %d\n\n", NTHREADS);

    // Bytes needed for one per-thread array of OPT_PER_THREAD floats.
    const int mem_size = sizeof(float)*OPT_PER_THREAD;
#ifndef _OPENMP
    NTHREADS = 1;
#endif
    // Per-thread sample buffers and random streams, indexed by thread id.
    float *samples[MAX_THREADS];
    VSLStreamStatePtr Streams[MAX_THREADS];
    const int nblocks = RAND_N/RAND_BLOCK_LENGTH;

#pragma omp parallel reduction(+ : sum_delta) reduction(+ : sum_ref) reduction(+ : sumReserve) reduction(max : max_delta)
    {
#ifdef _OPENMP
        int threadID = omp_get_thread_num();
#else
        int threadID = 0;
#endif
        // Each thread seeds its input generator differently.
        unsigned int randseed = RANDSEED + threadID;
        srand(randseed);

        float *CallResultList = (float *)scalable_aligned_malloc(mem_size, SIMDALIGN);
        float *CallConfidenceList = (float *)scalable_aligned_malloc(mem_size, SIMDALIGN);
        float *StockPriceList = (float *)scalable_aligned_malloc(mem_size, SIMDALIGN);
        float *OptionStrikeList = (float *)scalable_aligned_malloc(mem_size, SIMDALIGN);
        float *OptionYearsList = (float *)scalable_aligned_malloc(mem_size, SIMDALIGN);

        // Random option parameters; results start out as zero.
        for(int i = 0; i < OPT_PER_THREAD; i++)
        {
            CallResultList[i] = 0.0f;
            CallConfidenceList[i] = 0.0f;
            StockPriceList[i] = RandFloat_T(5.0f, 50.0f, &randseed);
            OptionStrikeList[i] = RandFloat_T(10.0f, 25.0f, &randseed);
            OptionYearsList[i] = RandFloat_T(1.0f, 5.0f, &randseed);
        }

        samples[threadID] = (float *)scalable_aligned_malloc(RAND_BLOCK_LENGTH * sizeof(float), SIMDALIGN);
        vslNewStream(&(Streams[threadID]), VSL_BRNG_MT2203 + threadID, RANDSEED);

        // All threads finish their setup before timing starts.
#pragma omp barrier
        if (threadID == 0)
        {
            printf("Starting options pricing...\n");
            sTime = second();
            start_cyc = _rdtsc();
        }

        for(int opt = 0; opt < OPT_PER_THREAD; opt++)
        {
            // Per-option constants: scale and mean handed to the Gaussian
            // generator, plus the stock price (Y) and strike (Z).
            const float VBySqrtT = VLog2E * sqrtf(OptionYearsList[opt]);
            const float MuByT = MuLog2E * OptionYearsList[opt];
            const float Y = StockPriceList[opt];
            const float Z = OptionStrikeList[opt];

            // v0 accumulates the payoffs, v1 their squares.
            float v0 = 0.0f;
            float v1 = 0.0f;
            for(int block = 0; block < nblocks; ++block)
            {
                float *rand = samples[threadID];
                vsRngGaussian (VSL_RNG_METHOD_GAUSSIAN_ICDF, Streams[threadID], RAND_BLOCK_LENGTH, rand, MuByT, VBySqrtT);
#pragma vector aligned
#pragma simd reduction(+:v0) reduction(+:v1)
#pragma unroll(4)
                for(int i=0; i < RAND_BLOCK_LENGTH; i++)
                {
                    // Payoff of this path, floored at zero.
                    float callValue = Y * exp2f(rand[i]) - Z;
                    callValue = (callValue > 0.0) ? callValue : 0.0;
                    v0 += callValue;
                    v1 += callValue * callValue;
                }
            }
            const float exprt = exp2f(RLog2E*OptionYearsList[opt]);
            CallResultList[opt] = exprt * v0 * INV_RAND_N;
            const float stdDev = sqrtf((F_RAND_N * v1 - v0 * v0) * STDDEV_DENOM);
            CallConfidenceList[opt] = (float)(exprt * stdDev * CONFIDENCE_DENOM);
        } //end of opt

        // All threads finish pricing before timing stops.
#pragma omp barrier
        if (threadID == 0)
        {
            end_cyc = _rdtsc();
            eTime = second();
            printf("Parallel simulation completed in %f seconds.\n", eTime-sTime);
            printf("Validating the result...\n");
        }

        // Validation against the reference; the sums and the maximum are
        // combined across threads by the reductions of the parallel region.
        double delta = 0.0, ref = 0.0, L1norm = 0.0;
        int max_index = 0;
        double max_local = 0.0;
        for(int i = 0; i < OPT_PER_THREAD; i++)
        {
            double callReference, putReference;
            BlackScholesBodyCPU(
                callReference,
                putReference,
                StockPriceList[i],
                OptionStrikeList[i], OptionYearsList[i], RISKFREE, VOLATILITY );
            ref = callReference;
            delta = fabs(callReference - CallResultList[i]);
            sum_delta += delta;
            sum_ref += fabs(ref);
            // Options whose error is at most 1e-6 do not contribute here.
            if(delta > 1e-6) sumReserve += CallConfidenceList[i] / delta;
            max_local = delta>max_local? delta: max_local;
        }
        max_delta = max_local>max_delta? max_local: max_delta;

        vslDeleteStream(&(Streams[threadID]));
        scalable_aligned_free(samples[threadID]);
        scalable_aligned_free(CallResultList);
        scalable_aligned_free(CallConfidenceList);
        scalable_aligned_free(StockPriceList);
        scalable_aligned_free(OptionStrikeList);
        scalable_aligned_free(OptionYearsList);
    }//end of parallel block

    sumReserve /= (double)OPT_N;
    const double L1norm = sum_delta / sum_ref;
    printf("L1_Norm = %4.3E\n", L1norm);
    printf("Average RESERVE = %4.3f\n", sumReserve);
    printf("Max Error = %4.3E\n", max_delta);

    // Cycle and wall-clock figures for the timed pricing phase.
    const unsigned long long cyc = end_cyc - start_cyc;
    const double optcyc = (double)cyc/(double)OPT_N;
    printf("==========================================\n");
    printf("Total Cycles = %lld\n", cyc);
    printf("Cyc/opt = %8.3f\n", optcyc);
    printf("Time Elapsed = %8.3f\n", eTime-sTime);
    printf("Options/sec = %8.3f\n", OPT_N/(eTime-sTime));
    printf("==========================================\n");
    return 0;
}
int main(void){ /* stack buffers */ char sbuf1[SHORTBUF]; char sbuf2[LONGBUF]; /* heap buffers */ char *hbuf1 = NULL; char *hbuf2 = NULL; uint64_t cycles; int i; hbuf1 = (char *)malloc(SHORTBUF); hbuf2 = (char *)malloc(LONGBUF); if((!hbuf1) || (!hbuf2)){ fprintf(stderr, "malloc failed\n"); exit(EXIT_FAILURE); } //just for load libc addr bzero(sbuf1, SHORTBUF); /* ------ test short buffers ------ */ cycles = _rdtsc(); for(i = NSAMPLES; i > 0; i--){ bzero(sbuf1, SHORTBUF); } cycles = _rdtsc() - cycles; printf("[STACK] [%d] bzero: %" PRIu64 " cycles\n", SHORTBUF, cycles); cycles = _rdtsc(); for(i = NSAMPLES; i > 0; i--){ bzero(hbuf1, SHORTBUF); } cycles = _rdtsc() - cycles; printf("[HEAP] [%d] bzero: %" PRIu64 " cycles\n", SHORTBUF, cycles); cycles = _rdtsc(); for(i = NSAMPLES; i > 0; i--){ my_bzero(sbuf1, SHORTBUF); } cycles = _rdtsc() - cycles; printf("[STACK] [%d] my_bzero: %" PRIu64 " cycles\n", SHORTBUF, cycles); cycles = _rdtsc(); for(i = NSAMPLES; i > 0; i--){ my_bzero(hbuf1, SHORTBUF); } cycles = _rdtsc() - cycles; printf("[HEAP] [%d] my_bzero: %" PRIu64 " cycles\n", SHORTBUF, cycles); puts(""); /* ------ test long buffers ------ */ cycles = _rdtsc(); for(i = NSAMPLES; i > 0; i--){ bzero(sbuf2, LONGBUF); } cycles = _rdtsc() - cycles; printf("[STACK] [%d] bzero: %" PRIu64 " cycles\n", LONGBUF, cycles); cycles = _rdtsc(); for(i = NSAMPLES; i > 0; i--){ bzero(hbuf2, LONGBUF); } cycles = _rdtsc() - cycles; printf("[HEAP] [%d] bzero: %" PRIu64 " cycles\n", LONGBUF, cycles); cycles = _rdtsc(); for(i = NSAMPLES; i > 0; i--){ my_bzero(sbuf2, LONGBUF); } cycles = _rdtsc() - cycles; printf("[STACK] [%d] my_bzero: %" PRIu64 " cycles\n", LONGBUF, cycles); cycles = _rdtsc(); for(i = NSAMPLES; i > 0; i--){ my_bzero(hbuf2, LONGBUF); } cycles = _rdtsc() - cycles; printf("[HEAP] [%d] my_bzero: %" PRIu64 " cycles\n", LONGBUF, cycles); free(hbuf1); free(hbuf2); hbuf1 = NULL; hbuf2 = NULL; return 0; }
void main(int argc, char ** argv) { double *a, *b, *c, xx=0.01, bw, avg_bw, best_bw=-1.0; char * buf1, *buf2, *buf3; int i,j,k,offset_a=0,offset_b=0,offset_c=0, mult=1,iter=1000, c_val; int len,num_pages, num_lines, cpu_run,scale; u64 start, stop, run_time, call_start, call_stop, call_run_time,total_bytes=0; __pid_t pid=0; int cpu_setsize; cpu_set_t mask; int *buff; size_t buf_size; off_t offset = 0; int fd = -1; // process input arguments if(argc < 3 ){ printf("affinity needs 2 arguments, cpu_run, call count multiplier def = 1\n"); printf(" argc = %d\n",argc); usage(); err(1, "bad arguments"); } while ((c_val = getopt(argc, argv, "i:r:l:m:a:b:c")) != -1) { switch(c_val) { case 'r': cpu_run = atoi(optarg); break; case 'm': mult = atoi(optarg); break; default: err(1, "unknown option %c", c_val); } } // pin core affinity for initialization if(pin_cpu(pid, cpu_run) == -1) { err(1,"failed to set affinity"); } else{ fprintf(stderr," process pinned to core %d for triad run\n",cpu_run); } // set buffer sizes and loop tripcount buf_size = (u64)4096*(u64)num_pages; num_lines=64*num_pages; iter = iter*mult; // malloc and initialize buffers printf(" starting malloc loop of %d iterations with buf_size = %ld, num_lines = %d\n",iter,buf_size, num_lines); call_start = _rdtsc(); for(i=0;i<iter;i++){ start = _rdtsc(); if(pin_cpu(pid, cpu_run) == -1) { err(1,"failed to set affinity"); } else{ fprintf(stderr," process pinned to core %d for triad run\n",cpu_run); } stop = _rdtsc(); run_time = stop - start; } call_stop = _rdtsc(); call_run_time = call_stop - call_start; // printout printf(" allocating %lld bytes and initializing and freeing took %lld cycles\n",(u64)len*(u64)iter,run_time); }
int main(int argc, char *argv[]) { sdl_state SDLState = {}; platform_work_queue HighPriorityQueue = {}; SDLMakeQueue(&HighPriorityQueue, 6); platform_work_queue LowPriorityQueue = {}; SDLMakeQueue(&LowPriorityQueue, 2); GlobalPerfCountFrequency = SDL_GetPerformanceFrequency(); SDLGetEXEFileName(&SDLState); char SourceGameCodeDLLFullpath[SDL_STATE_FILE_NAME_COUNT]; SDLBuildEXEPathFileName(&SDLState, "handmade.dylib", sizeof(SourceGameCodeDLLFullpath), SourceGameCodeDLLFullpath); char TempGameCodeDLLFullpath[SDL_STATE_FILE_NAME_COUNT]; SDLBuildEXEPathFileName(&SDLState, "handmade_temp.dylib", sizeof(TempGameCodeDLLFullpath), TempGameCodeDLLFullpath); char GameCodeLockFullpath[SDL_STATE_FILE_NAME_COUNT]; SDLBuildEXEPathFileName(&SDLState, "lock.tmp", sizeof(GameCodeLockFullpath), GameCodeLockFullpath); if (SDL_Init(SDL_INIT_VIDEO | SDL_INIT_AUDIO) != 0) { printf("Failed to initialize SDL: %s\n", SDL_GetError()); return -1; } SDL_Window *Window = SDL_CreateWindow("Handmade Hero", SDL_WINDOWPOS_CENTERED, SDL_WINDOWPOS_CENTERED, 960, 540, SDL_WINDOW_OPENGL); if (!Window) { printf("Failed to create window: %s\n", SDL_GetError()); return -1; } SDLResizeDIBSection(Window, &GlobalBackBuffer, 960, 540); // TODO: Set GameUpdateHz by monitor refresh HZ real32 GameUpdateHz = 60.0f; real32 TargetSecondsPerFrame = 1.0f / GameUpdateHz; sdl_sound_output SoundOutput = {}; SoundOutput.SamplesPerSecond = 48000; SoundOutput.BytesPerSample = sizeof(int16) * 2; SoundOutput.BufferSize = SoundOutput.SamplesPerSecond * SoundOutput.BytesPerSample; SDL_AudioDeviceID Audio = SDLInitSound(SoundOutput.SamplesPerSecond); u32 MaxPossibleOverrun = 2 * 4 * sizeof(u16); int16 *Samples = (int16 *)mmap(0, (size_t)(SoundOutput.BufferSize + MaxPossibleOverrun), PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); GlobalRunning = true; #if HANDMADE_INTERNAL void *BaseAddress = (void *)Terabytes(2); #else void *BaseAddress = 0; #endif game_memory GameMemory = {}; GameMemory.PermanentStorageSize = 
Megabytes(64); GameMemory.TransientStorageSize = Gigabytes(256); GameMemory.HighPriorityQueue = &HighPriorityQueue; GameMemory.LowPriorityQueue = &LowPriorityQueue; GameMemory.PlatformAPI.AddEntry = SDLAddEntry; GameMemory.PlatformAPI.CompleteAllWork = SDLCompleteAllWork; GameMemory.PlatformAPI.GetAllFilesOfTypeBegin = SDLGetAllFilesOfTypeBegin; GameMemory.PlatformAPI.GetAllFilesOfTypeEnd = SDLGetAllFilesOfTypeEnd; GameMemory.PlatformAPI.OpenNextFile = SDLOpenNextFile; GameMemory.PlatformAPI.ReadDataFromFile = SDLReadDataFromFile; GameMemory.PlatformAPI.FileError = SDLFileError; GameMemory.PlatformAPI.AllocateMemory = SDLAllocateMemory; GameMemory.PlatformAPI.DeallocateMemory = SDLDeallocateMemory; GameMemory.PlatformAPI.DEBUGFreeFileMemory = DEBUGPlatformFreeFileMemory; GameMemory.PlatformAPI.DEBUGReadEntireFile = DEBUGPlatformReadEntireFile; GameMemory.PlatformAPI.DEBUGWriteEntireFile = DEBUGPlatformWriteEntireFile; SDLState.TotalSize = GameMemory.PermanentStorageSize + GameMemory.TransientStorageSize; SDLState.GameMemoryBlock = mmap(BaseAddress, (size_t) SDLState.TotalSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); GameMemory.PermanentStorage = SDLState.GameMemoryBlock; GameMemory.TransientStorage = ((uint8 *) GameMemory.PermanentStorage + GameMemory.PermanentStorageSize); if (!(GameMemory.PermanentStorage && GameMemory.TransientStorage)) { printf("Failed to allocate game memory\n"); return -1; } // TODO: Add game replay support here game_input Input[2] = {}; game_input *NewInput = &Input[0]; game_input *OldInput = &Input[1]; uint64 LastCounter = SDL_GetPerformanceCounter(); uint64 FlipWallClock = SDL_GetPerformanceCounter(); sdl_game_code Game = SDLLoadGameCode(SourceGameCodeDLLFullpath, TempGameCodeDLLFullpath, GameCodeLockFullpath); uint64 LastCycleCount = _rdtsc(); while (GlobalRunning) { NewInput->dtForFrame = TargetSecondsPerFrame; NewInput->ExecutableReloaded = false; time_t NewDLLWriteTime = SDLGetLastWriteTime(SourceGameCodeDLLFullpath); 
if (difftime(NewDLLWriteTime, Game.DLLLastWriteTime) > 0) { SDLCompleteAllWork(&HighPriorityQueue); SDLCompleteAllWork(&LowPriorityQueue); SDLUnloadGameCode(&Game); Game = SDLLoadGameCode(SourceGameCodeDLLFullpath, TempGameCodeDLLFullpath, GameCodeLockFullpath); NewInput->ExecutableReloaded = true; } game_controller_input *OldKeyboardController = GetController(OldInput, 0); game_controller_input *NewKeyboardController = GetController(NewInput, 0); *NewKeyboardController = {}; NewKeyboardController->IsConnected = true; for (size_t ButtonIndex = 0; ButtonIndex < ArrayCount(NewKeyboardController->Buttons); ++ButtonIndex) { NewKeyboardController->Buttons[ButtonIndex].EndedDown = OldKeyboardController->Buttons[ButtonIndex].EndedDown; } SDLProcessPendingMessage(&SDLState, NewKeyboardController); if (!GlobalPause) { Uint32 MouseButtons = SDL_GetMouseState(&NewInput->MouseX, &NewInput->MouseY); NewInput->MouseZ = 0; SDLProcessKeyboardMessage(&NewInput->MouseButtons[0], SDL_BUTTON(SDL_BUTTON_LEFT)); SDLProcessKeyboardMessage(&NewInput->MouseButtons[1], SDL_BUTTON(SDL_BUTTON_MIDDLE)); SDLProcessKeyboardMessage(&NewInput->MouseButtons[2], SDL_BUTTON(SDL_BUTTON_RIGHT)); SDLProcessKeyboardMessage(&NewInput->MouseButtons[3], SDL_BUTTON(SDL_BUTTON_X1)); SDLProcessKeyboardMessage(&NewInput->MouseButtons[4], SDL_BUTTON(SDL_BUTTON_X2)); // TODO: Handle Mouse button here // TODO: Game controller support here game_offscreen_buffer Buffer = {}; Buffer.Memory = GlobalBackBuffer.Memory; Buffer.Width = GlobalBackBuffer.Width; Buffer.Height = GlobalBackBuffer.Height; Buffer.Pitch = GlobalBackBuffer.Pitch; if (Game.UpdateAndRender) { Game.UpdateAndRender(&GameMemory, NewInput, &Buffer); HandleDebugCycleCounters(&GameMemory); } // TODO: Game audio support here game_sound_output_buffer SoundBuffer = {}; SoundBuffer.SamplesPerSecond = SoundOutput.SamplesPerSecond; SoundBuffer.SampleCount = Align8((u32)(SoundOutput.SamplesPerSecond * TargetSecondsPerFrame)); SoundBuffer.Samples = Samples; if 
(Game.GetSoundSamples) { Game.GetSoundSamples(&GameMemory, &SoundBuffer); SDL_QueueAudio(Audio, SoundBuffer.Samples, SoundBuffer.SampleCount * SoundOutput.BytesPerSample); } SDLDisplayBufferInWindow(&GlobalBackBuffer); game_input *Temp = NewInput; NewInput = OldInput; OldInput = Temp; } } return 0; }
int main(void) { uint64 PerfCountFrequency = SDL_GetPerformanceFrequency(); SDL_Event Event; SDL_Window *Window; SDL_Renderer *Renderer; if(SDL_Init(SDL_INIT_VIDEO | SDL_INIT_GAMECONTROLLER | SDL_INIT_HAPTIC) != 0) { fprintf(stderr, "Could not initialize SDL: %s\n", SDL_GetError()); return -1; } atexit(SDL_Quit); int WindowWidth = 1300; int WindowHeight = 870; int BytesPerPixel = 4; Window = SDL_CreateWindow("Echelon", 0, 0, WindowWidth, WindowHeight, SDL_WINDOW_RESIZABLE); if(Window) { Renderer = SDL_CreateRenderer(Window, -1, 0); if(Renderer) { GlobalRunning = true; window_dimensions Dimensions = SDLGetWindowDimensions(Window); SDLCreateNewTexture(&GlobalBuffer, Renderer, Dimensions.Width, Dimensions.Height, BytesPerPixel); uint64 LastCounter = SDL_GetPerformanceCounter(); uint64 LastCycleCount = _rdtsc(); real64 DebugTimer = 0; real64 FPSTimer = 0; real64 UpdateTimer = 0; uint32 FPS = 0; uint32 UPS = 0; keyboard_input KeyboardInput = {}; gamepad_input GamePadInput = {}; game_code Game = LoadGameCode(); game_memory GameMemory = {}; GameMemory.IsInitialized = false; GameMemory.PlayRumble = SDLPlayRumble; GameMemory.WindowDimensions = SDLGetWindowDimensions(Window); GameMemory.PlatformDrawRenderQueue = DrawRenderQueueStub; GameMemory.PermanentStorageSize = Megabytes(100); GameMemory.PermanentStorage = mmap(0, GameMemory.PermanentStorageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); GameMemory.TransientStorageSize = Gigabytes(2); GameMemory.TransientStorage = mmap(0, GameMemory.TransientStorageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); Game.GameInit(&GameMemory); while(GlobalRunning) { // NOTE(Redab): This needs to while loop because we need // to handle events as long as they are available. 
while(SDL_PollEvent(&Event)) { SDLHandleEvent(&Event, &Dimensions); SDLHandleUserInput(&Event, &Game, &GameMemory, &KeyboardInput, &GamePadInput); } uint64 EndCycleCount = _rdtsc(); uint64 EndCounter = SDL_GetPerformanceCounter(); uint64 CounterElapsed = EndCounter - LastCounter; uint64 CyclesElapsed = EndCycleCount - LastCycleCount; // NOTE(Redab): CounterElapsed Contains the number of // clock cycles since last check. So we need to divide // this by the number of cycles per second which we // have in PerCountFrequency. Multiplied by 1000 to // get milliseconds. real64 SecondsPerFrame = ((real64)CounterElapsed / (real64)PerfCountFrequency); real64 MSPerFrame = SecondsPerFrame * 1000.0f; real64 KCPF = ((real64)CyclesElapsed / (1000.0f)); FPSTimer += MSPerFrame; UpdateTimer += MSPerFrame; DebugTimer += MSPerFrame; if(UpdateTimer >= (1000.0f / 60.0f)) { GameMemory.WindowDimensions = Dimensions; Game.GameUpdate(&GameMemory, &KeyboardInput, &GamePadInput, UpdateTimer / 1000.0f); UPS++; UpdateTimer = 0; } if(FPSTimer >= (1000.0f / 60.0f)) { SDLGameRender(&GlobalBuffer, &Game, &GameMemory); SDLBlitFrameToWindow(&GlobalBuffer, Renderer); FPS++; FPSTimer = 0; } if(DebugTimer >= 1000.0f) { printf("%.05fms/f, FPS: %d, UPS: %d, %.02fKc/f, Timer: %.02f\n", MSPerFrame, FPS, UPS, KCPF, DebugTimer); FPS = 0; UPS = 0; DebugTimer = 0; } LastCycleCount = EndCycleCount; LastCounter = EndCounter; } } else { printf("Failed to create SDL_Renderer: %s\n", SDL_GetError()); } } else { printf("Failed to create SDL_Window: %s\n", SDL_GetError()); } SDL_CloseAudio(); SDL_Quit(); return 0; }
void setup_output() { char hdr[1024]; hdr[sizeof(hdr)-1] = 0; pmu_init(); pmu_start(); for (int i = 0; i < NGBOXES; ++i) for (int j = 0; j < 2; ++j) { prevnreads += pmu_rdctr(i, j, 0); prevnwrites += pmu_rdctr(i, j, 1); } double ibw = (gbl.ncores-1) * 2 * gbl.hz; labels[fInst].max = ibw; labels[fVPU].max = ibw; double vop = (gbl.ncores-1) * 8 * gbl.hz; labels[fVpuSP].max = 2*vop; labels[fVpuDP].max = vop; // order doesn't really matter but we're used to this and it's better // for Excel hdrLabels[0] = "Time"; hdrLabels[1] = "Threads"; hdrIndexes[0] = fTime; hdrIndexes[1] = fThreads; int j = 2; for (int i = 0; i < nfields; ++i) if (i != fTime && i != fThreads) { hdrLabels[j] = labels[i].name; hdrIndexes[j] = i; ++j; } hdr[0] = 0; hdr[sizeof(hdr)-1] = 0; bool first = true; for (int i = 0; i < nfields; ++i) { if (first) first = false; else strncat(hdr, ",", sizeof(hdr)-1); strncat(hdr, hdrLabels[i], sizeof(hdr)-1); } strncat(hdr, "\n", sizeof(hdr)-1); if (gbl.server) { setup_server(); } else if (gbl.outfile) { outfile = fopen(gbl.outfile, "w"); if (outfile == NULL) err(1, "create %s", gbl.outfile); } gbl.hdr[0] = 0; gbl.hdr[sizeof(gbl.hdr)-1] = 0; if (gbl.server) { for (int i = 0; i < nfields; ++i) if (i != fTime) snprintf(gbl.hdr+strlen(gbl.hdr), sizeof(gbl.hdr)-1, "%s=%g,%s,%g\n", labels[i].name, labels[i].max, labels[i].units, labels[i].factor); strncat(gbl.hdr, hdr, sizeof(gbl.hdr)-1); } else if (outfile) { for (int i = 0; i < nfields; ++i) if (i != fTime) fprintf(outfile, "%s=%g,%s,%g\n", labels[i].name, labels[i].max, labels[i].units, labels[i].factor); fprintf(outfile, hdr); } starttsc = _rdtsc(); }
} return; } static void diffusion_mic(REAL *restrict f1, REAL *restrict f2, int nx, int ny, int nz, REAL ce, REAL cw, REAL cn, REAL cs, REAL ct, REAL cb, REAL cc, REAL dt, int count) { unsigned long (*pmc1)[2], (*pmc2)[2]; unsigned long pmcs[2]; unsigned long tsc; int nthreads; tsc = _rdtsc(); #pragma omp parallel { REAL *f1_t = f1; REAL *f2_t = f2; int mythread; #if defined(PMU) #pragma omp master { nthreads = omp_get_num_threads(); #if defined(PMU) pmc1 = malloc(nthreads * sizeof(pmc1[0])); pmc2 = malloc(nthreads * sizeof(pmc1[0])); #endif
void main(int argc, char ** argv) { double *a, *b, *c, xx=0.01, bw, avg_bw, best_bw=-1.0; char * buf1, *buf2, *buf3; int i,j,k,offset_a=0,offset_b=0,offset_c=0, mult=1,iter=100, c_val; int len,mem_level, level_size[4], cpu, cpu_run, bytes_per,scale; unsigned long long start, stop, run_time, call_start, call_stop, call_run_time,total_bytes=0; __pid_t pid=0; int cpu_setsize; cpu_set_t mask; // process input arguments if(argc < 3 ){ printf("triad driver needs at least 3 arguments, cpu_init, cpu_run, cache_level, [call count multiplier def = 1], [offset a, offset_b, offset_c defaults = 0] \n"); printf(" argc = %d\n",argc); usage(); err(1, "bad arguments"); } len = L4; while ((c_val = getopt(argc, argv, "i:r:l:m:a:b:c")) != -1) { switch(c_val) { case 'i': cpu = atoi(optarg); break; case 'r': cpu_run = atoi(optarg); break; case 'l': mem_level = atoi(optarg); break; case 'm': mult = atoi(optarg); break; case 'a': offset_a = atoi(optarg); break; case 'b': offset_b = atoi(optarg); break; case 'c': offset_c = atoi(optarg); break; default: err(1, "unknown option %c", c_val); } } iter = iter*mult; // pin core affinity for initialization if(pin_cpu(pid, cpu) == -1) { err(1,"failed to set affinity"); } else{ fprintf(stderr," process pinned to core %d for initialization\n",cpu); } // set buffer sizes and loop tripcounts based on memory level level_size[0]=L1; level_size[1]=L2; level_size[2]=L3; level_size[3]=L4; fprintf(stderr, "len = %d, mem_level = %d, iter = %d, mult = %d\n",len, mem_level, iter,mult); len = level_size[mem_level]/32; fprintf(stderr, "len = %d, mem_level = %d, iter = %d, mult = %d\n",len, mem_level, iter,mult); scale = level_size[3]/(32*len); fprintf(stderr, "len = %d, mem_level = %d, iter = %d, mult = %d, scale = %d\n",len, mem_level, iter,mult,scale); iter =iter*scale*mult; fprintf(stderr, "len = %d, mem_level = %d, iter = %d, mult = %d\n",len, mem_level, iter,mult); // malloc and initialize buffers buf1 = malloc(sizeof(double)*len + 4096 + 1024); 
fprintf(stderr," buf1 = %p\n",buf1); buf1 = buf1 + (0x1000 - (unsigned int)buf1 & 0xFFF) + offset_a; fprintf(stderr," buf1 = %p\n",buf1); a = (double *) buf1; buf2 = malloc(sizeof(double)*len + 4096 + 1024); fprintf(stderr," buf2 = %p\n",buf2); buf2 = buf2 + (0x1000 - (unsigned int)buf2 & 0xFFF) + offset_b; fprintf(stderr," buf2 = %p\n",buf2); b = (double *) buf2; buf3 = malloc(sizeof(double)*len + 4096 + 1024); fprintf(stderr," buf3 = %p\n",buf3); buf3 = buf3 + (0x1000 - (unsigned int)buf3 & 0xFFF) + offset_c; fprintf(stderr," buf3 = %p\n",buf3); c = (double *) buf3; for(i=0;i<len;i++){ a[i] = 0.; b[i] = 10.; c[i] = 10.; } // pin core affinity for triad run if(pin_cpu(pid, cpu_run) == -1) { err(1,"failed to set affinity"); } else{ fprintf(stderr," process pinned to core %d for triad run\n",cpu_run); } // run the triad printf(" calling triad %d times with len = %d\n",iter,len); call_start = _rdtsc(); for(i=0;i<iter;i++){ start = _rdtsc(); bytes_per = triad(len,xx,a,b,c); stop = _rdtsc(); run_time = stop - start; xx+=0.01; total_bytes +=len*bytes_per; bw=(double)(len*bytes_per)/(double)run_time; if(bw > best_bw) best_bw = bw; } call_stop = _rdtsc(); call_run_time = call_stop - call_start; avg_bw=(double)(total_bytes)/(double)call_run_time; // printout printf(" transfering %lld bytes from memory level %d took %lld cycles/call and a total of %lld\n",total_bytes,mem_level,run_time,call_run_time); printf(" average bytes/cycle = %f\n", avg_bw); printf(" best bytes/cycle = %f\n",best_bw); }
//******************************************************* MAIN ******************************* int main(int argc, char* argv[]){ //set up sdl SDL_Init(SDL_INIT_VIDEO | SDL_INIT_AUDIO); SDL_Window *Window = SDL_CreateWindow("Handmade Hero", SDL_WINDOWPOS_UNDEFINED, SDL_WINDOWPOS_UNDEFINED, 640, 480, SDL_WINDOW_RESIZABLE); if(Window){ SDL_Renderer *Renderer = SDL_CreateRenderer(Window, -1, 0); if(Renderer){ // VIDEO sdl_offscreen_buffer Buffer = {}; // if non-initialized, declaring variables in a loop fails on SDLResizeTexture with a pointer error - im blaming the compiler sdl_window_dimension Dimension = SDLGetWindowDimension(Window); SDLResizeTexture(&Buffer, Renderer, Dimension.Width, Dimension.Height); keystates Keys = {}; //AUDIO sdl_sound_output sound_output = sdl_sound_outputH(48000); //open audio SDLInitAudio(sound_output.SamplesPerSecond, sound_output.SamplesPerSecond * sound_output.BytesPerSample / 60); SDL_PauseAudio(0); //BW: state area allocation void *new_state_area = malloc(128*1024*1024);//need ~2 MB at least; give it 128 MiB -- 2MB for video; dont know audio void *prev_state_area = malloc(1024);// should be sizeof(state0), or sizeof(biggest statetype) later //apparently we didnt have enough memory, but only crashed sometimes? 
this fixed it void *state; { uint8 *next_ptr = (uint8*)prev_state_area; anim_comp *animation = (anim_comp*)next_ptr; next_ptr += sizeof(anim_comp); init_anim_comp(animation, 0,0); state0 *stateptr = (state0*)next_ptr; printf("state0 size %ld\n", sizeof(state0)); uint64 stateptrn = (uint64)stateptr; uint64 sizeptr = (uint64)&(stateptr->size); uint64 deepc_ptr = (uint64)&(stateptr->deep_count); uint64 anim_ptr = (uint64)&(stateptr->animation); uint64 tsine_ptr = (uint64)&(stateptr->tSine); uint64 tvol_ptr = (uint64)&(stateptr->ToneVolume); uint64 thz_ptr = (uint64)&(stateptr->ToneHz); uint64 pu_ptr = (uint64)&(stateptr->pitch_up_was_pressed); printf("offset begin %lu\n", sizeptr - stateptrn); printf("width of size %lu\n", deepc_ptr - sizeptr); printf("width of deepc %lu\n", anim_ptr - deepc_ptr); printf("width of animpt %lu\n", tsine_ptr - anim_ptr); printf("width of tsine %lu\n", tvol_ptr - tsine_ptr); printf("width of tvol %lu\n", thz_ptr - tvol_ptr); printf("width of thz %lu\n", pu_ptr - thz_ptr); next_ptr += sizeof(state0); init_state0(stateptr, animation, 3000, 256, 0); state = stateptr; //return 0; } uint64 LastCounter = SDL_GetPerformanceCounter(); uint64 LastCycleCount = _rdtsc(); uint64 PerfCountFrequency = SDL_GetPerformanceFrequency(); bool running = true; //main loop printf("enter main event loop\n"); while(running){ ///////NP_UPDATE///////////// //event capturing event_return events = eventHandler(&Keys); if(events.shouldQuit) running = false; //setup for p state_window_info Wi = {}; Wi.Height = Buffer.Height; Wi.Width = Buffer.Width; Wi.Pitch = Buffer.Pitch; int TargetQueueBytes = sound_output.LatencySampleCount * sound_output.BytesPerSample; state_sound_info Si = {}; Si.BytesToGet = TargetQueueBytes - SDL_GetQueuedAudioSize(1); Si.BytesPerSample = sound_output.BytesPerSample; Si.SamplesPerSecond = sound_output.SamplesPerSecond; uint64 state_size = 0; uint64 vbuffer_size = Buffer.Height * Buffer.Pitch; uint64 abuffer_size = Si.BytesToGet; state_return 
next; { //in case(statetype) or similar next = P_update_state0(*(state0*)state, new_state_area, Keys, Wi, Si); //p should return state_size? and also, what statetype we are in -> later state_size = sizeof(state0); } //GARBAGE COLLECTOR //move this state to previous state //fmemcpy(prev_state_area, new_state_area, state_size); //shallow copy, as supposed //printf("hi\n"); deepcopy(prev_state_area, next.state, new_state_area, (uint8*)new_state_area + 128*1024*1024); //TODO(md): DEEPCPY //queue audio if (Si.BytesToGet > 0) SDL_QueueAudio(1, next.abuffer, abuffer_size); //render SDLUpdateWindow(Window, Renderer, Buffer, next.vbuffer); uint64 EndCycleCount = _rdtsc(); uint64 EndCounter = SDL_GetPerformanceCounter(); uint64 CounterElapsed = EndCounter - LastCounter; uint64 CyclesElapsed = EndCycleCount - LastCycleCount; real64 MSPerFrame = (((1000.0f * (real64)CounterElapsed) / (real64)PerfCountFrequency)); real64 FPS = (real64)PerfCountFrequency / (real64)CounterElapsed; real64 MCPF = ((real64)CyclesElapsed / (1000.0f * 1000.0f)); printf("%.02fms/f, %.02f/s, %.02fmc/f\n", MSPerFrame, FPS, MCPF); LastCycleCount = EndCycleCount; LastCounter = EndCounter; } } } //if(Renderer, Window) SDL_Quit(); return 0; }
int main(int argc, char ** argv) { char * buf1; void * ret; size_t * array, ret_val = 0; size_t array_stride; int i,j,k,cpu,cpu_run,line_count,stride, fd = -1; off_t offset = 0; int len=10240000, iter=100,mult=1,main_ret=0; double iterations; double *a, *b; size_t start, stop, run_time, call_start, call_stop, call_run_time,total_bytes=0; __pid_t pid=0; size_t buf_size,jj,zero_loop, buf_by_num_seg,ind; size_t num_pages, page_size, var_size; int cpu_setsize; cpu_set_t mask; // size_t pattern[] = {4,1,5,2,6,3,7,0}; int *pattern; int step, c; int* index, lc_by_num_seg,count, num_seg=32, huge=0; unsigned int bitmask, *intstar; page_size = 4096; // process input arguments if(argc < 6){ fprintf(stderr,"the random walker requires at least 6 arguments (only the 7th in the list below is optional), there were %d\n",argc); usage(); err(1,"insufficient invocation arguments"); } while ((c = getopt(argc, argv, "i:r:l:s:S:m:L")) != -1) { switch(c) { case 'i': cpu = atoi(optarg); break; case 'r': cpu_run = atoi(optarg); break; case 'l': line_count = atoi(optarg); break; case 's': stride = atoi(optarg); break; case 'S': num_seg = atoi(optarg); break; case 'm': mult = atoi(optarg); break; case 'L': huge=1; page_size = 2 * 1024 * 1024; break; default: err(1, "unknown option %c", c); } } iter = iter*mult; var_size = sizeof(size_t); fprintf(stderr, "size_t in %zd bytes\n",var_size); // pin core affinity if(pin_cpu(pid, cpu) == -1) { err(1,"failed to set affinity"); } else{ fprintf(stderr," process pinned to core %d\n",cpu); } pattern = (int*) malloc(num_seg*sizeof(int)); if(pattern == NULL) { fprintf(stderr," failed to malloc pattern for size = %d\n",num_seg); err(1,"malloc of pattern failed"); } // calculate stride and buffer size stride = page_size*stride + 64; buf_size = (size_t)line_count*(size_t)stride; num_pages = buf_size/page_size + 2; buf_size = page_size*num_pages; array_stride = stride/sizeof(double); iterations = (double)iter*(double)len; // create index array for "random" 
patterna index = (int*)malloc(line_count*sizeof(int)); if(index == NULL) { fprintf(stderr," failed to malloc index array for line_count of %d\n",line_count); err(1,"failed to malloc index"); } if(num_seg == 1) { for(i=0; i<line_count-1; i++)index[i] = i; } else { // fprintf(stderr," calling rndm_list, n = %d\n",num_seg); rndm_list(pattern,num_seg); lc_by_num_seg = line_count/num_seg; if(lc_by_num_seg*num_seg != line_count) { fprintf(stderr," line count must be a multiple of the fifth argument num_seg = %d\n", num_seg); err(1," bad line_count"); } count=0; buf_by_num_seg = buf_size/num_seg; for(i=0; i<lc_by_num_seg; i++) { step = 0; for(j=0;j<num_seg;j++) { count++; if(j == (num_seg-1) ) step = 1; ind = lc_by_num_seg*pattern[j]; index[count]= (int) ind + i + step; if(index[count] >= line_count) printf(" count = %d, index = %d\n",count,index[count]); } } } index[0] = 0; for(i=0; i<line_count; i++)index[i] = index[i]*array_stride; // malloc and initialize buffers // replace malloc call with a call to mmap if(huge == 0) buf1 = (char*) mmap(NULL,buf_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON , fd, offset); if(huge == 1) buf1 = (char*) mmap(NULL,buf_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON | MAP_HUGETLB , fd, offset); if(buf1 == MAP_FAILED) { fprintf(stderr,"mmap failed\n"); err(1,"mmap failed"); } fprintf(stderr," buf1 for a = %p\n",buf1); a = (double*) buf1; if(huge == 0) buf1 = (char*) mmap(NULL,buf_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON , fd, offset); if(huge == 1) buf1 = (char*) mmap(NULL,buf_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON | MAP_HUGETLB , fd, offset); if(buf1 == MAP_FAILED) { fprintf(stderr,"mmap failed\n"); err(1,"mmap failed"); } fprintf(stderr," buf1 for b = %p\n",buf1); b = (double*)buf1; zero_loop = buf_size/sizeof(double); fprintf(stderr, " buf_size = %zu, zero_loop = %zu, array_stride = %zd\n",buf_size,zero_loop,array_stride); for(i=0; i<zero_loop; i++) a[i] = 0; for(i=0; i<zero_loop; i++) b[i] = 0; 
fprintf(stderr," finished zeroing buf for a, b\n"); // pin core affinity if(pin_cpu(pid, cpu_run) == -1) { err(1,"cannot set cpu run affinity"); } else{ printf(" process pinned to core %d to run\n",cpu_run); } // run the walker printf(" calling walker %d times which loops %d times on buffer of %d lines with a stride of %d, for a total size of %zu\n",iter,len,line_count,stride,buf_size); call_start = _rdtsc(); for(i=0;i<iter;i++){ start = _rdtsc(); ret_val = reader(len,line_count,a,b,index); // fprintf(stderr, " retval = %ld\n",ret_val); stop = _rdtsc(); run_time = stop - start; } call_stop = _rdtsc(); call_run_time = call_stop - call_start; printf(" run time = %zd\n",call_run_time); // printout printf(" average cycles per iteration = %f\n", (double)call_run_time/iterations); return main_ret; }