Пример #1
0
/*
 * Sampling loop of the monitor.
 *
 * After passing the start barrier, each iteration reports the counters
 * together with the number of TSC ticks since the previous iteration
 * restarted the clock, then (unless gbl.server is set) polls the child
 * with a non-blocking waitpid(), and finally calls timeout().
 *
 * pid      - child process polled with waitpid(); only used when
 *            gbl.server is not set
 * jointime - receives the gettimeofday() time at which waitpid() first
 *            returned a non-zero value
 *
 * Returns the wait status written by waitpid(), or -1 if waitpid() never
 * stored one (status starts at -1).  When gbl.server is set there is no
 * exit path out of the loop in this function.
 */
int
monitor(int pid, struct timeval *jointime)
{
    int status = -1;
    uint64_t start, t;

    pthread_barrier_wait(&barrier);
    start = _rdtsc();
    for (;;)
    {
        // We're not measuring the bootstrap core. Discourage users from
        // using the last core.

        t = _rdtsc();
        printcounters(counters, t - start);
        if (!gbl.server)
        {
            // Non-zero means either the child changed state (> 0) or
            // waitpid() itself failed (-1); sampling stops in both cases.
            if (waitpid(pid, &status, WNOHANG))
            {
                gettimeofday(jointime, 0);
                done = 1;
                break;
            }
        }
        // Restart the interval clock only after reporting, so the time
        // spent in printcounters()/waitpid() is not part of the next
        // reported duration.
        start = _rdtsc();
        timeout();
    }
    close_output();
    return status;
}
Пример #2
0
/*
 * Benchmark driver for t3(): fills a 512x512 float matrix, times a single
 * call to t3() with the time-stamp counter, and prints the elapsed
 * giga-cycles together with a small checksum over the first four rows.
 */
int
main(void) {
	float A[512][512] __attribute__ ((aligned(16)));
	for (int i = 0 ; i < 512 ; i ++) {
		for (int j = 0 ; j < 512 ; j ++) {
			A[i][j] = 0.1/(i+j+1);
		}
	}

	unsigned long long start_c, end_c, diff_c;
	start_c = _rdtsc();

	t3(A);

	end_c=_rdtsc();
	diff_c = end_c - start_c;
	float giga_cycle = diff_c / 1000000000.0;

	/* Consume part of the result so the call to t3() has a visible effect. */
	float ret = 0.;
	for (int i = 0 ; i < 4 ; i ++) {
		for (int j = 0 ; j < 512 ; j += 2) {
			ret += A[i][j] - A[i][j+1];
		}
	}
	printf("t3 took %f giga cycles and the result is: %f\n", giga_cycle, ret);
	return 0;
}
Пример #3
0
/*
 * Benchmark driver for t6(): allocates two 16-byte aligned float arrays of
 * LEN6 elements, times a single call to t6() with the time-stamp counter,
 * and prints the elapsed giga-cycles plus the sum of A as a checksum.
 *
 * Returns 0 on success, 1 if either buffer could not be allocated.
 */
int main(){
   float* A = (float*) memalign(16, LEN6*sizeof(float));
   float* D = (float*) memalign(16, LEN6*sizeof(float));

   /* memalign() returns a null pointer on failure; do not write through it. */
   if (!A || !D) {
      fprintf(stderr, "t6: memalign failed\n");
      free(A);
      free(D);
      return 1;
   }

   for (int i = 0 ; i < LEN6 ; i ++) {
      A[i] = (float)(i)/(float)LEN6;
      D[i] = (float)(i+3)/(float)LEN6;
   }

   unsigned long long start_c, end_c, diff_c;
   start_c = _rdtsc();

   t6(A,D);

   end_c =_rdtsc();
   diff_c = end_c - start_c;
   float giga_cycle = diff_c / 1000000000.0;

   /* Scalar checksum; the pragma asks the compiler not to vectorise it. */
   float ttt = (float)0.;
   #pragma novector
   for (int i = 0 ; i < LEN6 ; i ++) {
      ttt += A[i];
   }
   printf("t6 took\t %.2f and the result is %f\n", giga_cycle, ttt);

   free(A);
   free(D);
   return 0;
}
Пример #4
0
/*
 * Benchmark driver for t5(): allocates five 16-byte aligned float arrays of
 * LEN5 elements, times a single call to t5() with the time-stamp counter,
 * and prints the elapsed giga-cycles plus the sum of A as a checksum.
 *
 * Returns 0 on success, 1 if any buffer could not be allocated.
 */
int main(){
	float* A = (float*) memalign(16, LEN5*sizeof(float));
	float* B = (float*) memalign(16, LEN5*sizeof(float));
	float* C = (float*) memalign(16, LEN5*sizeof(float));
	float* D = (float*) memalign(16, LEN5*sizeof(float));
	float* E = (float*) memalign(16, LEN5*sizeof(float));
	int rc = 1;

	/* memalign() returns a null pointer on failure; do not write through it. */
	if (!A || !B || !C || !D || !E) {
		fprintf(stderr, "t5: memalign failed\n");
		goto out;
	}

	for (int i = 0; i < LEN5; i++){
		A[i] = (float)(i)/(float)LEN5;
		B[i] = (float)(i+1)/(float)LEN5;
		C[i] = (float)(i+2)/(float)LEN5;
		D[i] = (float)(i+3)/(float)LEN5;
		E[i] = (float)(i+4)/(float)LEN5;
	}

	unsigned long long start_c, end_c, diff_c;
	start_c = _rdtsc();

	t5(A,B,C,D,E);

	end_c=_rdtsc();
	diff_c = end_c - start_c;
	float giga_cycle = diff_c / 1000000000.0;

	/* Scalar checksum; the pragma asks the compiler not to vectorise it. */
	float ttt = (float)0.;
	#pragma novector
	for (int i = 0; i < LEN5; i++)
		ttt += A[i];
	printf("t5 took\t %.2f and the result is %f\n", giga_cycle, ttt);
	rc = 0;

out:
	/* free(NULL) is a no-op, so a partial allocation is released safely. */
	free(A);
	free(B);
	free(C);
	free(D);
	free(E);
	return rc;
}
Пример #5
0
/*
 * Build one `struct metrics` sample from the per-CPU counter snapshots in
 * `ctrs` and from the pmu_rdctr() counters, then hand it to sample().
 *
 * ctrs     - per-CPU counter array, indexed by cpu; read through a
 *            volatile pointer because another thread updates it
 * duration - length of the sampling interval, in the same units as the
 *            clocks1 counter delta it is compared against
 *
 * Side effects: updates lastctr[], sevents[], prevnreads and prevnwrites.
 */
void
printcounters(struct counter *ctrs, uint64_t duration)
{
    struct metrics s = {0};

    s.timestamp = _rdtsc();
    s.duration = duration;
    // We skip the last core
    // (the loop below covers cpu 1 .. gbl.ncpus-4 inclusive).
    int corethreads =0;
    for (int cpu = 1; cpu < gbl.ncpus-3; ++cpu)
    {
        double delta[NEVENTS];
        // volatile because another thread is changing it.
        volatile struct counter *p = &ctrs[cpu];

        for (int i = 0; i < NEVENTS; ++i)
        {
            // The union lets the 512-bit vector that was loaded in one
            // operation be read back as eight 64-bit values.
            union {
                __m512d c;
                uint64_t values[8];
            } t;
            // Snapshot the live counter, compute the delta against the
            // previous snapshot, then store the new snapshot in lastctr.
            t.c = _mm512_load_pd((void *)&p->counts[i][0]);
            delta[i] = perf_scale_delta(t.values, lastctr[cpu].counts[i]);
            _mm512_storenrngo_pd((void *)&lastctr[cpu].counts[i][0], t.c);
            // Negative deltas are clamped to zero before being accumulated.
            if (delta[i] < 0)
                delta[i] = 0;
            sevents[i] += delta[i];
        }

        // A CPU counts as an active thread when its clocks1 delta exceeds
        // half of the interval.
        if (2*delta[clocks1] > duration)
        {
            s.nthreads += 1;
            corethreads += 1;
        }

        // NOTE(review): this treats every cpu index divisible by 4 as the
        // end of a core, i.e. presumably 4 hardware threads per core with
        // cpus 1..4 on the first core - confirm against the topology.
        if ((cpu % 4) == 0) // Last thread on this core
        {
            if (corethreads)
                s.ncores += 1;
            corethreads = 0;
        }

        s.vpu_ea += delta[vpu_ea];
        s.instrs += delta[instrs];
        s.vinstrs += delta[vpu_ie];
    }
    // Sum the read (selector 0) and write (selector 1) counters over
    // NGBOXES x 2 pmu_rdctr() sources.
    uint64_t nreads = 0, nwrites = 0;
    for (int i = 0; i < NGBOXES; ++i)
        for (int j = 0; j < 2; ++j)
        {
            nreads += pmu_rdctr(i, j, 0);
            nwrites += pmu_rdctr(i, j, 1);
        }
    // Delta since the previous call, scaled by 64
    // (presumably bytes per counted transfer - TODO confirm).
    s.rbytes = (nreads - prevnreads) * 64;
    s.wbytes = (nwrites - prevnwrites)* 64;
    prevnreads = nreads;
    prevnwrites = nwrites;

    sample(&s);
}
Пример #6
0
//--------------------------------------------------------------------------------------
// FUNCTION:  RCCE_wtime
//--------------------------------------------------------------------------------------
// Returns a time value as a double.
// With SCC defined: the time-stamp counter divided by (RC_REFCLOCKGHZ * 1e9),
// i.e. seconds, assuming RC_REFCLOCKGHZ is the counter frequency in GHz
// (TODO confirm).  Otherwise: the value of omp_get_wtime().
//--------------------------------------------------------------------------------------
double RCCE_wtime(void) {
#ifdef SCC
  return ( ((double)_rdtsc())/(RC_REFCLOCKGHZ*1.e9));
#else
  return (omp_get_wtime());
#endif
}
Пример #7
0
// Gets the raw frequency of the CPU.
//
// precision - number of measurements to average (values below 1 are
//             treated as 1 so the average is always well defined)
// tcks      - tick count to wait for in each measurement, clamped to 2..18
//
// Returns a pointer to the global cpuFreq, which holds the averaged result.
// The function busy-waits on the global `ticks`, so `ticks` must be
// advanced asynchronously (presumably by a timer interrupt - confirm).
double* getFrequency(int precision, int tcks) {
	int it = 0, n = precision;
	// 64-bit so the cycle delta cannot be truncated where long is 32 bits.
	unsigned long long counter = 0;

	// We check limits...
	if (tcks > 18)
		tcks = 18;
	if (tcks < 2)
		tcks = 2;
	if (n < 1)
		n = 1;

	// Startup variables
	double cpuFreqs;
	cpuFreq = 0;

	// Iterate for elements.
	for (it = 0; it < n; it++) {
		ticks = 0;
		int oldticks = 1;
		// Wait until the tick counter reaches 1, so that the cycle
		// counter is sampled right at a tick boundary.
		while (ticks != oldticks)
			;
		counter = _rdtsc();
		// Wait until the requested number of ticks has been reached.
		while (ticks < tcks)
			;
		counter = _rdtsc() - counter;
		// Normalizes to MHz: cycles divided by (ticks - 1) intervals
		// of 54925.40115 each.
		cpuFreqs = counter / ((ticks - 1) * 54925.40115);
		cpuFreq += cpuFreqs;
		// The very first measurement ever taken is discarded and redone.
		if (!fix_flag)
		{
			fix_flag++;
			cpuFreq = 0;
			it--;
		}
	}

	// Average if needed.
	cpuFreq /= n;
	return &cpuFreq;
}
Пример #8
0
Файл: main.c Проект: jeez/qmsi
/*
 * Busy-wait for a fixed delay while sampling both clocks.
 *
 * The RTC counter value read before the delay is stored in *rtc_start and
 * the value read after it in *rtc_end; the return value is the number of
 * TSC ticks elapsed since the TSC was sampled just after the RTC start
 * read.  Comparing the two lets the caller correlate TSC and RTC.
 *
 * A sample is retaken when the end value is lower than the start value,
 * unless the top four bits of the start value are all set (presumably a
 * genuine counter wrap-around - confirm).
 *
 * `count` is not used by this function.
 */
static uint64_t spin_loop(unsigned int count, unsigned int *rtc_start,
			  unsigned int *rtc_end)
{
	uint64_t tsc_before;
	int must_retry;

	do {
		*rtc_start = QM_RTC[QM_RTC_0].rtc_ccvr;
		tsc_before = _rdtsc();
		clk_sys_udelay(400);
		*rtc_end = QM_RTC[QM_RTC_0].rtc_ccvr;

		must_retry = (*rtc_end < *rtc_start) &&
			     ((*rtc_start & 0xF0000000) != 0xF0000000);
	} while (must_retry);

	return _rdtsc() - tsc_before;
}
Пример #9
0
void
printcounters(struct counter *ctrs, uint64_t duration)
{
    struct metrics s = {0};

    uint64_t thisBytesWritten = pcm->bytesWritten();
    uint64_t thisBytesRead = pcm->bytesRead();
    memset(threadspercore, 0, gbl.ncores * sizeof(int));
    s.timestamp = _rdtsc();
    s.duration = duration;
    for (int cpu = 0; cpu < gbl.ncpus; ++cpu)
    {
        double delta[NEVENTS];
        // volatile because another thread is changing it.
        volatile struct counter *p = &ctrs[cpu];

        for (int i = 0; i < NEVENTS; ++i)
        {
            union {
                __m256d c;
                uint64_t values[4];
            } t;
            t.c = _mm256_load_pd((const double *)&p->counts[i][0]);
            delta[i] = perf_scale_delta(t.values, lastctr[cpu].counts[i]);
            _mm256_store_pd((double *)&lastctr[cpu].counts[i][0], t.c);
            if (delta[i] < 0)
                delta[i] = 0;
            sevents[i] += delta[i];
        }

        //printf("clocks %g duration %lu\n", delta[clocks], duration);
        if (2*delta[clocks] > duration)
        {
            int thiscore = pcm->getSocketId(cpu)  * gbl.corespersocket +
                pcm->getCoreId(cpu);
            ++s.nthreads;
            ++threadspercore[thiscore];
        }
        s.dsimd += delta[simd_dp];
        s.dsse += delta[sse_dp];
        s.dscalar += delta[scalar_dp];
        s.ssimd += delta[simd_sp];
        s.ssse += delta[sse_sp];
        s.sscalar += delta[scalar_sp];
        s.instrs += delta[instrs];
    }
    s.rbytes = thisBytesRead - lastBytesRead;
    s.wbytes = thisBytesWritten - lastBytesWritten;
    lastBytesRead = thisBytesRead;
    lastBytesWritten = thisBytesWritten;
    for (int i = 0; i < gbl.ncores; ++i)
        if (threadspercore[i])
            ++s.ncores;

    sample(&s);
}
Пример #10
0
/*
 * Start a timing interval.
 *
 * Allocates a slot holding the current time-stamp counter value and
 * returns it; the caller passes it to _perf_end(), which frees it.
 * Returns NULL if the allocation fails, in which case nothing is timed.
 */
unsigned __int64* _perf_start(void)
{
    unsigned __int64* stime;
#pragma omp critical
  {
    stime = malloc(sizeof(*stime));
    /* Only write the timestamp when the allocation succeeded. */
    if (stime)
      *stime = _rdtsc();
  }
  return stime;
}
Пример #11
0
/*
 * Finish a timing interval.
 *
 * stime - slot holding the start time-stamp counter value; it is freed
 *         here.  A NULL slot is ignored.
 * index - entry of perfsum[]/perfcount[] to which the elapsed ticks and
 *         one sample are added (inside an OpenMP critical section).
 */
void _perf_end(unsigned __int64 *stime, int index)
{
  if (!stime)
    return;

  /* Take the end timestamp before waiting for the critical section. */
  *stime = _rdtsc() - *stime;
#pragma omp critical
  {
	perfsum[index] += (double)(*stime);
	perfcount[index] ++;
  }
  free(stime);
}
Пример #12
0
/*
 * Calibrate the TSC-based timer.
 *
 * Measures the number of TSC ticks elapsed across a one-second sleep, five
 * times, and stores the average number of ticks per millisecond in
 * RDTSC_CLOCK.  RDTSC_BEGINTICK is then set to the current TSC value.
 * The whole calibration therefore takes about five seconds.
 */
static void rdtsc_calibrate(){
	uint64 t1, t2;
	int32 i;
	
	ShowStatus("Calibrating Timer Source, please wait... ");
	
	RDTSC_CLOCK = 0;
	
	for(i = 0; i < 5; i++){
		t1 = _rdtsc();
		// One second.  sleep() is used because POSIX lets usleep() fail
		// with EINVAL for intervals of one million microseconds or more.
		sleep(1);
		t2 = _rdtsc();
		RDTSC_CLOCK += (t2 - t1) / 1000; // ticks per millisecond
	}
	RDTSC_CLOCK /= 5;
	
	RDTSC_BEGINTICK = _rdtsc();	
	
	ShowMessage(" done. (Frequency: %u Mhz)\n", (uint32)(RDTSC_CLOCK/1000) );
}
Пример #13
0
void * driver0(void * arg)
{
	int i,j,k, iter_count =0;
	uint64_t line_count=0, init_tsc, end_tsc;
	size_t * read_pntr;

	read_pntr = array;
// pin core affinity
	if(pin_cpu(pid, cpu_read) == -1) {
		err(1,"cannot set cpu read affinity");
		}
	else{
		printf(" read thread pinned to core %d to run\n",cpu_read);
		}
	fprintf(stderr,"total_lines = %ld\n",total_lines);

	read_sum_tsc = 0;
	while(line_count < total_lines)
		{
		i = 0;
		while(exchange_flag == 0)
			{
			i++;
			}
		init_tsc = _rdtsc();
//		if(iter_count < 10)fprintf(stderr,"reader calling kernel\n");
		read_pntr = read_buf(seg_size, read_pntr);
//		if(iter_count < 10)fprintf(stderr,"reader returned from kernel\n");
		end_tsc = _rdtsc();
		read_sum_tsc += (end_tsc - init_tsc);
		line_count += seg_size;
		iter_count++;
		exchange_flag = 0;
		}
	fprintf(stderr," from read thread, line_count = %ld, TSC sum = %lu, latency = %g\n",
			line_count, read_sum_tsc,(double)read_sum_tsc/(double)line_count);
	pthread_exit(NULL);
}
Пример #14
0
/*
 * Benchmark driver for t1(): allocates two 16-byte aligned arrays of 1024
 * floats, times a single call to t1() with the time-stamp counter, and
 * prints the elapsed giga-cycles plus the sum of A as a checksum.
 *
 * Returns 0 on success, 1 if either buffer could not be allocated.
 */
int main() {
	float* A = (float*) _mm_malloc(1024*sizeof(float), 16);
	float* B = (float*) _mm_malloc(1024*sizeof(float), 16);

	/* _mm_malloc() returns a null pointer on failure. */
	if (!A || !B) {
		fprintf(stderr, "t1: _mm_malloc failed\n");
		if (A)
			_mm_free(A);
		if (B)
			_mm_free(B);
		return 1;
	}

	for (int i = 0 ; i < 1024 ; i ++){
		A[i] = 1. / (i+1);
		B[i] = 2. / (i+1);
	}

	unsigned long long start_c, end_c, diff_c;
	start_c = _rdtsc();

	t1(A,B);

	end_c =_rdtsc();
	diff_c = end_c - start_c;
	float giga_cycle = diff_c / 1000000000.0;
	float ret = 0;

	for (int i = 0; i < 1024; i ++) {
		ret += A[i];
	}
	printf("t1 took %f giga cycles and the result is: %f\n", giga_cycle, ret);

	/* Memory from _mm_malloc() must be released with _mm_free(). */
	_mm_free(A);
	_mm_free(B);
	return 0;
}
Пример #15
0
/*
 * Consumer thread body.
 *
 * Forever: takes a value with GET(), samples the TSC, prints the
 * difference divided by 3000000000.0 together with GETSIZE(), then sleeps
 * for read_sleep microseconds.  The thread id is carried in `ev` itself.
 * The loop has no exit, so the statements after it are never reached.
 */
void * reader(void * ev)
{
  int tid = (int) ev;
  __int64 now;
  for (;;) {
    __int64 stamp = GET();
    double delta;

    now = _rdtsc();
    delta = ((double) (now - stamp)) / (double) 3000000000.0;
    printf("%d: Got delta %f, size=%d\n",tid, delta,GETSIZE());
    usleep(read_sleep);
  }
  printf("Thread exiting\n");
  return(NULL);
}
Пример #16
0
/*
 * Claim the next slot of the global ring buffer and stamp it.
 *
 * The slot index is advanced with a compare-and-swap retry loop, wrapping
 * at kMaxRBEntries.  The claimed entry gets the low 32 bits of the
 * time-stamp counter, the given type and the low 32 bits of the calling
 * thread's pthread id, and is returned to the caller.
 */
RingBufferEntry*
allocEntry(RingBufferType t) {
  ASSERT(Util::isPowerOfTwo(kMaxRBEntries));
  RingBufferEntry* entry;
  for (;;) {
    int claimed = g_ringIdx;
    int next = (claimed + 1) % kMaxRBEntries;
    entry = &g_ring[claimed];
    // Retry if another thread moved the index in the meantime.
    if (atomic_cas(&g_ringIdx, claimed, next)) {
      break;
    }
  }
  entry->m_ts = uint32_t(_rdtsc());
  entry->m_type = t;
  entry->m_threadId = (uint32_t)((int64)pthread_self() & 0xFFFFFFFF);
  return entry;
}
Пример #17
0
/*
 * Current time as an unsigned int, truncated to the width of that type.
 *
 * Source, chosen at compile time:
 *   - PLATFORM_WINDOWS: GetTickCount()
 *   - ENABLE_RDTSC:     TSC ticks since RDTSC_BEGINTICK divided by
 *                       RDTSC_CLOCK (milliseconds if RDTSC_CLOCK is ticks
 *                       per millisecond - confirm at its definition)
 *   - POSIX timers:     CLOCK_MONOTONIC, in milliseconds
 *   - otherwise:        gettimeofday(), in milliseconds
 */
unsigned int getnowtime()
{
    unsigned int now_ms;

#if defined PLATFORM_WINDOWS
    now_ms = GetTickCount();
#elif defined(ENABLE_RDTSC)
    now_ms = (unsigned int)((_rdtsc() - RDTSC_BEGINTICK) / RDTSC_CLOCK);
#elif (defined(_POSIX_TIMERS) && _POSIX_TIMERS > 0 && defined(_POSIX_MONOTONIC_CLOCK) /* posix compliant */) || (defined(__FreeBSD_cc_version) && __FreeBSD_cc_version >= 500005 /* FreeBSD >= 5.1.0 */)
    struct timespec mono;
    clock_gettime(CLOCK_MONOTONIC, &mono);
    now_ms = mono.tv_sec * 1000 + mono.tv_nsec / 1000000;
#else
    struct timeval wall;
    gettimeofday(&wall, NULL);
    now_ms = wall.tv_sec * 1000 + wall.tv_usec / 1000;
#endif

    return now_ms;
}
Пример #18
0
/*
 * Current time as a 64-bit value.
 *
 * Source, chosen at compile time:
 *   - PLATFORM_WINDOWS: time(NULL) seconds * 1000 plus the millisecond
 *                       field of GetLocalTime()
 *   - ENABLE_RDTSC:     TSC ticks since RDTSC_BEGINTICK divided by
 *                       RDTSC_CLOCK (milliseconds if RDTSC_CLOCK is ticks
 *                       per millisecond - confirm at its definition)
 *   - POSIX timers:     CLOCK_MONOTONIC, in milliseconds
 *   - otherwise:        gettimeofday(), in milliseconds
 *
 * Every branch computes in 64 bits so the result is not truncated to the
 * width of unsigned int or of a 32-bit time_t.
 */
int64_t
ox_getnowtime(void)
{
#if defined PLATFORM_WINDOWS
    int64_t second = time(NULL);
    SYSTEMTIME sys;
    GetLocalTime(&sys);
    return second*1000 + sys.wMilliseconds;
#elif defined(ENABLE_RDTSC)
    return (int64_t)((_rdtsc() - RDTSC_BEGINTICK) / RDTSC_CLOCK);
#elif (defined(_POSIX_TIMERS) && _POSIX_TIMERS > 0 && defined(_POSIX_MONOTONIC_CLOCK) /* posix compliant */) || (defined(__FreeBSD_cc_version) && __FreeBSD_cc_version >= 500005 /* FreeBSD >= 5.1.0 */)
    struct timespec tval;
    clock_gettime(CLOCK_MONOTONIC, &tval);
    return (int64_t)tval.tv_sec * 1000 + tval.tv_nsec / 1000000;
#else
    struct timeval tval;
    gettimeofday(&tval, NULL);
    return (int64_t)tval.tv_sec * 1000 + tval.tv_usec / 1000;
#endif
}
Пример #19
0
/*
 * Producer thread body.
 *
 * After an initial one-second sleep it performs `iter` iterations; each
 * one sleeps for write_sleep microseconds with probability 1/sleep_frac,
 * prints the current GETSIZE() and PUT()s the current TSC value.  When
 * done it decrements num_write inside the cond_begin/cond_end section and
 * signals done_cond if it was the last writer.
 *
 * The thread id is carried in `ev` itself.  Always returns NULL.
 */
void * writer(void * ev)
{
  int tid = (int) ev;
  int i;
  sleep (1);
  for (i = 0; i < iter; i++) {
    __int64 val;
    /* A sleep_frac of 0 would divide by zero; treat it as "never sleep". */
    if (sleep_frac != 0 && (rand() % sleep_frac) == 0) {
      usleep(write_sleep);
    }

    val = _rdtsc();
    printf("%d:%d Put size=%d\n",tid, i, GETSIZE());

    PUT(val);
  }
  cond_begin;
  num_write --;
  if (num_write  == 0) {
    cond_event_tm_signal(&done_cond);
  }
  cond_end;
  return(NULL);
}
Пример #20
0
/*
 * Prepare the output channel and emit the header.
 *
 * Records the current pcm byte totals, allocates threadspercore, fills in
 * the per-field maxima, builds the column order (Time, Threads, then every
 * other field) and the comma-separated header line.  Then:
 *   - server mode (gbl.server): calls setup_server() and stores the field
 *     descriptions followed by the header line in gbl.hdr;
 *   - file mode (gbl.outfile): opens the file (exits via err() on failure)
 *     and writes the field descriptions followed by the header line.
 * Finally records the starting TSC value in starttsc.
 */
void
setup_output()
{
    char hdr[1024];
    hdr[sizeof(hdr)-1] = 0;
    hdr[0] = 0;

    lastBytesRead = pcm->bytesRead();
    lastBytesWritten = pcm->bytesWritten();
    threadspercore = new int[gbl.ncores];
    labels[fThreads].max = gbl.ncpus;
    labels[fInst].max = gbl.ncores * 4 * gbl.hz;
    labels[fFlopsSP].max = gbl.ncores * 8 * 2 * gbl.hz;
    labels[fFlopsDP].max = gbl.ncores * 4 * 2 * gbl.hz;

    // Time and Threads always come first; the rest follow in field order.
    hdrLabels[0] = "Time";
    hdrLabels[1] = "Threads";
    hdrIndexes[0] = fTime;
    hdrIndexes[1] = fThreads;
    int j = 2;
    for (int i = 0; i < nfields; ++i)
        if (i != fTime && i != fThreads)
        {
            hdrLabels[j] = labels[i].name;
            hdrIndexes[j] = i;
            ++j;
        }

    // strncat's size argument is the room left in the destination, not
    // its total size, so it is recomputed before every append.
    bool first = true;
    for (int i = 0; i < nfields; ++i)
    {
        if (first)
            first = false;
        else
            strncat(hdr, ",", sizeof(hdr) - strlen(hdr) - 1);
        strncat(hdr, hdrLabels[i], sizeof(hdr) - strlen(hdr) - 1);
    }
    strncat(hdr, "\n", sizeof(hdr) - strlen(hdr) - 1);

    if (gbl.server)
    {
        setup_server();
    }
    else if (gbl.outfile)
    {
        outfile = fopen(gbl.outfile, "w");
        if (outfile == NULL)
            err(1, "create %s", gbl.outfile);
    }

    gbl.hdr[0] = 0;
    gbl.hdr[sizeof(gbl.hdr)-1] = 0;
    if (gbl.server)
    {
        for (int i = 0; i < nfields; ++i)
            if (i != fTime)
            {
                // Bound each write by the space remaining after what is
                // already stored in gbl.hdr.
                size_t used = strlen(gbl.hdr);
                snprintf(gbl.hdr + used, sizeof(gbl.hdr) - used,
                    "%s=%g,%s,%g\n",
                    labels[i].name,
                    labels[i].max,
                    labels[i].units,
                    labels[i].factor);
            }
        strncat(gbl.hdr, hdr, sizeof(gbl.hdr) - strlen(gbl.hdr) - 1);
    }
    else if (outfile)
    {
        for (int i = 0; i < nfields; ++i)
            if (i != fTime)
                fprintf(outfile, "%s=%g,%s,%g\n",
                    labels[i].name,
                    labels[i].max,
                    labels[i].units,
                    labels[i].factor);
        // hdr is data, not a format string: a '%' in a label must be
        // written verbatim.
        fputs(hdr, outfile);
    }

    starttsc = _rdtsc();
}
Пример #21
0
/*
 * Monte Carlo European option pricing benchmark (single precision).
 *
 * Every thread of the parallel region prices OPT_PER_THREAD options with
 * RAND_N Gaussian samples each (drawn in blocks of RAND_BLOCK_LENGTH),
 * then validates its results against BlackScholesBodyCPU().  Thread 0
 * takes the wall-clock and TSC timestamps around the pricing phase, which
 * is delimited by barriers.  Error statistics are combined with OpenMP
 * reductions and printed together with the timing at the end.
 *
 * argc and argv are not used.  Always returns 0.
 */
int main(int argc, char* argv[])
{
    // Written by thread 0 inside the parallel region, read afterwards.
    double sTime = 0.0, eTime = 0.0;

    double sum_delta  = 0.0;
    double sum_ref    = 0.0;
    double max_delta  = 0.0;
    double sumReserve = 0.0;

    // Without OpenMP the parallel region runs on a single thread; set this
    // before the banner so the reported thread count is the one used.
#ifndef _OPENMP
    NTHREADS = 1;
#endif

    printf("Monte Carlo European Option Pricing Single Precision\n\n");
    printf("Compiler Version  = %d\n", __INTEL_COMPILER/100);
    printf("Release Update    = %d\n", __INTEL_COMPILER_UPDATE);
    printf("Build Time        = %s %s\n", __DATE__, __TIME__);
    printf("Path Length       = %d\n", RAND_N);
    printf("Number of Options = %d\n", OPT_N);
    printf("Block Size        = %d\n", RAND_BLOCK_LENGTH);
    printf("Worker Threads    = %d\n\n", NTHREADS);

    const int mem_size  = sizeof(float)*OPT_PER_THREAD;

    float *samples[MAX_THREADS];
    VSLStreamStatePtr Streams[MAX_THREADS];
    const int nblocks = RAND_N/RAND_BLOCK_LENGTH;
#pragma omp parallel reduction(+ : sum_delta) reduction(+ : sum_ref) reduction(+ : sumReserve) reduction(max : max_delta)
{
#ifdef _OPENMP
    int threadID = omp_get_thread_num();
#else
    int threadID = 0;
#endif
    // Per-thread inputs: each thread seeds its own generator and owns its
    // own option arrays, sample buffer and random stream.
    unsigned int randseed = RANDSEED + threadID;
    srand(randseed);
    float *CallResultList     = (float *)scalable_aligned_malloc(mem_size, SIMDALIGN);
    float *CallConfidenceList = (float *)scalable_aligned_malloc(mem_size, SIMDALIGN);
    float *StockPriceList     = (float *)scalable_aligned_malloc(mem_size, SIMDALIGN);
    float *OptionStrikeList   = (float *)scalable_aligned_malloc(mem_size, SIMDALIGN);
    float *OptionYearsList    = (float *)scalable_aligned_malloc(mem_size, SIMDALIGN);
    for(int i = 0; i < OPT_PER_THREAD; i++)
    {
        CallResultList[i]     = 0.0f;
        CallConfidenceList[i] = 0.0f;
        StockPriceList[i]     = RandFloat_T(5.0f, 50.0f, &randseed);
        OptionStrikeList[i]   = RandFloat_T(10.0f, 25.0f, &randseed);
        OptionYearsList[i]    = RandFloat_T(1.0f, 5.0f, &randseed);
    }
    samples[threadID] = (float *)scalable_aligned_malloc(RAND_BLOCK_LENGTH * sizeof(float), SIMDALIGN);
    vslNewStream(&(Streams[threadID]), VSL_BRNG_MT2203 + threadID, RANDSEED);

    // All threads finish their setup before the clock starts.
#pragma omp barrier
    if (threadID == 0)
    {
        printf("Starting options pricing...\n");
        sTime = second();
        start_cyc = _rdtsc();
    }

    for(int opt = 0; opt < OPT_PER_THREAD; opt++)
    {
        const float VBySqrtT = VLog2E * sqrtf(OptionYearsList[opt]);
        const float MuByT    = MuLog2E * OptionYearsList[opt];
        const float Y        = StockPriceList[opt];
        const float Z        = OptionStrikeList[opt];

        // v0 accumulates the payoffs, v1 their squares.
        float v0 = 0.0f;
        float v1 = 0.0f;
        for(int block = 0; block < nblocks; ++block)
        {
            float *rand = samples[threadID];
            vsRngGaussian (VSL_RNG_METHOD_GAUSSIAN_ICDF, Streams[threadID], RAND_BLOCK_LENGTH, rand, MuByT, VBySqrtT); 
#pragma vector aligned
#pragma simd reduction(+:v0) reduction(+:v1)
#pragma unroll(4)
            for(int i=0; i < RAND_BLOCK_LENGTH; i++) 
            {
                float callValue  = Y * exp2f(rand[i]) - Z;
                callValue = (callValue > 0.0) ? callValue : 0.0;
                v0 += callValue;
                v1 += callValue * callValue;
            }
        }
        const float  exprt      = exp2f(RLog2E*OptionYearsList[opt]);
        CallResultList[opt]     = exprt * v0 * INV_RAND_N;
        const float  stdDev     = sqrtf((F_RAND_N * v1 - v0 * v0) * STDDEV_DENOM);
        CallConfidenceList[opt] = (float)(exprt * stdDev * CONFIDENCE_DENOM);
    } //end of opt 

    // All threads finish pricing before the clock stops.
#pragma omp barrier
    if (threadID == 0) {
        end_cyc = _rdtsc();
        eTime = second();
        printf("Parallel simulation completed in %f seconds.\n", eTime-sTime);
        printf("Validating the result...\n");
    }

    // Validation: compare each simulated price with the reference value.
    double delta = 0.0, ref = 0.0;
    double max_local  = 0.0;
    for(int i = 0; i < OPT_PER_THREAD; i++)
    {
        double callReference, putReference;
        BlackScholesBodyCPU(
            callReference,
            putReference,
            StockPriceList[i],
            OptionStrikeList[i], OptionYearsList[i],  RISKFREE, VOLATILITY );
        ref   = callReference;
        delta = fabs(callReference - CallResultList[i]);
        sum_delta += delta;
        sum_ref   += fabs(ref);
        // Skip near-zero errors so the ratio below stays finite.
        if(delta > 1e-6)
             sumReserve += CallConfidenceList[i] / delta;
        max_local = delta>max_local? delta: max_local;
    }
    max_delta = max_local>max_delta? max_local: max_delta;
    vslDeleteStream(&(Streams[threadID]));
    scalable_aligned_free(samples[threadID]);
    scalable_aligned_free(CallResultList);
    scalable_aligned_free(CallConfidenceList);
    scalable_aligned_free(StockPriceList);
    scalable_aligned_free(OptionStrikeList);
    scalable_aligned_free(OptionYearsList);
}//end of parallel block

    sumReserve          /= (double)OPT_N;
    const double L1norm  = sum_delta / sum_ref;

    printf("L1_Norm          = %4.3E\n", L1norm);
    printf("Average RESERVE  = %4.3f\n", sumReserve);
    printf("Max Error        = %4.3E\n", max_delta);

    const unsigned long long cyc       = end_cyc - start_cyc;
    const double             optcyc    = (double)cyc/(double)OPT_N;

    printf("==========================================\n");
    // cyc is unsigned, so it is printed with %llu.
    printf("Total Cycles = %llu\n", cyc);
    printf("Cyc/opt      = %8.3f\n", optcyc);
    printf("Time Elapsed = %8.3f\n", eTime-sTime);
    printf("Options/sec  = %8.3f\n", OPT_N/(eTime-sTime));
    printf("==========================================\n");
    return 0;
}
Пример #22
0
/*
 * Micro-benchmark comparing bzero() with my_bzero().
 *
 * Each function is run NSAMPLES times on a stack buffer and on a heap
 * buffer, first with SHORTBUF bytes and then with LONGBUF bytes; the TSC
 * ticks spent in every loop are printed.  Exits with EXIT_FAILURE if a
 * heap buffer cannot be allocated, otherwise returns 0.
 */
int main(void){
	/* stack buffers */
	char sbuf1[SHORTBUF];
	char sbuf2[LONGBUF];

	/* heap buffers */
	char *hbuf1 = (char *)malloc(SHORTBUF);
	char *hbuf2 = (char *)malloc(LONGBUF);

	uint64_t cycles;
	int i;

	if(hbuf1 == NULL || hbuf2 == NULL){
		fprintf(stderr, "malloc failed\n");
		exit(EXIT_FAILURE);
	}

/* Time NSAMPLES calls of FN(BUF, LEN) and report the ticks using FMT. */
#define TIME_ZEROING(FMT, FN, BUF, LEN)              \
	do {                                             \
		cycles = _rdtsc();                           \
		for(i = NSAMPLES; i > 0; i--){               \
			FN(BUF, LEN);                            \
		}                                            \
		cycles = _rdtsc() - cycles;                  \
		printf(FMT, LEN, cycles);                    \
	} while(0)

	/* one untimed call first, just to have libc's bzero resolved */
	bzero(sbuf1, SHORTBUF);

	/* ------ short buffers ------ */
	TIME_ZEROING("[STACK] [%d] bzero:     %" PRIu64 " cycles\n", bzero,    sbuf1, SHORTBUF);
	TIME_ZEROING("[HEAP]  [%d] bzero:     %" PRIu64 " cycles\n", bzero,    hbuf1, SHORTBUF);
	TIME_ZEROING("[STACK] [%d] my_bzero:  %" PRIu64 " cycles\n", my_bzero, sbuf1, SHORTBUF);
	TIME_ZEROING("[HEAP]  [%d] my_bzero:  %" PRIu64 " cycles\n", my_bzero, hbuf1, SHORTBUF);

	puts("");

	/* ------ long buffers ------ */
	TIME_ZEROING("[STACK] [%d] bzero:    %" PRIu64 " cycles\n", bzero,    sbuf2, LONGBUF);
	TIME_ZEROING("[HEAP]  [%d] bzero:    %" PRIu64 " cycles\n", bzero,    hbuf2, LONGBUF);
	TIME_ZEROING("[STACK] [%d] my_bzero: %" PRIu64 " cycles\n", my_bzero, sbuf2, LONGBUF);
	TIME_ZEROING("[HEAP]  [%d] my_bzero: %" PRIu64 " cycles\n", my_bzero, hbuf2, LONGBUF);

#undef TIME_ZEROING

	free(hbuf1);
	free(hbuf2);

	hbuf1 = NULL;
	hbuf2 = NULL;

	return 0;
}
Пример #23
0
/*
 * Times repeated pin_cpu() calls with the time-stamp counter.
 *
 * Options handled: -r <cpu> (core to pin to, default 0) and -m <mult>
 * (multiplier applied to the base count of 1000 iterations, default 1).
 * The other letters accepted by the option string have no handler and end
 * the program through err().
 *
 * Every variable read below is initialised; num_pages and len have no
 * option that sets them and therefore stay 0.
 *
 * Returns 0 on completion; argument and affinity errors exit through err().
 */
int
main(int argc, char ** argv)
{
	int i, mult=1, iter=1000, c_val;
	int len=0, num_pages=0, num_lines, cpu_run=0;
	u64 start, stop, run_time=0, call_start, call_stop, call_run_time;
	__pid_t pid=0;
	size_t buf_size;

//	process input arguments

	if(argc < 3 ){
		printf("affinity needs 2 arguments, cpu_run, call count multiplier  def = 1\n");
		printf(" argc = %d\n",argc);
		usage();
		err(1, "bad arguments");
		}

	while ((c_val = getopt(argc, argv, "i:r:l:m:a:b:c")) != -1) {
		switch(c_val) {
		case 'r':
			cpu_run = atoi(optarg);
			break;
		case 'm':
			mult = atoi(optarg);
			break;
		default:
			err(1, "unknown option %c", c_val);
		}
	}

// pin core affinity for initialization
	if(pin_cpu(pid, cpu_run) == -1) {
		err(1,"failed to set affinity");
		}
	else{
		fprintf(stderr," process pinned to core %d for triad run\n",cpu_run);
		}

// set buffer sizes and loop tripcount
	buf_size = (u64)4096*(u64)num_pages;
	num_lines=64*num_pages;
	iter = iter*mult;

// timed loop

	printf(" starting malloc loop of %d iterations with buf_size = %zu, num_lines = %d\n",iter,buf_size, num_lines);
	call_start = _rdtsc();
	for(i=0;i<iter;i++){
		start = _rdtsc();
		if(pin_cpu(pid, cpu_run) == -1) {
			err(1,"failed to set affinity");
			}
		else{
			fprintf(stderr," process pinned to core %d for triad run\n",cpu_run);
			}
		stop = _rdtsc();
		run_time = stop - start;
		}
	call_stop = _rdtsc();
	call_run_time = call_stop - call_start;
	(void)call_run_time;
//  printout
//  NOTE(review): run_time holds only the last iteration; call_run_time
//  (the whole loop) is computed but not reported - confirm which one the
//  message is meant to show.
	printf(" allocating %llu bytes and initializing and freeing took %llu cycles\n",
		(unsigned long long)((u64)len*(u64)iter),(unsigned long long)run_time);
	return 0;
}
Пример #24
0
int main(int argc, char *argv[]) {
    sdl_state SDLState = {};

    platform_work_queue HighPriorityQueue = {};
    SDLMakeQueue(&HighPriorityQueue, 6);

    platform_work_queue LowPriorityQueue = {};
    SDLMakeQueue(&LowPriorityQueue, 2);


    GlobalPerfCountFrequency = SDL_GetPerformanceFrequency();

    SDLGetEXEFileName(&SDLState);

    char SourceGameCodeDLLFullpath[SDL_STATE_FILE_NAME_COUNT];
    SDLBuildEXEPathFileName(&SDLState, "handmade.dylib",
                            sizeof(SourceGameCodeDLLFullpath), SourceGameCodeDLLFullpath);

    char TempGameCodeDLLFullpath[SDL_STATE_FILE_NAME_COUNT];
    SDLBuildEXEPathFileName(&SDLState, "handmade_temp.dylib",
                            sizeof(TempGameCodeDLLFullpath), TempGameCodeDLLFullpath);

    char GameCodeLockFullpath[SDL_STATE_FILE_NAME_COUNT];
    SDLBuildEXEPathFileName(&SDLState, "lock.tmp",
                            sizeof(GameCodeLockFullpath), GameCodeLockFullpath);

    if (SDL_Init(SDL_INIT_VIDEO | SDL_INIT_AUDIO) != 0) {
        printf("Failed to initialize SDL: %s\n", SDL_GetError());
        return -1;
    }

    SDL_Window *Window = SDL_CreateWindow("Handmade Hero",
                                          SDL_WINDOWPOS_CENTERED,
                                          SDL_WINDOWPOS_CENTERED,
                                          960, 540,
                                          SDL_WINDOW_OPENGL);
    if (!Window) {
        printf("Failed to create window: %s\n", SDL_GetError());
        return -1;
    }

    SDLResizeDIBSection(Window, &GlobalBackBuffer, 960, 540);

    // TODO: Set GameUpdateHz by monitor refresh HZ
    real32 GameUpdateHz = 60.0f;
    real32 TargetSecondsPerFrame = 1.0f / GameUpdateHz;

    sdl_sound_output SoundOutput = {};
    SoundOutput.SamplesPerSecond = 48000;
    SoundOutput.BytesPerSample = sizeof(int16) * 2;
    SoundOutput.BufferSize = SoundOutput.SamplesPerSecond * SoundOutput.BytesPerSample;

    SDL_AudioDeviceID Audio = SDLInitSound(SoundOutput.SamplesPerSecond);

    u32 MaxPossibleOverrun = 2 * 4 * sizeof(u16);
    int16 *Samples = (int16 *)mmap(0, (size_t)(SoundOutput.BufferSize + MaxPossibleOverrun),
                                   PROT_READ | PROT_WRITE,
                                   MAP_PRIVATE | MAP_ANON, -1, 0);

    GlobalRunning = true;

#if HANDMADE_INTERNAL
    void *BaseAddress = (void *)Terabytes(2);
#else
    void *BaseAddress = 0;
#endif

    game_memory GameMemory = {};
    GameMemory.PermanentStorageSize = Megabytes(64);
    GameMemory.TransientStorageSize = Gigabytes(256);
    GameMemory.HighPriorityQueue = &HighPriorityQueue;
    GameMemory.LowPriorityQueue = &LowPriorityQueue;

    GameMemory.PlatformAPI.AddEntry = SDLAddEntry;
    GameMemory.PlatformAPI.CompleteAllWork = SDLCompleteAllWork;

    GameMemory.PlatformAPI.GetAllFilesOfTypeBegin = SDLGetAllFilesOfTypeBegin;
    GameMemory.PlatformAPI.GetAllFilesOfTypeEnd = SDLGetAllFilesOfTypeEnd;
    GameMemory.PlatformAPI.OpenNextFile = SDLOpenNextFile;
    GameMemory.PlatformAPI.ReadDataFromFile = SDLReadDataFromFile;
    GameMemory.PlatformAPI.FileError = SDLFileError;

    GameMemory.PlatformAPI.AllocateMemory = SDLAllocateMemory;
    GameMemory.PlatformAPI.DeallocateMemory = SDLDeallocateMemory;

    GameMemory.PlatformAPI.DEBUGFreeFileMemory = DEBUGPlatformFreeFileMemory;
    GameMemory.PlatformAPI.DEBUGReadEntireFile = DEBUGPlatformReadEntireFile;
    GameMemory.PlatformAPI.DEBUGWriteEntireFile = DEBUGPlatformWriteEntireFile;

    SDLState.TotalSize = GameMemory.PermanentStorageSize + GameMemory.TransientStorageSize;

    SDLState.GameMemoryBlock = mmap(BaseAddress,
                                    (size_t) SDLState.TotalSize,
                                    PROT_READ | PROT_WRITE,
                                    MAP_PRIVATE | MAP_ANON, -1, 0);
    GameMemory.PermanentStorage = SDLState.GameMemoryBlock;
    GameMemory.TransientStorage = ((uint8 *) GameMemory.PermanentStorage + GameMemory.PermanentStorageSize);

    if (!(GameMemory.PermanentStorage && GameMemory.TransientStorage)) {
        printf("Failed to allocate game memory\n");
        return -1;
    }

    // TODO: Add game replay support here

    game_input Input[2] = {};
    game_input *NewInput = &Input[0];
    game_input *OldInput = &Input[1];

    uint64 LastCounter = SDL_GetPerformanceCounter();
    uint64 FlipWallClock = SDL_GetPerformanceCounter();

    sdl_game_code Game = SDLLoadGameCode(SourceGameCodeDLLFullpath,
                                         TempGameCodeDLLFullpath,
                                         GameCodeLockFullpath);

    uint64 LastCycleCount = _rdtsc();

    while (GlobalRunning) {
        NewInput->dtForFrame = TargetSecondsPerFrame;

        NewInput->ExecutableReloaded = false;
        time_t NewDLLWriteTime = SDLGetLastWriteTime(SourceGameCodeDLLFullpath);
        if (difftime(NewDLLWriteTime, Game.DLLLastWriteTime) > 0) {
            SDLCompleteAllWork(&HighPriorityQueue);
            SDLCompleteAllWork(&LowPriorityQueue);

            SDLUnloadGameCode(&Game);
            Game = SDLLoadGameCode(SourceGameCodeDLLFullpath,
                                   TempGameCodeDLLFullpath,
                                   GameCodeLockFullpath);
            NewInput->ExecutableReloaded = true;
        }

        game_controller_input *OldKeyboardController = GetController(OldInput, 0);
        game_controller_input *NewKeyboardController = GetController(NewInput, 0);
        *NewKeyboardController = {};
        NewKeyboardController->IsConnected = true;
        for (size_t ButtonIndex = 0; ButtonIndex < ArrayCount(NewKeyboardController->Buttons); ++ButtonIndex) {
            NewKeyboardController->Buttons[ButtonIndex].EndedDown = OldKeyboardController->Buttons[ButtonIndex].EndedDown;
        }

        SDLProcessPendingMessage(&SDLState, NewKeyboardController);

        if (!GlobalPause) {
            Uint32 MouseButtons = SDL_GetMouseState(&NewInput->MouseX, &NewInput->MouseY);
            NewInput->MouseZ = 0;
            SDLProcessKeyboardMessage(&NewInput->MouseButtons[0],
                                      SDL_BUTTON(SDL_BUTTON_LEFT));
            SDLProcessKeyboardMessage(&NewInput->MouseButtons[1],
                                      SDL_BUTTON(SDL_BUTTON_MIDDLE));
            SDLProcessKeyboardMessage(&NewInput->MouseButtons[2],
                                      SDL_BUTTON(SDL_BUTTON_RIGHT));
            SDLProcessKeyboardMessage(&NewInput->MouseButtons[3],
                                      SDL_BUTTON(SDL_BUTTON_X1));
            SDLProcessKeyboardMessage(&NewInput->MouseButtons[4],
                                      SDL_BUTTON(SDL_BUTTON_X2));

            // TODO: Handle Mouse button here

            // TODO: Game controller support here

            game_offscreen_buffer Buffer = {};
            Buffer.Memory = GlobalBackBuffer.Memory;
            Buffer.Width = GlobalBackBuffer.Width;
            Buffer.Height = GlobalBackBuffer.Height;
            Buffer.Pitch = GlobalBackBuffer.Pitch;

            if (Game.UpdateAndRender) {
                Game.UpdateAndRender(&GameMemory, NewInput, &Buffer);
                HandleDebugCycleCounters(&GameMemory);
            }

            // TODO: Game audio support here
            game_sound_output_buffer SoundBuffer = {};
            SoundBuffer.SamplesPerSecond = SoundOutput.SamplesPerSecond;
            SoundBuffer.SampleCount = Align8((u32)(SoundOutput.SamplesPerSecond * TargetSecondsPerFrame));
            SoundBuffer.Samples = Samples;
            if (Game.GetSoundSamples) {
                Game.GetSoundSamples(&GameMemory, &SoundBuffer);
                SDL_QueueAudio(Audio, SoundBuffer.Samples, SoundBuffer.SampleCount * SoundOutput.BytesPerSample);
            }

            SDLDisplayBufferInWindow(&GlobalBackBuffer);

            game_input *Temp = NewInput;
            NewInput = OldInput;
            OldInput = Temp;
        }
    }

    return 0;
}
Пример #25
0
/*
 * Entry point of the SDL platform layer.
 *
 * Initializes SDL, creates the window, renderer and back-buffer texture,
 * loads the game code, maps the permanent and transient game memory and
 * then loops while GlobalRunning is set: pending SDL events are drained,
 * the elapsed time since the previous iteration is measured, and the game
 * update and render calls are each issued at most once per 1/60 s.
 * A timing summary is printed roughly once per second.
 *
 * Returns -1 when SDL cannot be initialized or game memory cannot be
 * mapped, 0 otherwise (including window/renderer creation failures, which
 * are only reported).
 */
int
main(void) {
    uint64 PerfCountFrequency = SDL_GetPerformanceFrequency();
    SDL_Event Event;
    SDL_Window *Window;
    SDL_Renderer *Renderer;

    if(SDL_Init(SDL_INIT_VIDEO | SDL_INIT_GAMECONTROLLER | SDL_INIT_HAPTIC) != 0) {
        fprintf(stderr, "Could not initialize SDL: %s\n", SDL_GetError());
        return -1;
    }

    // SDL_Quit runs on every exit path from here on, including early returns.
    atexit(SDL_Quit);

    int WindowWidth = 1300;
    int WindowHeight = 870;
    int BytesPerPixel = 4;

    Window = SDL_CreateWindow("Echelon",
                              0, 0,
                              WindowWidth, WindowHeight,
                              SDL_WINDOW_RESIZABLE);
    
    if(Window) {
        Renderer = SDL_CreateRenderer(Window, -1, 0);

        if(Renderer) {
            GlobalRunning = true;
            window_dimensions Dimensions = SDLGetWindowDimensions(Window);
            SDLCreateNewTexture(&GlobalBuffer,
                                Renderer,
                                Dimensions.Width, Dimensions.Height,
                                BytesPerPixel);

            uint64 LastCounter = SDL_GetPerformanceCounter();
            uint64 LastCycleCount = _rdtsc();

            // All three timers accumulate elapsed milliseconds.
            real64 DebugTimer = 0;
            real64 FPSTimer = 0;
            real64 UpdateTimer = 0;

            // Frames rendered / updates executed since the last debug print.
            uint32 FPS = 0;
            uint32 UPS = 0;
            
            keyboard_input KeyboardInput = {};
            gamepad_input GamePadInput = {};

            game_code Game = LoadGameCode();

            game_memory GameMemory = {};
            GameMemory.IsInitialized = false;
            GameMemory.PlayRumble = SDLPlayRumble;
            GameMemory.WindowDimensions = SDLGetWindowDimensions(Window);
            GameMemory.PlatformDrawRenderQueue = DrawRenderQueueStub;
            GameMemory.PermanentStorageSize = Megabytes(100);
            GameMemory.PermanentStorage = mmap(0,
                                               GameMemory.PermanentStorageSize,
                                               PROT_READ | PROT_WRITE,
                                               MAP_PRIVATE | MAP_ANONYMOUS,
                                               -1, 0);
            
            GameMemory.TransientStorageSize = Gigabytes(2);
            GameMemory.TransientStorage = mmap(0,
                                               GameMemory.TransientStorageSize,
                                               PROT_READ | PROT_WRITE,
                                               MAP_PRIVATE | MAP_ANONYMOUS,
                                               -1, 0);

            // mmap reports failure with MAP_FAILED, not NULL; handing such a
            // pointer to the game would crash on first access.
            if(GameMemory.PermanentStorage == MAP_FAILED ||
               GameMemory.TransientStorage == MAP_FAILED) {
                fprintf(stderr, "Failed to allocate game memory\n");
                return -1;
            }

            Game.GameInit(&GameMemory);
            
            while(GlobalRunning) {
                // NOTE(Redab): This needs to while loop because we need
                // to handle events as long as they are available.
                while(SDL_PollEvent(&Event)) {
                    SDLHandleEvent(&Event, &Dimensions);
                    SDLHandleUserInput(&Event, &Game, &GameMemory, &KeyboardInput, &GamePadInput);
                }

                uint64 EndCycleCount = _rdtsc();
                uint64 EndCounter = SDL_GetPerformanceCounter();
                uint64 CounterElapsed = EndCounter - LastCounter;
                uint64 CyclesElapsed = EndCycleCount - LastCycleCount;

                // NOTE(Redab): CounterElapsed contains the number of
                // performance-counter ticks since the last check. Dividing
                // by the ticks per second held in PerfCountFrequency gives
                // seconds; multiplying by 1000 gives milliseconds.

                real64 SecondsPerFrame = ((real64)CounterElapsed / (real64)PerfCountFrequency);
                real64 MSPerFrame = SecondsPerFrame * 1000.0f;
                
                // Thousands of TSC cycles spent in this loop iteration.
                real64 KCPF = ((real64)CyclesElapsed / (1000.0f));

                FPSTimer += MSPerFrame;
                UpdateTimer += MSPerFrame;
                DebugTimer += MSPerFrame;

                if(UpdateTimer >= (1000.0f / 60.0f)) {
                    GameMemory.WindowDimensions = Dimensions;                        
                    // The game receives the accumulated time in seconds.
                    Game.GameUpdate(&GameMemory, &KeyboardInput, &GamePadInput, UpdateTimer / 1000.0f);

                    UPS++;
                    UpdateTimer = 0;
                }

                if(FPSTimer >= (1000.0f / 60.0f)) {
                    SDLGameRender(&GlobalBuffer, &Game, &GameMemory);
                    SDLBlitFrameToWindow(&GlobalBuffer, Renderer);
                    
                    FPS++;
                    FPSTimer = 0;
                }
                
                if(DebugTimer >= 1000.0f) {
                    // FPS and UPS are unsigned, so print them with %u.
                    printf("%.05fms/f, FPS: %u, UPS: %u, %.02fKc/f, Timer: %.02f\n",
                           MSPerFrame, FPS, UPS, KCPF, DebugTimer);

                    FPS = 0;
                    UPS = 0;
                    DebugTimer = 0;
                }

                LastCycleCount = EndCycleCount;
                LastCounter = EndCounter;
            }
        } else {
            printf("Failed to create SDL_Renderer: %s\n", SDL_GetError());
        }
    } else {
        printf("Failed to create SDL_Window: %s\n", SDL_GetError());
    }

    SDL_CloseAudio();
    SDL_Quit();
    return 0;
}
Пример #26
0
/*
 * Start the PMU, seed the running read/write counter totals, fill in the
 * per-field maxima and build the CSV column header. Then open the
 * configured sink (the server, or the file named by gbl.outfile) and
 * either store the field descriptions plus header in gbl.hdr (server mode)
 * or write them to the output file. Finally records the current timestamp
 * in starttsc.
 *
 * Exits through err() if the output file cannot be created.
 */
void
setup_output()
{
    char hdr[1024];

    pmu_init();
    pmu_start();
    // Seed the totals with the current counter values
    // (presumably the baseline for later deltas -- confirm against the reader).
    for (int i = 0; i < NGBOXES; ++i)
        for (int j = 0; j < 2; ++j)
        {
            prevnreads += pmu_rdctr(i, j, 0);
            prevnwrites += pmu_rdctr(i, j, 1);
        }

    // Maxima scale with (ncores-1) and the clock rate; one core is left out
    // (presumably the bootstrap core, which is not measured -- confirm).
    double ibw = (gbl.ncores-1) * 2 * gbl.hz;
    labels[fInst].max = ibw;
    labels[fVPU].max = ibw;
    double vop = (gbl.ncores-1) * 8 * gbl.hz;
    labels[fVpuSP].max = 2*vop;
    labels[fVpuDP].max = vop;

    // order doesn't really matter but we're used to this and it's better
    // for Excel
    hdrLabels[0] = "Time";
    hdrLabels[1] = "Threads";
    hdrIndexes[0] = fTime;
    hdrIndexes[1] = fThreads;
    int j = 2;
    for (int i = 0; i < nfields; ++i)
        if (i != fTime && i != fThreads)
        {
            hdrLabels[j] = labels[i].name;
            hdrIndexes[j] = i;
            ++j;
        }

    // Build the comma-separated header line. The third argument of strncat
    // is the number of characters that may still be appended, so it must
    // shrink as hdr fills up (and leave room for the terminator).
    hdr[0] = 0;
    bool first = true;
    for (int i = 0; i < nfields; ++i)
    {
        if (first)
            first = false;
        else
            strncat(hdr, ",", sizeof(hdr) - strlen(hdr) - 1);
        strncat(hdr, hdrLabels[i], sizeof(hdr) - strlen(hdr) - 1);
    }
    strncat(hdr, "\n", sizeof(hdr) - strlen(hdr) - 1);

    if (gbl.server)
    {
        setup_server();
    }
    else if (gbl.outfile)
    {
        outfile = fopen(gbl.outfile, "w");
        if (outfile == NULL)
            err(1, "create %s", gbl.outfile);
    }

    gbl.hdr[0] = 0;
    gbl.hdr[sizeof(gbl.hdr)-1] = 0;
    if (gbl.server)
    {
        // One "name=max,units,factor" line per field except the time field,
        // appended to gbl.hdr without writing past its end.
        for (int i = 0; i < nfields; ++i)
            if (i != fTime)
            {
                size_t used = strlen(gbl.hdr);
                if (used < sizeof(gbl.hdr) - 1)
                    snprintf(gbl.hdr + used, sizeof(gbl.hdr) - used,
                        "%s=%g,%s,%g\n",
                        labels[i].name,
                        labels[i].max,
                        labels[i].units,
                        labels[i].factor);
            }
        strncat(gbl.hdr, hdr, sizeof(gbl.hdr) - strlen(gbl.hdr) - 1);
    }
    else if (outfile)
    {
        for (int i = 0; i < nfields; ++i)
            if (i != fTime)
                fprintf(outfile, "%s=%g,%s,%g\n",
                    labels[i].name,
                    labels[i].max,
                    labels[i].units,
                    labels[i].factor);
        // hdr is data, not a format string: a '%' in a label must be
        // written verbatim.
        fputs(hdr, outfile);
    }
    starttsc = _rdtsc();
}
Пример #27
0
  }

  return;
}


static void
diffusion_mic(REAL *restrict f1, REAL *restrict f2, int nx, int ny, int nz,
              REAL ce, REAL cw, REAL cn, REAL cs, REAL ct,
              REAL cb, REAL cc, REAL dt, int count) {

  unsigned long (*pmc1)[2], (*pmc2)[2];
  unsigned long pmcs[2];
  unsigned long tsc;
  int nthreads;
  tsc = _rdtsc();
#pragma omp parallel
  {
    REAL *f1_t = f1;
    REAL *f2_t = f2;
    int mythread;

#if defined(PMU)
#pragma omp master
    {
    nthreads = omp_get_num_threads();
#if defined(PMU)
    pmc1 = malloc(nthreads * sizeof(pmc1[0]));
    pmc2 = malloc(nthreads * sizeof(pmc1[0]));
#endif
Пример #28
0
/*
 * Triad bandwidth driver.
 *
 * Options:
 *   -i cpu     core the process is pinned to while the buffers are set up
 *   -r cpu     core the process is pinned to while triad() runs
 *   -l level   index (0..3) into the L1/L2/L3/L4 size table
 *   -m mult    call-count multiplier (default 1)
 *   -a/-b/-c   byte offsets (0..1024) added to the page-aligned start of
 *              the three buffers
 *
 * Calls triad() `iter` times, timing each call and the whole loop with
 * _rdtsc(), and prints total, average and best bytes per cycle.
 * Returns 0 on success; argument and allocation errors exit through
 * err()/errx().
 */
int
main(int argc, char ** argv)
{
	double *a, *b, *c, xx=0.01, bw, avg_bw, best_bw=-1.0;
	char *buf1, *buf2, *buf3;
	int i, offset_a=0, offset_b=0, offset_c=0, mult=1, iter=100, c_val;
	int len, mem_level=-1, level_size[4], cpu=-1, cpu_run=-1, bytes_per, scale;
	unsigned long long start, stop, run_time=0, call_start, call_stop, call_run_time, total_bytes=0;
	__pid_t pid=0;
	size_t buf_bytes;

//	process input arguments

	if(argc < 3 ){
		printf("triad driver needs at least 3 arguments, cpu_init, cpu_run, cache_level, [call count multiplier  def = 1], [offset a, offset_b, offset_c  defaults = 0] \n");
		printf(" argc = %d\n",argc);
		usage();
		err(1, "bad arguments");
	}

	len = L4;
	// Every option takes a value; "c" needs its colon too, otherwise optarg
	// is NULL when -c is parsed.
	while ((c_val = getopt(argc, argv, "i:r:l:m:a:b:c:")) != -1) {
		switch(c_val) {
		case 'i':
			cpu = atoi(optarg);
			break;
		case 'r':
			cpu_run = atoi(optarg);
			break;
		case 'l':
			mem_level = atoi(optarg);
			break;
		case 'm':
			mult = atoi(optarg);
			break;
		case 'a':
			offset_a = atoi(optarg);
			break;
		case 'b':
			offset_b = atoi(optarg);
			break;
		case 'c':
			offset_c = atoi(optarg);
			break;
		default:
			err(1, "unknown option %c", c_val);
		}
	}

	// The three leading options are mandatory; without them the values
	// below would be read uninitialized.
	if(cpu < 0 || cpu_run < 0){
		usage();
		errx(1, "cpu_init (-i) and cpu_run (-r) must both be given");
	}
	if(mem_level < 0 || mem_level > 3){
		usage();
		errx(1, "cache_level (-l) must be between 0 and 3");
	}
	// Each buffer carries 4096 + 1024 spare bytes: up to 4095 are consumed
	// by page alignment, which leaves 1024 for the caller's offset.
	if(offset_a < 0 || offset_a > 1024 ||
	   offset_b < 0 || offset_b > 1024 ||
	   offset_c < 0 || offset_c > 1024){
		usage();
		errx(1, "buffer offsets (-a, -b, -c) must be between 0 and 1024");
	}

	iter = iter*mult;

// pin core affinity for initialization
	if(pin_cpu(pid, cpu) == -1) {
		err(1,"failed to set affinity");
	}
	else{
		fprintf(stderr," process pinned to core %d for initialization\n",cpu);
	}

// set buffer sizes and loop tripcounts based on memory level
	level_size[0]=L1;
	level_size[1]=L2;
	level_size[2]=L3;
	level_size[3]=L4;
	fprintf(stderr, "len = %d, mem_level = %d, iter = %d, mult = %d\n",len, mem_level, iter,mult);
	len = level_size[mem_level]/32;
	if(len <= 0)
		errx(1, "memory level %d is too small for the triad buffers", mem_level);
	fprintf(stderr, "len = %d, mem_level = %d, iter = %d, mult = %d\n",len, mem_level, iter,mult);
	scale = level_size[3]/(32*len);
	fprintf(stderr, "len = %d, mem_level = %d, iter = %d, mult = %d, scale = %d\n",len, mem_level, iter,mult,scale);
	iter =iter*scale*mult;

	fprintf(stderr, "len = %d, mem_level = %d, iter = %d, mult = %d\n",len, mem_level, iter,mult);

// malloc and initialize buffers
	buf_bytes = sizeof(double)*len + 4096 + 1024;

	buf1 = malloc(buf_bytes);
	if(buf1 == NULL)
		err(1, "malloc of buf1 failed");
	fprintf(stderr," buf1 = %p\n",(void *)buf1);
	// Advance to the next 4 KiB boundary (no move if already aligned),
	// then apply the requested offset.
	buf1 = buf1 + ((0x1000 - (size_t)buf1) & 0xFFF) + offset_a;
	fprintf(stderr," buf1 = %p\n",(void *)buf1);
	a = (double *) buf1;

	buf2 = malloc(buf_bytes);
	if(buf2 == NULL)
		err(1, "malloc of buf2 failed");
	fprintf(stderr," buf2 = %p\n",(void *)buf2);
	buf2 = buf2 + ((0x1000 - (size_t)buf2) & 0xFFF) + offset_b;
	fprintf(stderr," buf2 = %p\n",(void *)buf2);
	b = (double *) buf2;

	buf3 = malloc(buf_bytes);
	if(buf3 == NULL)
		err(1, "malloc of buf3 failed");
	fprintf(stderr," buf3 = %p\n",(void *)buf3);
	buf3 = buf3 + ((0x1000 - (size_t)buf3) & 0xFFF) + offset_c;
	fprintf(stderr," buf3 = %p\n",(void *)buf3);
	c = (double *) buf3;

	for(i=0;i<len;i++){
		a[i] = 0.;
		b[i] = 10.;
		c[i] = 10.;
	}

// pin core affinity for triad run
	if(pin_cpu(pid, cpu_run) == -1) {
		err(1,"failed to set affinity");
	}
	else{
		fprintf(stderr," process pinned to core %d for triad run\n",cpu_run);
	}

// run the triad
	printf(" calling triad %d times with len = %d\n",iter,len);
	call_start = _rdtsc();
	for(i=0;i<iter;i++){
		start = _rdtsc();
		bytes_per = triad(len,xx,a,b,c);
		stop = _rdtsc();
		run_time = stop - start;
		xx+=0.01;
		// Widen before multiplying so the byte count cannot overflow int.
		total_bytes += (unsigned long long)len*bytes_per;
		bw=((double)len*(double)bytes_per)/(double)run_time;
		if(bw > best_bw) best_bw = bw;
	}
	call_stop = _rdtsc();
	call_run_time = call_stop - call_start;
	avg_bw=(double)(total_bytes)/(double)call_run_time;
//  printout (run_time is the duration of the last call only)
	printf(" transfering %llu bytes from memory level %d took %llu cycles/call and a total of %llu\n",total_bytes,mem_level,run_time,call_run_time);
	printf(" average bytes/cycle = %f\n", avg_bw);
	printf(" best bytes/cycle = %f\n",best_bw);
	return 0;
}
Пример #29
0
//******************************************************* MAIN *******************************
int main(int argc, char* argv[]){
    //set up sdl
    SDL_Init(SDL_INIT_VIDEO | SDL_INIT_AUDIO);

    SDL_Window *Window = SDL_CreateWindow("Handmade Hero",
        SDL_WINDOWPOS_UNDEFINED, SDL_WINDOWPOS_UNDEFINED,
        640, 480, SDL_WINDOW_RESIZABLE);
    if(Window){
        SDL_Renderer *Renderer = SDL_CreateRenderer(Window, -1, 0);
    if(Renderer){

    // VIDEO
    sdl_offscreen_buffer Buffer = {}; // if non-initialized, declaring variables in a loop fails on SDLResizeTexture with a pointer error - im blaming the compiler
    sdl_window_dimension Dimension = SDLGetWindowDimension(Window);
    SDLResizeTexture(&Buffer, Renderer, Dimension.Width, Dimension.Height);
    keystates Keys = {};

    //AUDIO
    sdl_sound_output sound_output = sdl_sound_outputH(48000);
    //open audio
    SDLInitAudio(sound_output.SamplesPerSecond, sound_output.SamplesPerSecond * sound_output.BytesPerSample / 60);
    SDL_PauseAudio(0);

    //BW: state area allocation
    void *new_state_area = malloc(128*1024*1024);//need ~2 MB at least; give it 128 MiB -- 2MB for video; dont know audio
    void *prev_state_area = malloc(1024);// should be sizeof(state0), or sizeof(biggest statetype) later
    //apparently we didnt have enough memory, but only crashed sometimes? this fixed it
    void *state;
    
    {
        uint8 *next_ptr = (uint8*)prev_state_area;
        anim_comp *animation = (anim_comp*)next_ptr;
        next_ptr += sizeof(anim_comp);

        init_anim_comp(animation, 0,0);

        state0 *stateptr = (state0*)next_ptr;
        printf("state0 size %ld\n", sizeof(state0));
        uint64 stateptrn = (uint64)stateptr;
        uint64 sizeptr = (uint64)&(stateptr->size);
        uint64 deepc_ptr = (uint64)&(stateptr->deep_count);
        uint64 anim_ptr = (uint64)&(stateptr->animation);
        uint64 tsine_ptr = (uint64)&(stateptr->tSine);
        uint64 tvol_ptr = (uint64)&(stateptr->ToneVolume);
        uint64 thz_ptr = (uint64)&(stateptr->ToneHz);
        uint64 pu_ptr = (uint64)&(stateptr->pitch_up_was_pressed);
        printf("offset begin    %lu\n", sizeptr - stateptrn);
        printf("width of size   %lu\n", deepc_ptr - sizeptr);
        printf("width of deepc  %lu\n", anim_ptr - deepc_ptr);
        printf("width of animpt %lu\n", tsine_ptr - anim_ptr);
        printf("width of tsine  %lu\n", tvol_ptr - tsine_ptr);
        printf("width of tvol   %lu\n", thz_ptr - tvol_ptr);
        printf("width of thz    %lu\n", pu_ptr - thz_ptr);

        next_ptr += sizeof(state0);

        init_state0(stateptr, animation, 3000, 256, 0);

        state = stateptr;
        //return 0;
    }

    uint64 LastCounter = SDL_GetPerformanceCounter();
    uint64 LastCycleCount = _rdtsc();
    uint64 PerfCountFrequency = SDL_GetPerformanceFrequency();

    bool running = true;
    //main loop
    printf("enter main event loop\n");
    while(running){
        ///////NP_UPDATE/////////////
        //event capturing
        event_return events = eventHandler(&Keys);
        if(events.shouldQuit) running = false;
        //setup for p
        state_window_info Wi = {}; Wi.Height = Buffer.Height; Wi.Width = Buffer.Width; Wi.Pitch = Buffer.Pitch;
        int TargetQueueBytes = sound_output.LatencySampleCount * sound_output.BytesPerSample;
        state_sound_info Si = {}; Si.BytesToGet = TargetQueueBytes - SDL_GetQueuedAudioSize(1); Si.BytesPerSample = sound_output.BytesPerSample; Si.SamplesPerSecond = sound_output.SamplesPerSecond;
        uint64 state_size = 0;
        uint64 vbuffer_size = Buffer.Height * Buffer.Pitch;
        uint64 abuffer_size = Si.BytesToGet;
        state_return next;
        { //in case(statetype) or similar
            next = P_update_state0(*(state0*)state, new_state_area, Keys, Wi, Si);
            //p should return state_size? and also, what statetype we are in -> later
            state_size = sizeof(state0);
        }

        //GARBAGE COLLECTOR
        //move this state to previous state
        //fmemcpy(prev_state_area, new_state_area, state_size); //shallow copy, as supposed
        //printf("hi\n");
        deepcopy(prev_state_area, next.state, new_state_area, (uint8*)new_state_area + 128*1024*1024);
        //TODO(md): DEEPCPY

        //queue audio
        if (Si.BytesToGet > 0) SDL_QueueAudio(1, next.abuffer, abuffer_size);

        //render
        SDLUpdateWindow(Window, Renderer, Buffer, next.vbuffer);


        uint64 EndCycleCount = _rdtsc();
        uint64 EndCounter = SDL_GetPerformanceCounter();
        uint64 CounterElapsed = EndCounter - LastCounter;
        uint64 CyclesElapsed = EndCycleCount - LastCycleCount;

        real64 MSPerFrame = (((1000.0f * (real64)CounterElapsed) / (real64)PerfCountFrequency));
        real64 FPS = (real64)PerfCountFrequency / (real64)CounterElapsed;
        real64 MCPF = ((real64)CyclesElapsed / (1000.0f * 1000.0f));

        printf("%.02fms/f, %.02f/s, %.02fmc/f\n", MSPerFrame, FPS, MCPF);

        LastCycleCount = EndCycleCount;
        LastCounter = EndCounter;
    }
    } } //if(Renderer, Window)
    SDL_Quit();
    return 0;
}
Пример #30
0
/*
 * Random-walker latency driver.
 *
 * Options:
 *   -i cpu    core the process is pinned to during setup
 *   -r cpu    core the process is pinned to while reader() runs
 *   -l lines  number of lines in the walk (must be a multiple of -S)
 *   -s pages  stride in pages; the byte stride is page_size*pages + 64
 *   -S segs   number of segments the walk is shuffled over (default 32)
 *   -m mult   call-count multiplier (default 1)
 *   -L        use 2 MiB huge pages for the buffers
 *
 * Builds the index array that is handed to reader(), maps and zeroes two
 * buffers, then calls reader() `iter` times and prints the total cycle
 * count and the average cycles per iteration. Argument, allocation and
 * mapping errors exit through err()/errx().
 */
int main(int argc, char ** argv)
{
	char * buf1;
	size_t ret_val = 0;
	size_t array_stride;
	int i,j,cpu=-1,cpu_run=-1,line_count=0,stride=-1, fd = -1;
	off_t offset = 0;
	int len=10240000, iter=100,mult=1,main_ret=0;
	double iterations;
	double *a, *b;
	size_t start, stop, run_time, call_start, call_stop, call_run_time;
	__pid_t pid=0;
	size_t buf_size,jj,zero_loop, buf_by_num_seg,ind;
	size_t num_pages, page_size, var_size;
	int *pattern;
	int step, c, map_flags;
	int* index, lc_by_num_seg,count, num_seg=32, huge=0;

	page_size = 4096;

//	process input arguments

	if(argc < 6){
		fprintf(stderr,"the random walker requires at least 6 arguments (only the 7th in the list below is optional), there were %d\n",argc);
		usage();
		err(1,"insufficient invocation arguments");
		}

	while ((c = getopt(argc, argv, "i:r:l:s:S:m:L")) != -1) {
		switch(c) {
		case 'i':
			cpu = atoi(optarg);
			break;
		case 'r':
			cpu_run = atoi(optarg);
			break;
		case 'l':
			line_count = atoi(optarg);
			break;
		case 's':
			stride = atoi(optarg);
			break;
		case 'S':
			num_seg = atoi(optarg);
			break;
		case 'm':
			mult = atoi(optarg);
			break;
		case 'L':
			huge=1;
			page_size = 2 * 1024 * 1024;
			break;
		default:
			err(1, "unknown option %c", c);
		}
	}

	// -i, -r, -l and -s have no defaults; reject a command line that left
	// any of them out, and a segment count that would divide by zero.
	if(cpu < 0 || cpu_run < 0 || line_count < 1 || stride < 0 || num_seg < 1){
		usage();
		errx(1,"options -i, -r, -l (>= 1) and -s (>= 0) are required and -S must be >= 1");
		}

	iter = iter*mult;


	var_size = sizeof(size_t);
	fprintf(stderr, "size_t in %zu bytes\n",var_size);
// pin core affinity

	if(pin_cpu(pid, cpu) == -1) {
		err(1,"failed to set affinity");
		}
	else{
		fprintf(stderr," process pinned to core %d\n",cpu);
		}

	pattern = (int*) malloc(num_seg*sizeof(int));
	if(pattern == NULL)
		{
		fprintf(stderr," failed to malloc pattern for size = %d\n",num_seg);
		err(1,"malloc of pattern failed");
		}

// calculate stride and buffer size
	stride = page_size*stride + 64;
	buf_size = (size_t)line_count*(size_t)stride;
	num_pages = buf_size/page_size + 2;
	buf_size = page_size*num_pages;
	array_stride = stride/sizeof(double);
	iterations = (double)iter*(double)len;

//    create index array for "random" pattern
//    One spare, zero-initialized slot: the segmented fill below stores to
//    index[1..line_count], and the single-segment fill leaves the last
//    entry untouched.
	index = (int*)calloc((size_t)line_count + 1, sizeof(int));
	if(index == NULL)
		{
		fprintf(stderr," failed to malloc index array for line_count of %d\n",line_count);
		err(1,"failed to malloc index");
		}
	if(num_seg == 1)
		{
		for(i=0; i<line_count-1; i++)index[i] = i;
		}
	else
		{
		rndm_list(pattern,num_seg);
		lc_by_num_seg = line_count/num_seg;
		if(lc_by_num_seg*num_seg != line_count)
			{
			fprintf(stderr," line count must be a multiple of the fifth argument num_seg = %d\n", num_seg);
			err(1," bad line_count");
			}
		count=0;
		buf_by_num_seg = buf_size/num_seg;
		(void)buf_by_num_seg;
		for(i=0; i<lc_by_num_seg; i++)
			{
			step = 0;
			for(j=0;j<num_seg;j++)
				{
				count++;
				// the last segment of each round moves on to the next slot
				if(j == (num_seg-1) ) step = 1;
				ind = lc_by_num_seg*pattern[j];
				index[count]= (int) ind + i + step;
				if(index[count] >= line_count)
					printf(" count = %d, index = %d\n",count,index[count]);
				}
			}
		}
	index[0] = 0;

	// convert line numbers into element offsets within the double arrays
	for(i=0; i<line_count; i++)index[i] = index[i]*array_stride;

// map and initialize buffers

	map_flags = MAP_PRIVATE | MAP_ANON;
	if(huge == 1)
		map_flags |= MAP_HUGETLB;

	buf1 = (char*) mmap(NULL,buf_size, PROT_READ | PROT_WRITE, map_flags, fd, offset);
	if(buf1 == MAP_FAILED)
		{
		fprintf(stderr,"mmap failed\n");
		err(1,"mmap failed");
		} 
	fprintf(stderr," buf1 for a = %p\n",(void *)buf1);
	a = (double*) buf1;

	buf1 = (char*) mmap(NULL,buf_size, PROT_READ | PROT_WRITE, map_flags, fd, offset);
	if(buf1 == MAP_FAILED)
		{
		fprintf(stderr,"mmap failed\n");
		err(1,"mmap failed");
		} 
	fprintf(stderr," buf1 for b = %p\n",(void *)buf1);
	b = (double*)buf1;


	zero_loop = buf_size/sizeof(double);
	fprintf(stderr, " buf_size = %zu, zero_loop = %zu, array_stride = %zu\n",buf_size,zero_loop,array_stride);
	
	// size_t counter: zero_loop can exceed INT_MAX for large buffers
	for(jj=0; jj<zero_loop; jj++) a[jj] = 0;
	for(jj=0; jj<zero_loop; jj++) b[jj] = 0;
	fprintf(stderr," finished zeroing buf for a, b\n");


// pin core affinity
	if(pin_cpu(pid, cpu_run) == -1) {
		err(1,"cannot set cpu run affinity");
		}
	else{
		printf(" process pinned to core %d to run\n",cpu_run);
		}

// run the walker
	printf(" calling walker %d times which loops  %d times on buffer of %d lines with a stride of %d, for a total size of %zu\n",iter,len,line_count,stride,buf_size);
	call_start = _rdtsc();
	for(i=0;i<iter;i++){
		start = _rdtsc();
		ret_val = reader(len,line_count,a,b,index);
		stop = _rdtsc();
		run_time = stop - start;
		}
	call_stop = _rdtsc();
	call_run_time = call_stop - call_start;
	(void)ret_val;
	(void)run_time;
	printf(" run time = %zu\n",call_run_time);

//  printout
	printf(" average cycles per iteration = %f\n", (double)call_run_time/iterations);

	munmap(a, buf_size);
	munmap(b, buf_size);
	free(index);
	free(pattern);
	return main_ret;
}