/**
 * Benchmark the batched enqueue path of the queue under test.
 *
 * Waits on the shared `barrier`, fills a STRIDE-element buffer with the
 * values 0..STRIDE-1, then enqueues that same buffer `count / STRIDE` times,
 * busy-waiting on each batch until my_queue_enqueue_multiple() returns 0.
 * A second wait on `barrier` follows the timed region.
 *
 * @param count  Total number of elements to send. Only whole batches are
 *               sent, so the remainder `count % STRIDE` is not enqueued.
 *               A non-positive count sends nothing.
 * @return Difference between the get_cycle_count() readings taken
 *         immediately after and immediately before the enqueue loop.
 */
uint64_t bench_sender_multiple(int count)
{
    tmc_spin_barrier_wait(&barrier);

    uint64_t data_array[STRIDE];
    for (uint64_t i = 0; i < STRIDE; i++) {
        data_array[i] = i;
    }

    /*
     * Compute the batch count once, as an unsigned value. Comparing the
     * uint64_t loop index directly against the signed `count / STRIDE`
     * would convert a negative quotient to a huge unsigned bound and the
     * loop would effectively never terminate.
     */
    const uint64_t batches = (count > 0) ? (uint64_t)count / STRIDE : 0;

    uint64_t start = get_cycle_count();
    for (uint64_t i = 0; i < batches; i++) {
        /* Retry until the whole batch is accepted (non-zero means "try again"). */
        while (unlikely(my_queue_enqueue_multiple(queue, data_array, STRIDE) != 0)) {
        }
    }
    uint64_t finish = get_cycle_count();

    tmc_spin_barrier_wait(&barrier);
    return finish - start;
}
/**
 * Benchmark the single-element enqueue path of the queue under test.
 *
 * Waits on the shared `barrier`, then enqueues the values 0..count-1 one at
 * a time, busy-waiting on each element until my_queue_enqueue() returns 0.
 * A second wait on `barrier` follows the timed region.
 *
 * @param count  Number of elements to enqueue. A non-positive count
 *               enqueues nothing.
 * @return Difference between the get_cycle_count() readings taken
 *         immediately after and immediately before the enqueue loop.
 */
uint64_t bench_sender(int count)
{
    /*
     * Convert the bound once. Comparing the uint64_t loop index directly
     * against a negative int would convert it to a huge unsigned value and
     * the loop would effectively never terminate.
     */
    const uint64_t total = (count > 0) ? (uint64_t)count : 0;

    tmc_spin_barrier_wait(&barrier);

    uint64_t start = get_cycle_count();
    for (uint64_t i = 0; i < total; i++) {
        /* Retry until the element is accepted (non-zero means "try again"). */
        while (unlikely(my_queue_enqueue(queue, i) != 0)) {
        }
    }
    uint64_t finish = get_cycle_count();

    tmc_spin_barrier_wait(&barrier);
    return finish - start;
}
/**
 * Benchmark the dequeue path of the queue under test.
 *
 * Waits on the shared `barrier`, then dequeues `count` elements one at a
 * time, busy-waiting on each until my_queue_dequeue() returns 0. The
 * dequeued values are discarded. A second wait on `barrier` follows the
 * timed region.
 *
 * @param count  Number of elements to dequeue. A non-positive count
 *               dequeues nothing.
 * @return Difference between the get_cycle_count() readings taken
 *         immediately after and immediately before the dequeue loop.
 */
uint64_t bench_receiver(int count)
{
    /*
     * Convert the bound once. Comparing the uint64_t loop index directly
     * against a negative int would convert it to a huge unsigned value and
     * the loop would effectively never terminate.
     */
    const uint64_t total = (count > 0) ? (uint64_t)count : 0;
    uint64_t data; /* written by my_queue_dequeue(); the value is not used */

    tmc_spin_barrier_wait(&barrier);

    uint64_t start = get_cycle_count();
    for (uint64_t i = 0; i < total; i++) {
        /* Retry until an element is available (non-zero means "try again"). */
        while (unlikely(my_queue_dequeue(queue, &data) != 0)) {
        }
    }
    uint64_t finish = get_cycle_count();

    tmc_spin_barrier_wait(&barrier);
    return finish - start;
}
/**
 * Visit every CPU in this task's affinity set in turn, sampling the counters
 * and the cycle count on each, then print the cycle delta between
 * consecutive CPUs and the sampled drd_cnt value.
 *
 * Note: the report loop starts at index 1, so the drd_cnt sampled on the
 * first CPU (index 0) is recorded but not printed.
 *
 * @return 0 on success; failures terminate via tmc_task_die().
 */
int main(void)
{
    cpu_set_t cpus;
    int wr_cnt, wr_miss, drd_cnt, drd_miss;

    // Init cpus
    if (tmc_cpus_get_my_affinity(&cpus) != 0) {
        tmc_task_die("Failure in 'tmc_cpus_get_my_affinity()'.");
    }
    int num_cpus = tmc_cpus_count(&cpus);
    printf("cpus_count is: %i\n", num_cpus);

    // num_cpus sizes the two variable-length arrays below; a size <= 0 is
    // undefined behaviour, so bail out before declaring them.
    if (num_cpus < 1) {
        tmc_task_die("No cpus available in affinity set.");
    }

    // Setup Counters
    setup_all_counters(&cpus);

    unsigned long start_for = get_cycle_count();
    unsigned long cycles[num_cpus];
    unsigned int drd_cnts[num_cpus];
    for (int i = 0; i < num_cpus; i++) {
        // Move this task onto the i-th CPU of the set before sampling.
        if (tmc_cpus_set_my_cpu(tmc_cpus_find_nth_cpu(&cpus, i)) < 0) {
            tmc_task_die("failure in 'tmc_set_my_cpu'");
        }
        read_counters(&wr_cnt, &wr_miss, &drd_cnt, &drd_miss);
        drd_cnts[i] = (unsigned int)drd_cnt;
        cycles[i] = get_cycle_count();
    }
    unsigned long end_for = get_cycle_count();

    for (int i = 1; i < num_cpus; i++) {
        unsigned long temp = cycles[i] - cycles[i-1];
        printf("time between %i and %i is %lu\n", i-1, i, temp);
        // drd_cnts holds unsigned int, so it must be printed with %u.
        printf("drd_cnt for tile %i was %u\n", i, drd_cnts[i]);
    }
    printf("Total cycles for-loop: %lu\n", end_for-start_for);
    return 0;
}
int main(){ // const int M = 128; // const int MATRIX_SIZE = 16; int* M1 = (int*)aligned_malloc(sizeof(int)*M*K, 32); int* M2 = (int*)aligned_malloc(sizeof(int)*N*K, 32); int* M3 = (int*)aligned_malloc(sizeof(int)*M*N, 32); // int* M1 = (int*)malloc(sizeof(int)*M*K); // int* M2 = (int*)malloc(sizeof(int)*N*K); // int* M3 = (int*)malloc(sizeof(int)*M*N); // for(int i = 0; i < MATRIX_SIZE; i++){ // M1[i] = read_from_int(_M1[i]); // M2[i] = read_from_int(_M2[i]); // M3[i] = read_from_int(_M3[i]); // } for(int i = 0; i < M; i++){ for(int j = 0; j < K; j++){ M1[i*K+j] = 1; // M1[i*K+j] = read_from_int(1); } } for(int i = 0; i < K; i++){ for(int j = 0; j < N; j++){ M2[i*N+j] = 1; // M2[i*N+j] = read_from_int(1); } } for(int i = 0; i < M; i++){ for(int j = 0; j < N; j++){ M3[i*N+j] = 0; // M3[i*N+j] = read_from_int(0); } } // for (int bank = 0; bank < 8; bank++) { // int* address = (int*)(bank * 0x20); // loki_channel_flush_all_lines(1, address); // loki_channel_invalidate_all_lines(1, address); // } // for(int i = 0; i < MATRIX_SIZE; i++){ // M1[i] = readDouble(1); // M2[i] = readDouble(1); // M3[i] = readDouble(0.5); // } unsigned long cycle_count = get_cycle_count(); unsigned long instr_count = get_instruction_count(); // dgemm_nn(M, N, K, // M1, K,1, // M2, N,1, // M3, N,1); dgemv_nn(M, K, M1, K, M2, M3); cycle_count = get_cycle_count() - cycle_count; instr_count = get_instruction_count() - instr_count; fprintf(stderr, "takes %lu cycle to complete \n", cycle_count); fprintf(stderr, "takes %lu instructions \n", instr_count); for(int i = 0; i < M; i++){ // // fprintf(stderr, "%d \n", fix16_to_int(M3[i])); for(int j = 0; j < N; j++){ if(M3[i*N+j] != M) fprintf(stderr, "%d at %d, %d \n", M3[i*N+j], i, j); // fprintf(stderr, "%d ", fix8_to_int(M3[i*N+j])); // fprintf(stderr, "%d ", M3[i*N+j]); } // fprintf(stderr, "\n\n"); } }
/* Platform tick source: returns the current value of get_cycle_count(). */
inline ticks getticks_platf()
{
    const ticks now = get_cycle_count();
    return now;
}
/*
 * clock() implementation backed by the cycle counter: returns the value of
 * get_cycle_count() divided by (CLOCK_HZ / CLOCKS_PER_SEC).
 *
 * NOTE(review): presumably CLOCK_HZ is the core clock frequency in Hz, making
 * the divisor "cycles per clock tick" - confirm against its definition.
 * NOTE(review): if both macros are integers, the inner quotient truncates,
 * and it would be zero (division by zero) should CLOCK_HZ be smaller than
 * CLOCKS_PER_SEC - confirm the configured values.
 */
clock_t clock(void) { return get_cycle_count() / (CLOCK_HZ / CLOCKS_PER_SEC); }
/* Thin wrapper: returns the current get_cycle_count() reading as cycles_t. */
static inline cycles_t cyclecount(void)
{
    const cycles_t now = get_cycle_count();
    return now;
}
/*
 * Run one step of the cascaded second-order-section (biquad) filter.
 *
 * All state lives in variables declared outside this function (interp,
 * phasein/phaseout, ratio/ratio1, inptr/resptr, coefptr, D, numcells, ...).
 *
 * Per call:
 *  1. Input: when phasein reaches 0 a sample is read from *inptr (advancing
 *     inptr) and phasein is reloaded; otherwise a zero sample is used and
 *     phasein is decremented.
 *  2. Filter: the sample is passed through numcells cells in sequence. Each
 *     cell uses 5 coefficients (coefptr[5*cell .. 5*cell+4]) and 4 delay
 *     slots (D[4*cell .. 4*cell+3]).
 *  3. Output: when phaseout reaches 0 the filtered value is scaled, written
 *     to *resptr (advancing resptr) and phaseout is reloaded; otherwise
 *     phaseout is decremented and nothing is written.
 *
 * Returns 1 if an output sample was written on this call, 0 otherwise.
 *
 * With both DEBUGIIR and MMDSP defined, get_cycle_count() is sampled into
 * curr_count on entry and new_count on exit.
 */
int runbiq(void) {
#ifdef DEBUGIIR
#ifdef MMDSP
    curr_count = get_cycle_count();
#endif
#endif
    /* Fetch a new input sample every `ratio` calls (interp != 0) or every
       `ratio1` calls (interp == 0); feed zeros on the calls in between. */
    if (interp!=0) {
        if (phasein==0) {
            val=*inptr++;
            phasein=ratio-1;
        } else {
            val=0;
            phasein--;
        }
    } else {
        if (phasein==0) {
            val=*inptr++;
            phasein=ratio1-1;
        } else {
            val=0;
            phasein--;
        }
    }
    /* On entry to each iteration `val` is the cell's input; on exit it is the
       cell's output, which becomes the input of the next cell. */
    for (cell=0;cell<(int)numcells;cell++) {
        /*------------- numerator mac -----------*/
        /* D[4*cell] and D[4*cell+1] hold the cell's two previous inputs. */
#ifdef MMDSP
        acc=wL_imul(val,coefptr[5*cell]);
        acc += wX_fmul(D[4*cell],coefptr[5*cell+1]); /* use fractional multiplication to get 2*(1st order coef) */
        acc += wL_imul(D[4*cell+1],coefptr[5*cell+2]);
#else
        acc=val*coefptr[5*cell];
        acc+=D[4*cell]*coefptr[5*cell+1]*2;
        acc+=D[4*cell+1]*coefptr[5*cell+2];
#endif
        /*------------- denominator mac ---------*/
        /* D[4*cell+2] and D[4*cell+3] hold the cell's two previous outputs. */
#ifdef MMDSP
        acc-= wX_fmul(D[4*cell+2],coefptr[5*cell+3]); /* use fractional multiplication to get 2*(1st order coef) */
        acc-= wL_imul(D[4*cell+3],coefptr[5*cell+4]);
#else
        acc-=D[4*cell+2]*coefptr[5*cell+3]*2;
        acc-=D[4*cell+3]*coefptr[5*cell+4];
#endif
        /*-------------- update delay line ------*/
        /* Order matters: the input history is shifted while `val` still holds
           the cell input; `val` is overwritten with the cell output only
           afterwards and then stored as the newest output. */
        D[4*cell+1]=D[4*cell];
        D[4*cell]=val;
        D[4*cell+3]=D[4*cell+2];
#ifdef MMDSP
        acc=wX_msl(acc,1); /* double result to compensate for integer multiplication*/
        val=waddr(acc,acc0);
#else
        val=acc;
#endif
        D[4*cell+2]=val;
    }
    /*------------- process output sample --------*/
    /* Emit an output every `ratio1` calls scaled by kout (interp != 0), or
       every `ratio` calls scaled by kout_dec (interp == 0); in both cases the
       scaled value is additionally shifted left by shiftout. */
    if (interp!=0) {
        if (phaseout==0) {
#ifdef MMDSP
            acc=wX_fmul(val,kout);
            acc= wX_msl(acc, shiftout);
            *resptr++=waddr(acc,acc0);
#else
            acc=val*kout;
            acc = acc * (1<<shiftout);
            *resptr++=acc;
#endif
            phaseout=ratio1-1;
            ret_val=1;
        } else {
            phaseout--;
            ret_val=0;
        }
    } else {
        if (phaseout==0) {
#ifdef MMDSP
            acc=wX_fmul(val,kout_dec);
            acc= wX_msl(acc, shiftout);
            *resptr++=waddr(acc,acc0);
#else
            acc=val*kout_dec;
            acc = acc * (1<<shiftout);
            *resptr++=acc;
#endif
            phaseout=ratio-1;
            ret_val=1;
        } else {
            phaseout--;
            ret_val=0;
        }
    }
    /*---------------------------------------------*/
#ifdef DEBUGIIR
#ifdef MMDSP
    new_count = get_cycle_count();
#endif
#endif
    return(ret_val);
}