/*
 * Collect counters for `kernel` in five readCounters() passes.
 *
 * Each pass hands one event group and the same kernel pointer to
 * readCounters(); the groups events, events2, events3, events4 and events5
 * hold 5, 3, 3, 3 and 1 events respectively.  The passes are bracketed by
 * my_papi_init() and my_papi_finalize().
 */
void my_papi_profile(void *(*kernel)(void *))
{
    my_papi_init();

    readCounters(events,  5, kernel);
    readCounters(events2, 3, kernel);
    readCounters(events3, 3, kernel);
    readCounters(events4, 3, kernel);
    readCounters(events5, 1, kernel);

    my_papi_finalize();
}
/*FGROUP SHM rc: 0: ok 1: error reading counters Operation: action: read all counters + temperature */ int readCNTS2SHM() { w32 secs, mics; if(shmcnts==NULL) return(1); /* for(ix=0; ix<10; ix++) { shmcnts[ix]=ix; //printf("readCNTS2SHMix:%d\n",ix); }; return(0); */ GetMicSec(&secs, &mics); /* ctpc[CSTART_SPEC+2]= vmer32(L2_ORBIT_READ); readCounters(ctpc, NCOUNTERS, 0); readTVCounters(&ctpc[CSTART_SPEC+3]); //printf("readctpcounters: readCounters ok\n"); ctpc[CSTART_SPEC]= secs; ctpc[CSTART_SPEC+1]= mics; //ctpc[13]= getCounter(1,13); ctpc[CSTART_L1+5]= getCounter(2,5); */ readCounters(shmcnts, LTUNCOUNTERS, 0); // LTU counters shmcnts[temperaturerp]= ReadTemperature(); shmcnts[epochsecsrp]= secs; shmcnts[epochmicsrp]= mics; return(0); }
int main(int argc, char *argv[]) { int i,j,k; char *ts1,*ts2; machineInformation currentMachine; counterSessionInfo session; double seconds = 0.0; // Set machine information from CounterHomeBrew.h currentMachine.cpu_model = CPU_MODEL; currentMachine.num_sockets = NUM_SOCKETS; currentMachine.num_phys_cores_per_socket = NUM_PHYS_CORES_PER_SOCKET; currentMachine.num_cores_per_socket = NUM_CORES_PER_SOCKET; currentMachine.num_cores = NUM_CORES; currentMachine.num_cbos = NUM_PHYS_CORES_PER_SOCKET; currentMachine.core_gen_counter_num_max = CORE_GEN_COUNTER_MAX; currentMachine.cbo_counter_num_max = CBO_COUNTER_NUM_MAX; // NHM-EX session.core_gen_counter_num_used = 0; int32 core_event_numbers[] = {}; int32 core_umasks[] = {}; session.cbo_counter_num_used = 1; int32 cbo_event_numbers[] = {0x14}; int32 cbo_umasks[] = {0x7}; // JKT /* session.core_gen_counter_num_used = 5; int32 core_event_numbers[] = {0x10,0x10,0x11,0x51,0xF1}; int32 core_umasks[] = {0x80,0x10,0x02,0x01, 0x07}; session.cbo_counter_num_used = 1; int32 cbo_event_numbers[] = {0x37}; int32 cbo_umasks[] = {0xf}; session.cbo_filter = 0x1f; */ for (i = 0; i < session.core_gen_counter_num_used; i++) { session.core_event_numbers[i] = core_event_numbers[i]; session.core_umasks[i] = core_umasks[i]; } for (i = 0; i < session.cbo_counter_num_used; i++) { session.cbo_event_numbers[i] = cbo_event_numbers[i]; session.cbo_umasks[i] = cbo_umasks[i]; } int fd[NUM_CORES]; // Arrays to hold counter data... counterData before; counterData after; // some data for doing a naive matmul to test flop counting... 
// initloop(N); uint64 min_iters = 1; double minRuntime = 10.0; int M = atoi(argv[1]); int K = atoi(argv[2]); double *A = NULL; double *b = NULL; double *c = NULL; // posix_memalign((void**)A,64,M*K*sizeof(double)); // posix_memalign((void**)B,64,K*N*sizeof(double)); // posix_memalign((void**)C,64,M*N*sizeof(double)); A = (double*) malloc( M * K * sizeof(double) ); b = (double*) malloc( K * sizeof(double) ); c = (double*) malloc( M * sizeof(double) ); fill( A, M * K ); fill( b, K ); fill( c, M ); // open the msr files for each core on the machine for (i = 0; i < currentMachine.num_cores; i++) open_msr_file(i,&fd[i]); // warm up da caches... doNaiveMatVec(M,K, A, b, c); // Program the counters!!! int socketsProgrammed = 0; for (i = 0; i < currentMachine.num_cores; i++) { int currentCoreFD = fd[i]; /* clear global control register before programming */ stopCounters(i, currentCoreFD, ¤tMachine, &session); /* set up the fixed counters on each core */ programCoreFixedCounters(currentCoreFD); /* set up the general purpose registers for each core */ programGeneralPurposeRegisters(currentCoreFD, ¤tMachine, &session); /* Program the Uncore as desired...*/ // Only program the first physical core on each socket. // NOTE: Some assumptions about topology here...check /proc/cpuinfo to confirm. 
#if CPU_MODEL == JAKETOWN if (i % currentMachine.num_phys_cores_per_socket == 0 && socketsProgrammed < currentMachine.num_sockets) #elif CPU_MODEL == NEHALEM_EX if (i < currentMachine.num_sockets && socketsProgrammed < currentMachine.num_sockets) #elif CPU_MODEL == IVY_BRIDGE if (i < currentMachine.num_sockets && socketsProgrammed < currentMachine.num_sockets) #endif { programUncoreCounters( currentCoreFD, ¤tMachine, &session); socketsProgrammed++; } /* set global control register to active counters */ // startCounters( i, currentCoreFD, ¤tMachine, &session); } uint64 num_iters; for (num_iters = min_iters; seconds < minRuntime; num_iters *=2) { if (num_iters != min_iters) { free(ts1); free(ts2); } sleep(5); seconds = 0.0; // start the programmed counters... for (i = 0; i < currentMachine.num_cores; i++) startCounters( i, fd[i], ¤tMachine, &session); /* READ COUNTERS BEFORE STUFF */ readCounters(fd,¤tMachine,&session, &before); ts1 = getTimeStamp(); seconds = read_timer(); /* DO STUFF */ for (i =0; i < num_iters; i++) doNaiveMatVec(M,K, A, b, c); /* END DOING STUFF */ seconds = read_timer()-seconds; ts2 = getTimeStamp(); /* READ COUNTERS AFTER STUFF */ for (i = 0; i < currentMachine.num_cores; i++) stopCounters(i,fd[i],¤tMachine, &session); } num_iters /= 2; readCounters(fd,¤tMachine,&session,&after); diffCounterData(¤tMachine, &session, &after, &before, &after); uint64 *coreSums; coreSums = (uint64*)calloc(currentMachine.num_sockets*session.core_gen_counter_num_used,sizeof(uint64)); for (i = 0; i < currentMachine.num_sockets; i++) { for (j = 0; j < currentMachine.num_cores_per_socket; j++) { for (k = 0; k < session.core_gen_counter_num_used; k++) // coreSums[i*session.core_gen_counter_num_used + k] += after.generalCore[i*currentMachine.num_cores_per_socket + j][k]; coreSums[k] += after.generalCore[i*currentMachine.num_cores_per_socket + j][k]; } } uint64 *sums; sums = (uint64*)calloc(currentMachine.num_sockets*session.cbo_counter_num_used,sizeof(uint64)); for (i = 
0; i < currentMachine.num_sockets; i++) { for (j = 0; j < currentMachine.num_cbos; j++) { for (k = 0; k < session.cbo_counter_num_used; k++) // sums[i*session.cbo_counter_num_used + k] += after.cboUncore[i*currentMachine.num_phys_cores_per_socket + j][k]; sums[k] += after.cboUncore[i*currentMachine.num_phys_cores_per_socket + j][k]; } } // only print data from first socket and core printf("%s,%s,%"PRIu64",%d,%d,%d,%f,",ts1,ts2,num_iters,M,K,K,seconds/(double)num_iters); for (j = 0; j < session.core_gen_counter_num_used; j++) // printf("%"PRIu64",",after.generalCore[0][j]); printf("%f,",coreSums[j]/(double)num_iters); for (j = 0; j < session.cbo_counter_num_used; j++) printf("%f,",sums[j]/(double)num_iters); printf("\n"); free(sums); free(coreSums); // Stop counters, reset PMU, close msr files cleanup(fd,¤tMachine,&session); free(A); free(b); free(c); return 0; }
int main(int argc, char *argv[]) { int i,j,k; machineInformation currentMachine; counterSessionInfo session; initializeCUDA(); // Set machine information from CounterHomeBrew.h currentMachine.cpu_model = CPU_MODEL; currentMachine.num_sockets = NUM_SOCKETS; currentMachine.num_phys_cores_per_socket = NUM_PHYS_CORES_PER_SOCKET; currentMachine.num_cores_per_socket = NUM_CORES_PER_SOCKET; currentMachine.num_cores = NUM_CORES; currentMachine.num_cbos = NUM_PHYS_CORES_PER_SOCKET; // should multiply by NUM_SOCKETS??? currentMachine.core_gen_counter_num_max = CORE_GEN_COUNTER_MAX; currentMachine.cbo_counter_num_max = CBO_COUNTER_NUM_MAX; // Set session events, umasks and counters used // int32 core_event_numbers[] = {FP_COMP_OPS_EXE_EVTNR,SIMD_FP_256_EVTNR,0x51,0xF1,0x80}; // int32 core_umasks[] = {FP_COMP_OPS_EXE_SCALAR_DOUBLE_UMASK,SIMD_FP_256_PACKED_DOUBLE_UMASK,0x01, 0x07,0x01}; session.core_gen_counter_num_used = 5; int32 core_event_numbers[] = {0x10,0x10,0x11,0x51,0xF1}; int32 core_umasks[] = {0x20,0x40,0x01,0x01, 0x07}; session.cbo_counter_num_used = 1; int32 cbo_event_numbers[] = {0x37}; int32 cbo_umasks[] = {0xf}; session.cbo_filter = 0x1f; for (i = 0; i < session.core_gen_counter_num_used; i++) { session.core_event_numbers[i] = core_event_numbers[i]; session.core_umasks[i] = core_umasks[i]; } for (i = 0; i < session.cbo_counter_num_used; i++) { session.cbo_event_numbers[i] = cbo_event_numbers[i]; session.cbo_umasks[i] = cbo_umasks[i]; } int fd[NUM_CORES]; // Arrays to hold counter data... counterData before; counterData after; // some data for doing a naive matmul to test flop counting... // initloop(N); // M,N,K are multiples of the block size.... 
int gpuOuter = atoi(argv[1]); int gpuInner = atoi(argv[2]); int cpuInner = atoi(argv[3]); double minRuntime = atoi(argv[4]); int Md = atoi(argv[5])*block_size; int Nd = atoi(argv[6])*block_size; int Kd = atoi(argv[7])*block_size; int Mh = atoi(argv[8]); int Nh = atoi(argv[9]); int Kh = atoi(argv[10]); char *ts1,*ts2,*ts3,*ts4; char *ts5,*ts6,*ts7,*ts8; double fineTimeStamps[8]; double gTime = 0.0; double cTime = 0.0; double seconds = 0.0; int num_iters; uint64 *coreSums; coreSums = (uint64*)calloc(currentMachine.num_sockets*session.core_gen_counter_num_used,sizeof(uint64)); uint64 *sums; sums = (uint64*)calloc(currentMachine.num_sockets*session.cbo_counter_num_used,sizeof(uint64)); float *Atmp = NULL; float *Btmp = NULL; float *Ctmp = NULL; Atmp = (float*) malloc( Mh * Nh * sizeof(float) ); Btmp = (float*) malloc( Nh * sizeof(float) ); Ctmp = (float*) malloc( Mh * sizeof(float) ); randomInit(Atmp,Mh*Nh); randomInit(Btmp,Nh); for (num_iters = cpuInner; seconds < minRuntime; num_iters *=2) { seconds = 0.0; for (i =0; i < num_iters; i++) BLASFUNC( CblasColMajor,CblasNoTrans,Mh,Nh, 1, Atmp,Mh, Btmp,1, 1, Ctmp,1 ); seconds = read_timer()-seconds; } // num_iters /= 2; free(Atmp); free(Btmp); free(Ctmp); int readyThreads = 0; #pragma omp parallel { int threadNum = omp_get_thread_num(); int numThreads = omp_get_num_threads(); assert(numThreads==2); if (threadNum == 0) { cudaError_t error; int memSizeA = sizeof(float)*Md*Nd; int memSizeB = sizeof(float)*Nd; int memSizeC = sizeof(float)*Md; float *Ahost,*Bhost,*Chost; // use pinned memory on the host for BW and asynch memory transfers.. 
int flags = cudaHostAllocDefault; ts5 = getTimeStamp(); fineTimeStamps[0] = read_timer(); error = cudaHostAlloc((void**)&Ahost,memSizeA,flags);if (error != cudaSuccess){printf("cudaHostMalloc Ahost returned error code %d, line(%d)\n", error, __LINE__);exit(EXIT_FAILURE);} error = cudaHostAlloc((void**)&Bhost,memSizeB,flags);if (error != cudaSuccess){printf("cudaHostMalloc Bhost returned error code %d, line(%d)\n", error, __LINE__);exit(EXIT_FAILURE);} error = cudaHostAlloc((void**)&Chost,memSizeC,flags);if (error != cudaSuccess){printf("cudaHostMalloc Chost returned error code %d, line(%d)\n", error, __LINE__);exit(EXIT_FAILURE);} // set local arrays randomInit(Ahost,Md*Nd); randomInit(Bhost,Nd); // allocate device memory float *Adevice,*Bdevice,*Cdevice; error = cudaMalloc((void**)&Adevice,memSizeA); if (error != cudaSuccess){printf("cudaMalloc Adevice returned error code %d, line(%d)\n", error, __LINE__);exit(EXIT_FAILURE);} error = cudaMalloc((void**)&Bdevice,memSizeB); if (error != cudaSuccess){printf("cudaMalloc Bdevice returned error code %d, line(%d)\n", error, __LINE__);exit(EXIT_FAILURE);} error = cudaMalloc((void**)&Cdevice,memSizeC); if (error != cudaSuccess){printf("cudaMalloc Cdevice returned error code %d, line(%d)\n", error, __LINE__);exit(EXIT_FAILURE);} fineTimeStamps[1] = read_timer(); ts6 = getTimeStamp(); #pragma omp critical { readyThreads += 1; } // fprintf(stderr,"Incremented ready GPU\n"); while (readyThreads < 2){sleep(1);fprintf(stderr,"Thread 0: %d\n",readyThreads);}; //#pragma omp single //{ cudaStream_t stream1; cudaStreamCreate ( &stream1) ; ts3 = getTimeStamp(); fineTimeStamps[2] = read_timer(); gTime = read_timer(); for (int i = 0; i < gpuOuter; i++) GPUsgemv(gpuInner,Md,Nd,Kd,Adevice,Bdevice,Cdevice,Ahost,Bhost,Chost,&stream1); cudaStreamSynchronize(stream1); gTime = read_timer() - gTime; fineTimeStamps[3] = read_timer(); ts4 = getTimeStamp(); cudaFreeHost(Ahost); cudaFreeHost(Bhost); cudaFreeHost(Chost); } else { // uint64 
min_iters = strtoull(argv[4],NULL,0); float *A = NULL; float *B = NULL; float *C = NULL; ts7 = getTimeStamp(); fineTimeStamps[4] = read_timer(); A = (float*) malloc( Mh * Nh * sizeof(float) ); B = (float*) malloc( Nh * sizeof(float) ); C = (float*) malloc( Mh * sizeof(float) ); randomInit(A,Mh*Nh); randomInit(B,Nh); fineTimeStamps[5] = read_timer(); ts8 = getTimeStamp(); #pragma omp critical { readyThreads += 1; } // fprintf(stderr,"Incremented ready CPU\n"); while (readyThreads < 2){sleep(1);fprintf(stderr,"Thread 1: %d\n",readyThreads);}; // open the msr files for each core on the machine for (i = 0; i < currentMachine.num_cores; i++) open_msr_file(i,&fd[i]); int socketsProgrammed = 0; for (i = 0; i < currentMachine.num_cores; i++) { int currentCoreFD = fd[i]; stopCounters(i, currentCoreFD, ¤tMachine, &session); programCoreFixedCounters(currentCoreFD); programGeneralPurposeRegisters(currentCoreFD, ¤tMachine, &session); /* Program the Uncore as desired...*/ // Only program the first physical core on each socket. // NOTE: Some assumptions about topology here...check /proc/cpuinfo to confirm. if (i % currentMachine.num_phys_cores_per_socket == 0 && socketsProgrammed < currentMachine.num_sockets) { programUncoreCounters( currentCoreFD, ¤tMachine, &session); socketsProgrammed++; } } seconds = 0.0; // start the programmed counters... 
for (i = 0; i < currentMachine.num_cores; i++) startCounters( i, fd[i], ¤tMachine, &session); /* READ COUNTERS BEFORE STUFF */ readCounters(fd,¤tMachine,&session, &before); ts1 = getTimeStamp(); fineTimeStamps[6] = read_timer(); seconds = read_timer(); /* DO STUFF */ for (i =0; i < num_iters; i++) BLASFUNC( CblasColMajor,CblasNoTrans,Mh,Nh, 1, A,Mh, B,1, 1, C,1 ); /* END DOING STUFF */ seconds = read_timer()-seconds; fineTimeStamps[7] = read_timer(); ts2 = getTimeStamp(); /* READ COUNTERS AFTER STUFF */ for (i = 0; i < currentMachine.num_cores; i++) stopCounters(i,fd[i],¤tMachine, &session); // printf("num_iters = %"PRIu64", runtime is %g\n",num_iters,seconds); readCounters(fd,¤tMachine,&session,&after); diffCounterData(¤tMachine, &session, &after, &before, &after); for (i = 0; i < currentMachine.num_sockets; i++) { // printf("Socket %d\n",i); for (j = 0; j < currentMachine.num_cores_per_socket; j++) { // printf("%d,",j); for (k = 0; k < session.core_gen_counter_num_used; k++){ // printf("%"PRIu64",",after.generalCore[i*currentMachine.num_cores_per_socket + j][k]); // bug in the indexing of the core sums??? // coreSums[i*session.core_gen_counter_num_used + k] += after.generalCore[i*currentMachine.num_cores_per_socket + j][k]; coreSums[k] += after.generalCore[i*currentMachine.num_cores_per_socket + j][k]; } // printf("\n"); } } for (i = 0; i < currentMachine.num_sockets; i++) { // printf("%d,",i); for (j = 0; j < currentMachine.num_cbos; j++) { // printf("%d,",j); for (k = 0; k < session.cbo_counter_num_used; k++) { // printf("%llu,",after.cboUncore[i*currentMachine.num_phys_cores_per_socket + j][k]); // bug in the indexing of the core sums??? 
// sums[i*session.cbo_counter_num_used + k] += after.cboUncore[i*currentMachine.num_phys_cores_per_socket + j][k]; sums[k] += after.cboUncore[i*currentMachine.num_phys_cores_per_socket + j][k]; } } } // printf("\n"); // Stop counters, reset PMU, close msr files cleanup(fd,¤tMachine,&session); free(A); free(B); free(C); } } // end parallel region printf("%s,%s,%s,%s,%s,%s,%s,%s,%d,%d,%d,%d,%d,%d,%d,%d,%d,%f,%f,%f,",ts7,ts8,ts1,ts2,ts5,ts6,ts3,ts4,Mh,Nh,Kh,Md/block_size,Nd/block_size,Kd/block_size,num_iters,gpuOuter,gpuInner,seconds,gTime,(float)(gpuOuter*(Md*Kd+Nd+Md))/16.0); for (int i = 0; i < 8; i++) printf("%f,",fineTimeStamps[i]); for (j = 0; j < session.core_gen_counter_num_used; j++) printf("%llu,",coreSums[j]); for (j = 0; j < session.cbo_counter_num_used; j++) if (j == session.cbo_counter_num_used-1) printf("%llu",sums[j]); else printf("%llu,",sums[j]); printf("\n"); free(sums); free(coreSums); return 0; }
/*****************************************************
 * System tick interrupt handler.
 *
 * Timer 5 supplies the main system clock and raises
 * this interrupt once every millisecond.
 *
 * Most of the real work is done in this handler.
 * With the sensors running and the straights profiler
 * active it takes about 220us, of which roughly 120us
 * is spent processing the sensors.
 *
 * Sequence per tick:
 *   1. clear the interrupt flag, update the counters
 *   2. radio receive handling when GDO0 is low
 *   3. radio transmit, or a radio reset at commCount 4
 *   4. cube sensors on even ticks
 *   5. shut everything down once tickCount > 300000
 *   6. buttons, line sensors, counters, profiler
 *   7. PID output, clamped to +/-MOTORS_MAX_DC, for the
 *      left and then the right motor
 *****************************************************/
void _ISR SYSTIM_INTERRUPT(void)
{
    int duty;
    unsigned char received;

    /* reset the interrupt flag */
    SYSTIM_IF = 0;

    tickCount++;
    millisecondCount--;
    commCount++;

    /* ---- receive side ---- */
    if (!GDO0) {
        received = CC2500_receive_packet();
        if (received > PACKET_LEN) {
            /* More bytes than a packet holds: reset the radio FIFOs and
             * disable communication until a good read. */
            CC2500_idle_mode();
            CC2500_clear_rx_fifo();
            CC2500_clear_tx_fifo();
            CC2500_receive_mode();
            commEnabled = FALSE;
        } else {
            commEnabled = TRUE;
        }

        if (!newPacket) {
            noCommCount++;
        } else {
            deassamble_packet();
            getFellowCoveredSqrs();
            my.obzClear = OBZ_clear();
            if (inOBZ(fellow.location)) {
                LED_ON;
            } else {
                LED_OFF;
            }
            dataUpdated = TRUE;
            newPacket = FALSE;
            noCommCount = 0;
        }
    }

    /* ---- transmit side / periodic radio reset ---- */
    if ((commCount >= 6) && commEnabled) {
        assamble_packet();
        if (!GDO0) {
            CC2500_transmit_packet();
            commCount = 0;
        }
    } else if (commCount == 4) {
        CC2500_idle_mode();
        __delay_us(1);
        CC2500_clear_rx_fifo();
        __delay_us(1);
        CC2500_clear_tx_fifo();
        __delay_us(1);
        CC2500_receive_mode();
    }

    /* cube sensors are sampled on even ticks only */
    if ((tickCount & 1) == 0) {
        readCubeSensors();
    }

    /* run-time limit reached: switch everything off */
    if (tickCount > 300000L) {
        motorsOff();
        sensorsOff();
        stopSystemTimer();
        LED_ON;
    }

    doButtons();
    readLineSensors();
    readCounters();
    doProfiler();

    /* left motor: PID output limited to [-MOTORS_MAX_DC, MOTORS_MAX_DC] */
    duty = doPID(&left_PID_param);
    if (duty < -MOTORS_MAX_DC) {
        duty = -MOTORS_MAX_DC;
    } else if (duty > MOTORS_MAX_DC) {
        duty = MOTORS_MAX_DC;
    }
    motorsLeftSetDutyCycle(duty);

    /* right motor: same limits */
    duty = doPID(&right_PID_param);
    if (duty < -MOTORS_MAX_DC) {
        duty = -MOTORS_MAX_DC;
    } else if (duty > MOTORS_MAX_DC) {
        duty = MOTORS_MAX_DC;
    }
    motorsRightSetDutyCycle(duty);
}