예제 #1
0
void my_papi_profile(void *(*kernel)(void *))
{
	my_papi_init();
	readCounters(events, 5, kernel);
	readCounters(events2, 3, kernel);
	readCounters(events3, 3, kernel);
	readCounters(events4, 3, kernel);
	readCounters(events5, 1, kernel);
	my_papi_finalize();
}
예제 #2
0
파일: ltu.c 프로젝트: juskoa/trigger
/*FGROUP SHM
rc: 0: ok
    1: error reading counters
Operation:
action: read all counters + temperature
*/
int readCNTS2SHM() {
w32 secs, mics;
if(shmcnts==NULL) return(1);
/* for(ix=0; ix<10; ix++) {
  shmcnts[ix]=ix;
  //printf("readCNTS2SHMix:%d\n",ix);
};
return(0); */
GetMicSec(&secs, &mics);
/*
ctpc[CSTART_SPEC+2]= vmer32(L2_ORBIT_READ);
readCounters(ctpc, NCOUNTERS, 0); readTVCounters(&ctpc[CSTART_SPEC+3]);
//printf("readctpcounters: readCounters ok\n");
ctpc[CSTART_SPEC]= secs; ctpc[CSTART_SPEC+1]= mics;
//ctpc[13]= getCounter(1,13); ctpc[CSTART_L1+5]= getCounter(2,5);
*/
readCounters(shmcnts, LTUNCOUNTERS, 0);  // LTU counters
shmcnts[temperaturerp]= ReadTemperature();
shmcnts[epochsecsrp]= secs;
shmcnts[epochmicsrp]= mics;
return(0);
}
예제 #3
0
int main(int argc, char *argv[]) {
  int i,j,k;
  char *ts1,*ts2;
  machineInformation currentMachine;
  counterSessionInfo session;
  double seconds = 0.0;

  // Set machine information from CounterHomeBrew.h
  currentMachine.cpu_model = CPU_MODEL;
  currentMachine.num_sockets = NUM_SOCKETS;
  currentMachine.num_phys_cores_per_socket = NUM_PHYS_CORES_PER_SOCKET;
  currentMachine.num_cores_per_socket = NUM_CORES_PER_SOCKET;
  currentMachine.num_cores = NUM_CORES;
  currentMachine.num_cbos = NUM_PHYS_CORES_PER_SOCKET;
  currentMachine.core_gen_counter_num_max = CORE_GEN_COUNTER_MAX;
  currentMachine.cbo_counter_num_max = CBO_COUNTER_NUM_MAX;

  // NHM-EX
  session.core_gen_counter_num_used = 0;
  int32 core_event_numbers[] = {};
  int32 core_umasks[] = {};

  session.cbo_counter_num_used = 1;
  int32 cbo_event_numbers[] = {0x14};
  int32 cbo_umasks[] = {0x7};

  // JKT
  /*
  session.core_gen_counter_num_used = 5;
  int32 core_event_numbers[] = {0x10,0x10,0x11,0x51,0xF1};
  int32 core_umasks[] = {0x80,0x10,0x02,0x01, 0x07};

  session.cbo_counter_num_used = 1;
  int32 cbo_event_numbers[] = {0x37};
  int32 cbo_umasks[] = {0xf};
  session.cbo_filter = 0x1f;
  */
  for (i = 0; i < session.core_gen_counter_num_used; i++) {
    session.core_event_numbers[i] = core_event_numbers[i];
    session.core_umasks[i] = core_umasks[i];
  }
  for (i = 0; i < session.cbo_counter_num_used; i++) {
    session.cbo_event_numbers[i] = cbo_event_numbers[i];
    session.cbo_umasks[i] = cbo_umasks[i];
  }

  int fd[NUM_CORES];

  // Arrays to hold counter data...
  counterData before;
  counterData after;

  // some data for doing a naive matmul to test flop counting...
  // initloop(N);
  
  uint64 min_iters = 1;
  double minRuntime = 10.0;
  int M = atoi(argv[1]);
  int K = atoi(argv[2]);
  double *A = NULL;
  double *b = NULL;
  double *c = NULL;
  //  posix_memalign((void**)A,64,M*K*sizeof(double));
  // posix_memalign((void**)B,64,K*N*sizeof(double));
  // posix_memalign((void**)C,64,M*N*sizeof(double));
  A = (double*) malloc( M * K * sizeof(double) );
  b = (double*) malloc( K * sizeof(double) );
  c = (double*) malloc( M * sizeof(double) );      
  fill( A, M * K );
  fill( b, K );
  fill( c, M );
  

 // open the msr files for each core on the machine
  for (i = 0; i < currentMachine.num_cores; i++)
    open_msr_file(i,&fd[i]);

  // warm up da caches...
  doNaiveMatVec(M,K, A, b, c);
  // Program the counters!!!
  int socketsProgrammed = 0;
  for (i = 0; i < currentMachine.num_cores; i++) {
    int currentCoreFD = fd[i];
    
    /* clear global control register before programming */
    stopCounters(i, currentCoreFD, &currentMachine, &session);

    /* set up the fixed counters on each core */
    programCoreFixedCounters(currentCoreFD);
    
    /* set up the general purpose registers for each core */
    programGeneralPurposeRegisters(currentCoreFD, &currentMachine, &session);

    /* Program the Uncore as desired...*/
    // Only program the first physical core on each socket. 
    // NOTE: Some assumptions about topology here...check /proc/cpuinfo to confirm.
 #if CPU_MODEL == JAKETOWN
    if (i % currentMachine.num_phys_cores_per_socket == 0 && socketsProgrammed < currentMachine.num_sockets)
#elif CPU_MODEL == NEHALEM_EX
    if (i < currentMachine.num_sockets && socketsProgrammed < currentMachine.num_sockets)
#elif CPU_MODEL == IVY_BRIDGE
    if (i < currentMachine.num_sockets && socketsProgrammed < currentMachine.num_sockets)
#endif
      {
	programUncoreCounters( currentCoreFD, &currentMachine, &session);
	socketsProgrammed++;
      }

    /* set global control register to active counters */
    //    startCounters( i, currentCoreFD, &currentMachine, &session);
  }
  
  uint64 num_iters;
  for (num_iters = min_iters; seconds < minRuntime; num_iters *=2) {
    if (num_iters != min_iters) {
      free(ts1);
      free(ts2);
    }
    sleep(5);
    seconds = 0.0;

    // start the programmed counters...
    for (i = 0; i < currentMachine.num_cores; i++)
      startCounters( i, fd[i], &currentMachine, &session);
    
    /* READ COUNTERS BEFORE STUFF */
    readCounters(fd,&currentMachine,&session, &before);
    ts1 = getTimeStamp();
    seconds = read_timer();

    /* DO STUFF */    
    for (i =0; i < num_iters; i++)
      doNaiveMatVec(M,K, A, b, c);
    /* END DOING STUFF */

    seconds = read_timer()-seconds;
    ts2 = getTimeStamp();

    /* READ COUNTERS AFTER STUFF */    
    for (i = 0; i < currentMachine.num_cores; i++)
      stopCounters(i,fd[i],&currentMachine, &session);
    
  }
  num_iters /= 2;

  readCounters(fd,&currentMachine,&session,&after);
  diffCounterData(&currentMachine, &session, &after, &before, &after);

  uint64 *coreSums;
  coreSums = (uint64*)calloc(currentMachine.num_sockets*session.core_gen_counter_num_used,sizeof(uint64));
  for (i = 0; i < currentMachine.num_sockets; i++) {
    for (j = 0; j < currentMachine.num_cores_per_socket; j++) {
      for (k = 0; k < session.core_gen_counter_num_used; k++)
	//        coreSums[i*session.core_gen_counter_num_used + k] += after.generalCore[i*currentMachine.num_cores_per_socket + j][k];
        coreSums[k] += after.generalCore[i*currentMachine.num_cores_per_socket + j][k];
    }
  }

  uint64 *sums;
  sums = (uint64*)calloc(currentMachine.num_sockets*session.cbo_counter_num_used,sizeof(uint64));
  for (i = 0; i < currentMachine.num_sockets; i++) {
    for (j = 0; j < currentMachine.num_cbos; j++) {
      for (k = 0; k < session.cbo_counter_num_used; k++)
	//        sums[i*session.cbo_counter_num_used + k] += after.cboUncore[i*currentMachine.num_phys_cores_per_socket + j][k];
        sums[k] += after.cboUncore[i*currentMachine.num_phys_cores_per_socket + j][k];
    }
  }

  // only print data from first socket and core
  printf("%s,%s,%"PRIu64",%d,%d,%d,%f,",ts1,ts2,num_iters,M,K,K,seconds/(double)num_iters);
    for (j = 0; j < session.core_gen_counter_num_used; j++)
      //      printf("%"PRIu64",",after.generalCore[0][j]);
      printf("%f,",coreSums[j]/(double)num_iters);
    for (j = 0; j < session.cbo_counter_num_used; j++)
      printf("%f,",sums[j]/(double)num_iters);
    printf("\n");

  free(sums);
  free(coreSums);

  // Stop counters, reset PMU, close msr files
  cleanup(fd,&currentMachine,&session);

    
  free(A);
  free(b);
  free(c);
  
  return 0;
}
예제 #4
0
int main(int argc, char *argv[]) {
  int i,j,k;
  machineInformation currentMachine;
  counterSessionInfo session;

  initializeCUDA();

  // Set machine information from CounterHomeBrew.h
  currentMachine.cpu_model = CPU_MODEL;
  currentMachine.num_sockets = NUM_SOCKETS;
  currentMachine.num_phys_cores_per_socket = NUM_PHYS_CORES_PER_SOCKET;
  currentMachine.num_cores_per_socket = NUM_CORES_PER_SOCKET;
  currentMachine.num_cores = NUM_CORES;
  currentMachine.num_cbos = NUM_PHYS_CORES_PER_SOCKET; // should multiply by NUM_SOCKETS???
  currentMachine.core_gen_counter_num_max = CORE_GEN_COUNTER_MAX;
  currentMachine.cbo_counter_num_max = CBO_COUNTER_NUM_MAX;

  // Set session events, umasks and counters used
  //  int32 core_event_numbers[] = {FP_COMP_OPS_EXE_EVTNR,SIMD_FP_256_EVTNR,0x51,0xF1,0x80};
  // int32 core_umasks[] = {FP_COMP_OPS_EXE_SCALAR_DOUBLE_UMASK,SIMD_FP_256_PACKED_DOUBLE_UMASK,0x01, 0x07,0x01};

  session.core_gen_counter_num_used = 5;
  int32 core_event_numbers[] = {0x10,0x10,0x11,0x51,0xF1};
  int32 core_umasks[] = {0x20,0x40,0x01,0x01, 0x07};

  session.cbo_counter_num_used = 1;
  int32 cbo_event_numbers[] = {0x37};
  int32 cbo_umasks[] = {0xf};
  session.cbo_filter = 0x1f;

  for (i = 0; i < session.core_gen_counter_num_used; i++) {
    session.core_event_numbers[i] = core_event_numbers[i];
    session.core_umasks[i] = core_umasks[i];
  }
  for (i = 0; i < session.cbo_counter_num_used; i++) {
    session.cbo_event_numbers[i] = cbo_event_numbers[i];
    session.cbo_umasks[i] = cbo_umasks[i];
  }

  int fd[NUM_CORES];

  // Arrays to hold counter data...
  counterData before;
  counterData after;

  // some data for doing a naive matmul to test flop counting...
  // initloop(N);
  

  // M,N,K are multiples of the block size....
  int gpuOuter = atoi(argv[1]);
  int gpuInner = atoi(argv[2]);
  int cpuInner = atoi(argv[3]);
  double minRuntime = atoi(argv[4]);
  int Md = atoi(argv[5])*block_size;
  int Nd = atoi(argv[6])*block_size;
  int Kd = atoi(argv[7])*block_size;
  int Mh = atoi(argv[8]);
  int Nh = atoi(argv[9]);
  int Kh = atoi(argv[10]);

  char *ts1,*ts2,*ts3,*ts4;
  char *ts5,*ts6,*ts7,*ts8;
  double fineTimeStamps[8];
  double gTime = 0.0;
  double cTime = 0.0;
  double seconds = 0.0;
  int num_iters;

  uint64 *coreSums;
  coreSums = (uint64*)calloc(currentMachine.num_sockets*session.core_gen_counter_num_used,sizeof(uint64));

  uint64 *sums;
  sums = (uint64*)calloc(currentMachine.num_sockets*session.cbo_counter_num_used,sizeof(uint64));

  float *Atmp = NULL;
  float *Btmp = NULL;
  float *Ctmp = NULL;
  Atmp = (float*) malloc( Mh * Nh * sizeof(float) );
  Btmp = (float*) malloc( Nh * sizeof(float) );
  Ctmp = (float*) malloc( Mh * sizeof(float) );
  randomInit(Atmp,Mh*Nh);
  randomInit(Btmp,Nh);

  for (num_iters = cpuInner; seconds < minRuntime; num_iters *=2) {
    seconds = 0.0;	
    for (i =0; i < num_iters; i++)
      BLASFUNC( CblasColMajor,CblasNoTrans,Mh,Nh, 1, Atmp,Mh, Btmp,1, 1, Ctmp,1 );
    seconds = read_timer()-seconds;
  }
  //  num_iters /= 2;

  free(Atmp);
  free(Btmp);
  free(Ctmp);

  int readyThreads = 0;
  #pragma omp parallel
  {
    int threadNum = omp_get_thread_num();
    int numThreads = omp_get_num_threads();
    assert(numThreads==2);
    if (threadNum == 0) {
      cudaError_t error;
      int memSizeA = sizeof(float)*Md*Nd;
      int memSizeB = sizeof(float)*Nd;
      int memSizeC = sizeof(float)*Md;
      
      float *Ahost,*Bhost,*Chost;
      // use pinned memory on the host for BW and asynch memory transfers..
      int flags = cudaHostAllocDefault;
      ts5 = getTimeStamp();
      fineTimeStamps[0] = read_timer();
      error = cudaHostAlloc((void**)&Ahost,memSizeA,flags);if (error != cudaSuccess){printf("cudaHostMalloc Ahost returned error code %d, line(%d)\n", error, __LINE__);exit(EXIT_FAILURE);}
      error = cudaHostAlloc((void**)&Bhost,memSizeB,flags);if (error != cudaSuccess){printf("cudaHostMalloc Bhost returned error code %d, line(%d)\n", error, __LINE__);exit(EXIT_FAILURE);}
      error = cudaHostAlloc((void**)&Chost,memSizeC,flags);if (error != cudaSuccess){printf("cudaHostMalloc Chost returned error code %d, line(%d)\n", error, __LINE__);exit(EXIT_FAILURE);}
      // set local arrays
      randomInit(Ahost,Md*Nd);
      randomInit(Bhost,Nd);

      // allocate device memory
      float *Adevice,*Bdevice,*Cdevice;
      error = cudaMalloc((void**)&Adevice,memSizeA); if (error != cudaSuccess){printf("cudaMalloc Adevice returned error code %d, line(%d)\n", error, __LINE__);exit(EXIT_FAILURE);}
      error = cudaMalloc((void**)&Bdevice,memSizeB); if (error != cudaSuccess){printf("cudaMalloc Bdevice returned error code %d, line(%d)\n", error, __LINE__);exit(EXIT_FAILURE);}
      error = cudaMalloc((void**)&Cdevice,memSizeC); if (error != cudaSuccess){printf("cudaMalloc Cdevice returned error code %d, line(%d)\n", error, __LINE__);exit(EXIT_FAILURE);}
      fineTimeStamps[1] = read_timer();
      ts6 = getTimeStamp();
#pragma omp critical
      {
	readyThreads += 1;
      }
      //     fprintf(stderr,"Incremented ready GPU\n");
      while (readyThreads < 2){sleep(1);fprintf(stderr,"Thread 0: %d\n",readyThreads);};

      //#pragma omp single 
      //{
      cudaStream_t stream1;
      cudaStreamCreate ( &stream1) ;
      ts3 = getTimeStamp();
      fineTimeStamps[2] = read_timer();
      gTime = read_timer();
      for (int i = 0; i < gpuOuter; i++) 
	GPUsgemv(gpuInner,Md,Nd,Kd,Adevice,Bdevice,Cdevice,Ahost,Bhost,Chost,&stream1);
      cudaStreamSynchronize(stream1);
      gTime = read_timer() - gTime;
      fineTimeStamps[3] = read_timer();
      ts4 = getTimeStamp();
      cudaFreeHost(Ahost);
      cudaFreeHost(Bhost);
      cudaFreeHost(Chost);

    } else {
      //  uint64 min_iters = strtoull(argv[4],NULL,0);
      float *A = NULL;
      float *B = NULL;
      float *C = NULL;
      ts7 = getTimeStamp();
      fineTimeStamps[4] = read_timer();
      A = (float*) malloc( Mh * Nh * sizeof(float) );
      B = (float*) malloc( Nh * sizeof(float) );
      C = (float*) malloc( Mh * sizeof(float) );
      randomInit(A,Mh*Nh);
      randomInit(B,Nh);
      fineTimeStamps[5] = read_timer();
      ts8 = getTimeStamp();
#pragma omp critical
      {
	readyThreads += 1;
      }
      //   fprintf(stderr,"Incremented ready CPU\n");
      while (readyThreads < 2){sleep(1);fprintf(stderr,"Thread 1: %d\n",readyThreads);};
                  
      // open the msr files for each core on the machine
      for (i = 0; i < currentMachine.num_cores; i++)
	open_msr_file(i,&fd[i]);
      
      
      int socketsProgrammed = 0;
      for (i = 0; i < currentMachine.num_cores; i++) {
	int currentCoreFD = fd[i];
	
	stopCounters(i, currentCoreFD, &currentMachine, &session);
	programCoreFixedCounters(currentCoreFD);    
	programGeneralPurposeRegisters(currentCoreFD, &currentMachine, &session);
	
	/* Program the Uncore as desired...*/
	// Only program the first physical core on each socket. 
	// NOTE: Some assumptions about topology here...check /proc/cpuinfo to confirm.
	if (i % currentMachine.num_phys_cores_per_socket == 0 && socketsProgrammed < currentMachine.num_sockets) {
	  programUncoreCounters( currentCoreFD, &currentMachine, &session);
	  socketsProgrammed++;
	}
      }
      
      seconds = 0.0;
      
      // start the programmed counters...
      for (i = 0; i < currentMachine.num_cores; i++)
	startCounters( i, fd[i], &currentMachine, &session);
      
      /* READ COUNTERS BEFORE STUFF */
      readCounters(fd,&currentMachine,&session, &before);
      ts1 = getTimeStamp();
      fineTimeStamps[6] = read_timer();
      seconds = read_timer();
      
      /* DO STUFF */    
      for (i =0; i < num_iters; i++)
	BLASFUNC( CblasColMajor,CblasNoTrans,Mh,Nh, 1, A,Mh, B,1, 1, C,1 );
      
      /* END DOING STUFF */
      
      seconds = read_timer()-seconds;
      fineTimeStamps[7] = read_timer();
      ts2 = getTimeStamp();
      
      /* READ COUNTERS AFTER STUFF */    
      for (i = 0; i < currentMachine.num_cores; i++)
	stopCounters(i,fd[i],&currentMachine, &session);
      
      //  printf("num_iters = %"PRIu64", runtime is %g\n",num_iters,seconds);
      
      readCounters(fd,&currentMachine,&session,&after);
      diffCounterData(&currentMachine, &session, &after, &before, &after);
      
      for (i = 0; i < currentMachine.num_sockets; i++) {
	//    printf("Socket %d\n",i);
	for (j = 0; j < currentMachine.num_cores_per_socket; j++) {
	  //   printf("%d,",j);
	  for (k = 0; k < session.core_gen_counter_num_used; k++){
	    //	printf("%"PRIu64",",after.generalCore[i*currentMachine.num_cores_per_socket + j][k]);
	    // bug in the indexing of the core sums???
	    //        coreSums[i*session.core_gen_counter_num_used + k] += after.generalCore[i*currentMachine.num_cores_per_socket + j][k];
	    coreSums[k] += after.generalCore[i*currentMachine.num_cores_per_socket + j][k];
	  }
	  //	printf("\n");
	}
      }
      
      for (i = 0; i < currentMachine.num_sockets; i++) {
	//	printf("%d,",i);
	for (j = 0; j < currentMachine.num_cbos; j++) {
	  //	  printf("%d,",j);
	  for (k = 0; k < session.cbo_counter_num_used; k++) {
	    //	    printf("%llu,",after.cboUncore[i*currentMachine.num_phys_cores_per_socket + j][k]);
	    // bug in the indexing of the core sums???
	    //        sums[i*session.cbo_counter_num_used + k] += after.cboUncore[i*currentMachine.num_phys_cores_per_socket + j][k];
	    sums[k] += after.cboUncore[i*currentMachine.num_phys_cores_per_socket + j][k];
	  }
	}
      }
      //      printf("\n");
            
      // Stop counters, reset PMU, close msr files
      cleanup(fd,&currentMachine,&session);
      
      
      free(A);
      free(B);
      free(C);
    }
  } // end parallel region

  printf("%s,%s,%s,%s,%s,%s,%s,%s,%d,%d,%d,%d,%d,%d,%d,%d,%d,%f,%f,%f,",ts7,ts8,ts1,ts2,ts5,ts6,ts3,ts4,Mh,Nh,Kh,Md/block_size,Nd/block_size,Kd/block_size,num_iters,gpuOuter,gpuInner,seconds,gTime,(float)(gpuOuter*(Md*Kd+Nd+Md))/16.0);
  for (int i = 0; i < 8; i++)
    printf("%f,",fineTimeStamps[i]);

  for (j = 0; j < session.core_gen_counter_num_used; j++)
    printf("%llu,",coreSums[j]);
  for (j = 0; j < session.cbo_counter_num_used; j++)
    if (j == session.cbo_counter_num_used-1)
      printf("%llu",sums[j]);
    else
      printf("%llu,",sums[j]);
  printf("\n");
  
  free(sums);
  free(coreSums);
  
  return 0;
}
예제 #5
0
/*****************************************************
 *  Timer 5 is used to provide the main system clock
 *  It generates an interrupt every millisecond
 *
 *  This ISR is where most of the actual work gets done.
 *  with the sensors running and the straights profiler
 *  active, this interrupt takes about 220us of which
 *  120us is processing the sensors
 *****************************************************/
void _ISR SYSTIM_INTERRUPT(void)
{
	int pidOut;
	unsigned char rxBytes;
	/* reset the interrupt flag */
	SYSTIM_IF = 0;
	//LED_ON;
	tickCount++;      
	millisecondCount--;
	commCount++;
	
	if(!GDO0)
	{
		rxBytes = CC2500_receive_packet();
		if(rxBytes>PACKET_LEN)
		{
			CC2500_idle_mode();
			CC2500_clear_rx_fifo();
			CC2500_clear_tx_fifo();
			CC2500_receive_mode();
			commEnabled = FALSE;
		}
		else
			commEnabled = TRUE;
		if(newPacket)
		{
			deassamble_packet();
			getFellowCoveredSqrs();
			my.obzClear = OBZ_clear();
			if(inOBZ(fellow.location))
			{
				LED_ON;
			}
			else
			{
				LED_OFF;
			}
			dataUpdated = TRUE;
			newPacket = FALSE;
			noCommCount = 0;
		}
		else
			noCommCount++;
	}
	if((commCount>=6)&&commEnabled)
	{
		assamble_packet();
		if(!GDO0)
		{
			CC2500_transmit_packet();
			commCount = 0;
		}
	}
	else if(commCount==4)
	{
		CC2500_idle_mode();
		__delay_us(1);
		CC2500_clear_rx_fifo();
		__delay_us(1);
		CC2500_clear_tx_fifo();
		__delay_us(1);
		CC2500_receive_mode();
	}
	if(!(tickCount&1))
		readCubeSensors();
	
	if(tickCount>300000L)
	{
		motorsOff();
		sensorsOff();
		stopSystemTimer();
		LED_ON;
	}
	doButtons();  
	readLineSensors();
	readCounters();
	doProfiler();
	
	pidOut = doPID( &left_PID_param);
	if( pidOut < -MOTORS_MAX_DC )
		pidOut = -MOTORS_MAX_DC;
	else if( pidOut > MOTORS_MAX_DC )
		pidOut = MOTORS_MAX_DC;
	motorsLeftSetDutyCycle(pidOut);
	
	pidOut = doPID( &right_PID_param);
	if( pidOut < -MOTORS_MAX_DC )
		pidOut = -MOTORS_MAX_DC;
	else if( pidOut > MOTORS_MAX_DC )
		pidOut = MOTORS_MAX_DC;
	motorsRightSetDutyCycle(pidOut);
	
	//LED_OFF;
	
}