/* Report the device count returned by the offload runtime.  The value is
   traced before it is handed back to the caller.  */
extern "C" int
GOMP_OFFLOAD_get_num_devices (void)
{
  const int num_devices = _Offload_number_of_devices ();
  TRACE ("(): return %d", num_devices);
  return num_devices;
}
Exemple #2
0
/*
 * Create a basis set on the host and then one on every device reported by
 * the offload runtime.
 *
 * _basis  receives the host-side basis set; its mic_numdevs field is set to
 *         the device count.
 *
 * Returns the host creation status when that step fails,
 * CINT_STATUS_OFFLOAD_ERROR when creation fails on any device, and
 * CINT_STATUS_SUCCESS otherwise.
 */
CIntStatus_t CInt_offload_createBasisSet (BasisSet_t * _basis)
{
    CIntStatus_t status;
    int device_count;
    int dev;

    /* Host side first; a failure here is reported unchanged. */
    status = CInt_createBasisSet (_basis);
    if (status != CINT_STATUS_SUCCESS)
        return status;

    device_count = _Offload_number_of_devices ();
    (*_basis)->mic_numdevs = device_count;

    /* Build the device-resident basis_mic on each device in turn; only the
       status value is copied back from the device. */
    for (dev = 0; dev < device_count; ++dev)
    {
        #pragma offload target(mic: dev) nocopy(basis_mic) out(status)
        {
            status = CInt_createBasisSet (&basis_mic);
        }
        if (status != CINT_STATUS_SUCCESS)
            return CINT_STATUS_OFFLOAD_ERROR;
    }

    return CINT_STATUS_SUCCESS;
}
/*
 * Pick the offload device for this process and initialise the offload
 * globals (MYDEVICE, STREAM, DBUFFER, WAIT, SYNC).
 *
 * myrank    in:  rank of the calling process; ranks are spread over the
 *                devices by taking the rank modulo the device count.
 * mydevice  out: the chosen device index (0 when the runtime reports no
 *                devices).
 */
void offload_init(int *myrank, int *mydevice){
    int i;
    /* Ask the runtime once and reuse the answer for both the test and the modulo. */
    const int num_devices = _Offload_number_of_devices();

    MYDEVICE = num_devices ? (*myrank % num_devices) : 0;
    *mydevice = MYDEVICE;

    for(i = 0; i < MAXSTREAM; i++){
        STREAM[i] = 0;
    }
//  CURRENTSTREAM = STREAM;
    DBUFFER = DBUFFER_;
    /* Allocate DBUFFER_ on the selected device without transferring data,
       and keep the allocation alive afterwards (alloc_if(1) free_if(0)). */
    #pragma offload_transfer target(mic:MYDEVICE) nocopy(DBUFFER_:alloc_if(1) free_if(0))
    WAIT = 1337;
    SYNC = true;
}
Exemple #4
0
/**
 * Seed the clustering state and, when built with MIC defined, allocate the
 * coprocessor buffers and upload each device's slice of the input.
 *
 * The first k rows of data_points become the initial centroids; membership
 * gets one zero entry per input row and points_per_centroid one per centroid.
 *
 * @param data_points  row-major input matrix.
 * @param k            number of centroids.
 */
void Kmeans::init_centroids(const MatrixXdRowMajor& data_points, int k)
{
    centroids = data_points.block(0,0,k,data_points.cols());
    membership = VectorXd::Zero(data_points.rows());
    points_per_centroid = VectorXd::Zero(k);

    #ifdef MIC
    omp_set_nested(true);
    mic_number_devices = _Offload_number_of_devices();
    std::cout << "Number of MICs on the system: " << mic_number_devices << std::endl;

    mic_number_streamings = 2;

    // With no device the per-device share below would divide by zero and then
    // convert an infinite value to int (undefined behaviour). Leave every
    // point to the host instead.
    if (mic_number_devices <= 0)
    {
        host_initial_data_point = 0;
        mic_stream_membership_count = 0;
        return;
    }

    // Fraction of all points handed to the coprocessors, split evenly among them.
    double mic_workload = 0.8;
    int mic_data_points_count = static_cast<int>(
        data_points.rows() * (mic_workload / static_cast<double>(mic_number_devices)));
    // Round down so the per-device share divides evenly across the streams.
    mic_data_points_count -= mic_data_points_count % mic_number_streamings;
    // Points from this index onward are not uploaded to any device.
    host_initial_data_point = mic_number_devices * mic_data_points_count;
    mic_stream_membership_count = mic_data_points_count / mic_number_streamings;
    int membership_length = mic_stream_membership_count;
    int data_points_length = mic_data_points_count * data_points.cols();
    int centroids_length = k * data_points.cols();
    // The offload clause needs a non-const pointer; the host data is only read.
    double * host_data_points = const_cast<double *>(data_points.data());
    for (int i=0; i<mic_number_devices; i++)
    {
        std::cout << "MIC" << i << ": " << " data count " << mic_data_points_count << std::endl;
        utils.tic("ALLOC");
        #pragma offload_transfer target(mic:i)\
        nocopy(mic_stream_membership_a[0:membership_length]: ALLOC)\
        nocopy(mic_stream_membership_b[0:membership_length]: ALLOC)\
        nocopy(mic_data_points[0:data_points_length]: ALLOC)\
        nocopy(mic_centroids[0:centroids_length]: ALLOC)
        std::cout << "MIC" << i << ": " << " memmory allocated in " << utils.toc("ALLOC") << "secs" << std::endl;
        #pragma offload_transfer target(mic:i)\
        in(host_data_points[0:data_points_length]: REUSE into(mic_data_points[0:data_points_length]))
        // Advance to the next device's slice of the input.
        host_data_points += data_points_length;
    }
    #endif
}
/*
 * Host-to-coprocessor transfer benchmark ("PHI_TRANSFER_KEEP_NOAL_IN").
 *
 * Times repeated offload_transfer "in" operations of an array that starts
 * one double past the allocation returned by doubleVector(), for sizes
 * smin, 2*smin, ... up to smax, and appends results to .dat files:
 *   - always for mic:0;
 *   - when exactly two devices are present, also for mic:1 and for both
 *     devices driven concurrently from two OpenMP threads.
 *
 * argc/argv are not used. Returns 0 on completion; a missing device or an
 * unopenable output file is reported through fatalError().
 */
int main(int argc, char **argv)
{
    FILE    *fp, *fp2;
    char    testName[32] = "PHI_TRANSFER_KEEP_NOAL_IN";
    int     micNum, tid;
    unsigned int i, j, size, localSize, NLOOP = NLOOP_PHI_MAX, NLOOP_PHI;
    unsigned int smin = MIN_PHI_SIZE, smed = MED_PHI_SIZE, smax = MAX_PHI_SIZE;
    double  *f0, *f0_noal, *f1, *f1_noal;
    double timeMin, tStart, tElapsed[NREPS];
    double tScale = USEC, bwScale = MB;
    double overhead, threshold_lo, threshold_hi;
    double tMin, tMax, tAvg, stdDev, bwMax, bwMin, bwAvg, bwDev;
    double UsedMem, localMax, msgBytes;
    double tMsg[NREPS], bwMsg[NREPS];

    // Identify number of MIC devices
    micNum = _Offload_number_of_devices();
    if( micNum == 0 ) fatalError( "No Xeon Phi devices found. Test Aborted." );

    // Check for user defined limits
    checkEnvPHI( &NLOOP, &smin, &smed, &smax );
    // One smax-sized array is always used (mic:0); a second one is allocated
    // only when exactly two devices are present. Computing it this way keeps
    // UsedMem defined for every device count, not just 1 and 2.
    UsedMem = (double)smax*sizeof(double);
    if( micNum == 2 ) UsedMem *= 2.0;

    // Allocate and initialize test array
    srand( SEED );
    f0 = doubleVector( smax+1 );
    // This array is unaligned by exactly 8 bytes
    f0_noal = &f0[1];

    // Check timer overhead in seconds
    timerTest( &overhead, &threshold_lo, &threshold_hi );

    // Open output files and write headers
    fp  = fopen( "mic0_keep_noal_time_in.dat", "a" );
    fp2 = fopen( "mic0_keep_noal_bw_in.dat", "a" );
    if( fp == NULL || fp2 == NULL ) fatalError( "Unable to open mic0 output files. Test Aborted." );
    printHeaders( fp, fp2, testName, UsedMem, overhead, threshold_lo );

    //================================================================
    // Single loop with minimum size to verify that inner loop length
    // is long enough for the timings to be accurate
    //================================================================
    // Warmup processor with a large size exchange
    // Since we will be reusing we want to make sure this exchange uses smax
    #pragma offload_transfer target(mic:0) in( f0_noal : length(smax) ALLOC KEEP )
    // Test if current NLOOP is enough to capture fastest test cases
    tStart = benchTimer();
    for(j = 0; j < NLOOP; j++){
        #pragma offload_transfer target(mic:0) in( f0_noal : length(smin) REUSE KEEP )
    }
    timeMin = benchTimer() - tStart;
    resetInnerLoop( timeMin, threshold_lo, &NLOOP );
    // Let's save this info in case we have more than one Phi device
    NLOOP_PHI = NLOOP;

    //================================================================
    // Execute test for each requested size
    //================================================================
    localSize = smin;
    localMax  = 0.0;
    for( size = smin; size <= smax; size = size*2 ){

        // Copy array to Phi (read/write test)
        for( i = 0; i < NREPS; i++){
            tStart = benchTimer();
            for(j = 0; j < NLOOP; j++){
                #pragma offload_transfer target(mic:0) in( f0_noal : length(size) REUSE KEEP )
            }
            tElapsed[i] = benchTimer() - tStart;
        }
        msgBytes = (double)( size*sizeof(double));
        post_process( fp, fp2, threshold_hi, tElapsed, tScale, bwScale, size,
                      msgBytes, msgBytes, &NLOOP, &localMax, &localSize );
    }
    // Print completion message
    printSummary( fp2, testName, localMax, localSize );
    fclose( fp2 );
    fclose( fp );

    if( micNum == 2 ){

        // Allocate and initialize test array for second Phi coprocessor (mic:1)
        f1 = doubleVector(smax+1);
        f1_noal = &f1[1];

        // Open output files and write headers
        fp  = fopen( "mic1_keep_noal_time_in.dat", "a" );
        fp2 = fopen( "mic1_keep_noal_bw_in.dat", "a" );
        if( fp == NULL || fp2 == NULL ) fatalError( "Unable to open mic1 output files. Test Aborted." );
        printHeaders( fp, fp2, testName, UsedMem, overhead, threshold_lo );

        //================================================================
        // Single loop with minimum size to verify that inner loop length
        // is long enough for the timings to be accurate
        //================================================================
        // Warmup processor with a large size exchange
        // Since we will be reusing we want to make sure this exchange uses smax
        #pragma offload_transfer target(mic:1) in( f1_noal : length(smax) ALLOC KEEP )
        // Reset innermost loop to safe value and local quantities to defaults
        NLOOP = NLOOP_PHI;
        localSize = smin;
        localMax  = 0.0;

        //================================================================
        // Execute test for each requested size
        //================================================================
        for( size = smin; size <= smax; size = size*2 ){

            // Copy array to Phi (read/write test)
            for( i = 0; i < NREPS; i++){
                tStart = benchTimer();
                for(j = 0; j < NLOOP; j++){
                    #pragma offload_transfer target(mic:1) in( f1_noal : length(size) REUSE KEEP )
                }
                tElapsed[i] = benchTimer() - tStart;
            }
            msgBytes = (double)( size*sizeof(double));
            post_process( fp, fp2, threshold_hi, tElapsed, tScale, bwScale, size,
                          msgBytes, msgBytes, &NLOOP, &localMax, &localSize );
        }
        // Print completion message
        printSummary( fp2, testName, localMax, localSize );
        fclose( fp2 );
        fclose( fp );

        //------- TESTING SIMULTANEOUS DATA TRANSFER TO BOTH PHI DEVICES ------

        // Open output files and write headers
        fp  = fopen( "mic0+1_keep_noal_time_in.dat", "a" );
        fp2 = fopen( "mic0+1_keep_noal_bw_in.dat", "a" );
        if( fp == NULL || fp2 == NULL ) fatalError( "Unable to open mic0+1 output files. Test Aborted." );
        printHeaders( fp, fp2, testName, UsedMem, overhead, threshold_lo );

        // Warmup processor with a medium size exchange
        #pragma offload_transfer target(mic:0) in( f0_noal : length(smed) REUSE KEEP )
        #pragma offload_transfer target(mic:1) in( f1_noal : length(smed) REUSE KEEP )
        // Reset innermost loop to safe value and local quantities to defaults
        NLOOP = NLOOP_PHI;
        localSize = smin;
        localMax  = 0.0;

        //================================================================
        // Execute test for each requested size
        //================================================================
        for( size = smin; size <= smax; size = size*2 ){

            for( i = 0; i < NREPS; i++){
                tStart = benchTimer();
                // Two threads, one per device: thread 0 feeds mic:0 and
                // thread 1 feeds mic:1.
                #pragma omp parallel private(j,tid) num_threads(2)
                {
                    tid = omp_get_thread_num();
                    if( tid == 0 ){
                        for(j = 0; j < NLOOP; j++){
                            #pragma offload_transfer target(mic:0) in( f0_noal : length(size) REUSE KEEP )
                        }
                    }
                    if( tid == 1 ){
                        for(j = 0; j < NLOOP; j++){
                            #pragma offload_transfer target(mic:1) in( f1_noal : length(size) REUSE KEEP )
                        }
                    }
                }
                // Elapsed time is halved here; presumably to report per-device
                // cost for two concurrent transfers -- TODO confirm.
                tElapsed[i] = 0.5*( benchTimer() - tStart );
            }
            msgBytes = (double)( size*sizeof(double));
            post_process( fp, fp2, threshold_hi, tElapsed, tScale, bwScale, size,
                          msgBytes, msgBytes, &NLOOP, &localMax, &localSize );
        }
        // Print completion message
        printSummary( fp2, testName, localMax, localSize );
        fclose( fp2 );
        fclose( fp );

    }

    free( f0 );
    if( micNum == 2 ) free( f1 );
    return 0;
}
/*
 * Return the number of MIC devices for this node.
 *
 * Only the process whose util_my_smp_index() is 0 queries the offload
 * runtime. The value is then broadcast (root 0) over a temporary
 * communicator built from the util_cgetppn() consecutive world ranks that
 * start at util_my_smp_master(). The result is cached in num_mic_devs and
 * later calls return it directly once mic_get_num_initialized is set.
 *
 * MPI failures and an oversized group are reported through GA_Error().
 */
int util_mic_get_num_devices_() {
  /* only smp master does call, then bcast */
#define SIZE_GROUP 256
  MPI_Group wgroup_handle,group_handle;
  MPI_Comm group_comm;
  int err,i,ranks[SIZE_GROUP];
  int my_smp_master=util_my_smp_master();
  int size_group=util_cgetppn();

  if(mic_get_num_initialized) {
    return num_mic_devs;
  }

  /* ranks[] is a fixed-size array: refuse group sizes that would overrun it. */
  if (size_group > SIZE_GROUP) {
    fprintf(stdout,"util_micdevs: group size %d exceeds SIZE_GROUP %d\n", size_group, SIZE_GROUP);
    fflush(stdout);
    GA_Error("util_micdevs error", (long) size_group);
  }

  if(util_my_smp_index() == 0) {
    num_mic_devs=_Offload_number_of_devices();
#ifdef DEBUG
    if(num_mic_devs != DBG_NUM_DEVS){
      /* The original re-tested the same condition here, so its "2nd try"
         branch (the only place the buffer was freed) could never run. */
      char *myhostname = (char *) malloc (MAXGETHOSTNAME);
      if (myhostname != NULL) {
        /* Pass the buffer size, not sizeof(pointer), and force termination
           in case the name was truncated. */
        if (gethostname(myhostname, MAXGETHOSTNAME) != 0) myhostname[0] = '\0';
        myhostname[MAXGETHOSTNAME - 1] = '\0';
      }
      printf(" me %d hostname %s num_mic_devs %d \n", GA_Nodeid(), myhostname ? myhostname : "", num_mic_devs);
      num_mic_devs=2;
      printf(" me %d reset hostname %s set num_mic_devs %d \n", GA_Nodeid(), myhostname ? myhostname : "", num_mic_devs);
      //   GA_Error("wrong number of MIC devs", (long) num_mic_devs);
      free(myhostname);
    }
#endif
  }

  /*get world group handle to be used later */
  err=MPI_Comm_group(MPI_COMM_WORLD, &wgroup_handle);
  if (err != MPI_SUCCESS) {
    fprintf(stdout,"util_micdevs: MPI_Comm_group failed\n");
    fflush(stdout);
    GA_Error("util_micdevs error", 0L);
  }
  for (i=0; i< size_group; i++) ranks[i] = i + my_smp_master;

  /* create new group of size size_group */
  err=MPI_Group_incl(wgroup_handle, size_group, ranks, &group_handle);
  if (err != MPI_SUCCESS) {
    fprintf(stdout,"util_micdevs: MPI_Group_incl failed\n");
    fflush(stdout);
    GA_Error("util_micdevs error", 0L);
  }

  /* Create new communicator for the newly created group */
  err=MPI_Comm_create(MPI_COMM_WORLD, group_handle, &group_comm);
  if (err != MPI_SUCCESS) {
    fprintf(stdout,"util_micdevs: MPI_Comm_create failed\n");
    fflush(stdout);
    GA_Error("util_micdevs error", 0L);
  }

  /* ranks[0] is my_smp_master, so root 0 of group_comm is the smp master. */
  err= MPI_Bcast(&num_mic_devs, 1, MPI_INT, 0, group_comm);
  if (err != MPI_SUCCESS) {
    fprintf(stdout,"util_mics: MPI_Bcast failed\n");
    fflush(stdout);
    GA_Error("util_mic_get_num_devices error", 0L);
  }

  /*flush groups and comm*/
  err=MPI_Group_free(&group_handle);
  if (err != MPI_SUCCESS) {
    fprintf(stdout,"util_micdevs: MPI_Group_free failed\n");
    fflush(stdout);
    GA_Error("util_micdevs error", 0L);
  }

  /* The world group handle obtained above must be released as well. */
  err=MPI_Group_free(&wgroup_handle);
  if (err != MPI_SUCCESS) {
    fprintf(stdout,"util_micdevs: MPI_Group_free failed\n");
    fflush(stdout);
    GA_Error("util_micdevs error", 0L);
  }

  err=MPI_Comm_free(&group_comm);
  if (err != MPI_SUCCESS) {
    fprintf(stdout,"util_micdevs: MPI_Comm_free failed\n");
    fflush(stdout);
    GA_Error("util_micdevs error", 0L);
  }

  mic_get_num_initialized = 1;
  return num_mic_devs;
}