/* Returns the device count reported by _Offload_number_of_devices (),
   emitting the value through TRACE before handing it back.  */
extern "C" int
GOMP_OFFLOAD_get_num_devices (void)
{
  const int num_devices = _Offload_number_of_devices ();

  TRACE ("(): return %d", num_devices);
  return num_devices;
}
/*
 * Creates the host basis set and then a basis set on each offload device.
 *
 * The host object is created first via CInt_createBasisSet(); if that fails
 * its status is returned unchanged.  The number of devices reported by
 * _Offload_number_of_devices() is recorded in the host object, and
 * CInt_createBasisSet(&basis_mic) is then run on every device in turn.
 *
 * _basis : out, receives the host basis set.
 *
 * Returns CINT_STATUS_SUCCESS when the host and all devices succeed,
 * the host error status if host creation fails, or
 * CINT_STATUS_OFFLOAD_ERROR as soon as one device reports a failure.
 */
CIntStatus_t CInt_offload_createBasisSet (BasisSet_t * _basis)
{
    CIntStatus_t status = CInt_createBasisSet (_basis);
    if (status != CINT_STATUS_SUCCESS)
        return status;

    const int mic_numdevs = _Offload_number_of_devices ();
    (*_basis)->mic_numdevs = mic_numdevs;

    for (int dev = 0; dev < mic_numdevs; ++dev)
    {
        /* basis_mic is not copied; only the status comes back to the host */
        #pragma offload target(mic: dev) \
                nocopy(basis_mic) out(status)
        {
            status = CInt_createBasisSet (&basis_mic);
        }

        if (status != CINT_STATUS_SUCCESS)
            return CINT_STATUS_OFFLOAD_ERROR;
    }

    return CINT_STATUS_SUCCESS;
}
void offload_init(int *myrank, int *mydevice){ int i; intptr_t ptr; MYDEVICE = _Offload_number_of_devices(); if(MYDEVICE){ MYDEVICE = *myrank % _Offload_number_of_devices(); } *mydevice = MYDEVICE; for(i = 0; i < MAXSTREAM; i++){ STREAM[i] = 0; } // CURRENTSTREAM = STREAM; DBUFFER = DBUFFER_; #pragma offload_transfer target(mic:MYDEVICE) nocopy(DBUFFER_:alloc_if(1) free_if(0)) WAIT = 1337; SYNC = true; }
void Kmeans::init_centroids(const MatrixXdRowMajor& data_points, int k) { centroids = data_points.block(0,0,k,data_points.cols()); membership = VectorXd::Zero(data_points.rows()); points_per_centroid = VectorXd::Zero(k); #ifdef MIC omp_set_nested(true); mic_number_devices = _Offload_number_of_devices(); std::cout << "Number of MICs on the system: " << mic_number_devices << std::endl; mic_number_streamings = 2; double mic_workload = 0.8; int mic_data_points_count = (data_points.rows() * (mic_workload/(double)mic_number_devices)); mic_data_points_count -= mic_data_points_count % mic_number_streamings; host_initial_data_point = mic_number_devices * mic_data_points_count; mic_stream_membership_count = mic_data_points_count / mic_number_streamings; int membership_length = mic_stream_membership_count; int data_points_length = mic_data_points_count * data_points.cols(); int centroids_length = k * data_points.cols(); double * host_data_points = (double *) data_points.data(); for (int i=0; i<mic_number_devices; i++) { std::cout << "MIC" << i << ": " << " data count " << mic_data_points_count << std::endl; utils.tic("ALLOC"); #pragma offload_transfer target(mic:i)\ nocopy(mic_stream_membership_a[0:membership_length]: ALLOC)\ nocopy(mic_stream_membership_b[0:membership_length]: ALLOC)\ nocopy(mic_data_points[0:data_points_length]: ALLOC)\ nocopy(mic_centroids[0:centroids_length]: ALLOC) std::cout << "MIC" << i << ": " << " memmory allocated in " << utils.toc("ALLOC") << "secs" << std::endl; #pragma offload_transfer target(mic:i)\ in(host_data_points[0:data_points_length]: REUSE into(mic_data_points[0:data_points_length])) host_data_points += data_points_length; } #endif }
int main(int argc, char **argv) { FILE *fp, *fp2; char testName[32] = "PHI_TRANSFER_KEEP_NOAL_IN"; int micNum, tid; unsigned int i, j, size, localSize, NLOOP = NLOOP_PHI_MAX, NLOOP_PHI; unsigned int smin = MIN_PHI_SIZE, smed = MED_PHI_SIZE, smax = MAX_PHI_SIZE; double *f0, *f0_noal, *f1, *f1_noal; double timeMin, tStart, tElapsed[NREPS]; double tScale = USEC, bwScale = MB; double overhead, threshold_lo, threshold_hi; double tMin, tMax, tAvg, stdDev, bwMax, bwMin, bwAvg, bwDev; double UsedMem, localMax, msgBytes; double tMsg[NREPS], bwMsg[NREPS]; // Identify number of MIC devices micNum = _Offload_number_of_devices(); if( micNum == 0 ) fatalError( "No Xeon Phi devices found. Test Aborted." ); // Check for user defined limits checkEnvPHI( &NLOOP, &smin, &smed, &smax ); if( micNum == 1 ) UsedMem = (double)smax*sizeof(double); if( micNum == 2 ) UsedMem = (double)smax*2.0*sizeof(double); // Allocate and initialize test array srand( SEED ); f0 = doubleVector( smax+1 ); // This array is unaligned by exactly 8 bytes f0_noal = &f0[1]; // Check timer overhead in seconds timerTest( &overhead, &threshold_lo, &threshold_hi ); // Open output files and write headers fp = fopen( "mic0_keep_noal_time_in.dat", "a" ); fp2 = fopen( "mic0_keep_noal_bw_in.dat", "a" ); printHeaders( fp, fp2, testName, UsedMem, overhead, threshold_lo ); //================================================================ // Single loop with minimum size to verify that inner loop length // is long enough for the timings to be accurate //================================================================ // Warmup processor with a large size exchange // Since we will be reusing we want to make sure this exchange uses smax #pragma offload_transfer target(mic:0) in( f0_noal : length(smax) ALLOC KEEP ) // Test is current NLOOP is enough to capture fastest test cases tStart = benchTimer(); for(j = 0; j < NLOOP; j++){ #pragma offload_transfer target(mic:0) in( f0_noal : length(smin) REUSE KEEP ) } timeMin = 
benchTimer() - tStart; resetInnerLoop( timeMin, threshold_lo, &NLOOP ); // Let's save this info in case we have more than one Phi device NLOOP_PHI = NLOOP; //================================================================ // Execute test for each requested size //================================================================ localSize = smin; localMax = 0.0; for( size = smin; size <= smax; size = size*2 ){ // Copy array to Phi (read/write test) for( i = 0; i < NREPS; i++){ tStart = benchTimer(); for(j = 0; j < NLOOP; j++){ #pragma offload_transfer target(mic:0) in( f0_noal : length(size) REUSE KEEP ) } tElapsed[i] = benchTimer() - tStart; } msgBytes = (double)( size*sizeof(double)); post_process( fp, fp2, threshold_hi, tElapsed, tScale, bwScale, size, msgBytes, msgBytes, &NLOOP, &localMax, &localSize ); } // Print completion message printSummary( fp2, testName, localMax, localSize ); fclose( fp2 ); fclose( fp ); if( micNum == 2 ){ // Allocate and initialize test array for second Phi coprocessor (mic:1) f1 = doubleVector(smax+1); f1_noal = &f1[1]; // Open output files and write headers fp = fopen( "mic1_keep_noal_time_in.dat", "a" ); fp2 = fopen( "mic1_keep_noal_bw_in.dat", "a" ); printHeaders( fp, fp2, testName, UsedMem, overhead, threshold_lo ); //================================================================ // Single loop with minimum size to verify that inner loop length // is long enough for the timings to be accurate //================================================================ // Warmup processor with a large size exchange // Since we will be reusing we want to make sure this exchanges uses smax #pragma offload_transfer target(mic:1) in( f1_noal : length(smax) ALLOC KEEP ) // Reset innermost loop to safe value and local quantities to defaults NLOOP = NLOOP_PHI; localSize = smin; localMax = 0.0; //================================================================ // Execute test for each requested size 
//================================================================ for( size = smin; size <= smax; size = size*2 ){ // Copy array to Phi (read/write test) for( i = 0; i < NREPS; i++){ tStart = benchTimer(); for(j = 0; j < NLOOP; j++){ #pragma offload_transfer target(mic:1) in( f1_noal : length(size) REUSE KEEP ) } tElapsed[i] = benchTimer() - tStart; } msgBytes = (double)( size*sizeof(double)); post_process( fp, fp2, threshold_hi, tElapsed, tScale, bwScale, size, msgBytes, msgBytes, &NLOOP, &localMax, &localSize ); } // Print completion message printSummary( fp2, testName, localMax, localSize ); fclose( fp2 ); fclose( fp ); //------- TESTING SIMULTANEOUS DATA TRANSFER TO BOTH PHI DEVICES ------ // Open output files and write headers fp = fopen( "mic0+1_keep_noal_time_in.dat", "a" ); fp2 = fopen( "mic0+1_keep_noal_bw_in.dat", "a" ); printHeaders( fp, fp2, testName, UsedMem, overhead, threshold_lo ); // Warmup processor with a medium size exchange #pragma offload_transfer target(mic:0) in( f0_noal : length(smed) REUSE KEEP ) #pragma offload_transfer target(mic:1) in( f1_noal : length(smed) REUSE KEEP ) // Reset innermost loop to safe value and local quantities to defaults NLOOP = NLOOP_PHI; localSize = smin; localMax = 0.0; //================================================================ // Execute test for each requested size //================================================================ for( size = smin; size <= smax; size = size*2 ){ for( i = 0; i < NREPS; i++){ tStart = benchTimer(); #pragma omp parallel private(j,tid) num_threads(2) { tid = omp_get_thread_num(); if( tid == 0 ){ for(j = 0; j < NLOOP; j++){ #pragma offload_transfer target(mic:0) in( f0_noal : length(size) REUSE KEEP ) } } if( tid == 1 ){ for(j = 0; j < NLOOP; j++){ #pragma offload_transfer target(mic:1) in( f1_noal : length(size) REUSE KEEP ) } } } tElapsed[i] = 0.5*( benchTimer() - tStart ); } msgBytes = (double)( size*sizeof(double)); post_process( fp, fp2, threshold_hi, tElapsed, tScale, 
bwScale, size, msgBytes, msgBytes, &NLOOP, &localMax, &localSize ); } // Print completion message printSummary( fp2, testName, localMax, localSize ); fclose( fp2 ); fclose( fp ); } free( f0 ); if( micNum == 2 ) free( f1 ); return 0; }
int util_mic_get_num_devices_() { /* only smp master does call, then bcast */ #define SIZE_GROUP 256 MPI_Group wgroup_handle,group_handle; MPI_Comm group_comm; int err,i,ranks[SIZE_GROUP]; int my_smp_master=util_my_smp_master(); int size_group=util_cgetppn(); if(mic_get_num_initialized) { return num_mic_devs; }else{ if(util_my_smp_index() == 0) { num_mic_devs=_Offload_number_of_devices(); #ifdef DEBUG char *myhostname = (char *) malloc (MAXGETHOSTNAME); if(num_mic_devs != DBG_NUM_DEVS){ gethostname(myhostname, sizeof(myhostname) ); printf(" me %d hostname %s num_mic_devs %d \n", GA_Nodeid(), myhostname, num_mic_devs); if(num_mic_devs != DBG_NUM_DEVS){ num_mic_devs=2; printf(" me %d reset hostname %s set num_mic_devs %d \n", GA_Nodeid(), myhostname, num_mic_devs); // GA_Error("wrong number of MIC devs", (long) num_mic_devs); }else{ printf(" me %d 2nd try hostname %s correct num_mic_devs %d \n", GA_Nodeid(), myhostname, num_mic_devs); free(myhostname); } } #endif } /*get world group handle to be used later */ err=MPI_Comm_group(MPI_COMM_WORLD, &wgroup_handle); if (err != MPI_SUCCESS) { fprintf(stdout,"util_getppn: MPI_Comm_group failed\n"); GA_Error("util_getppn error", 0L); } for (i=0; i< size_group; i++) ranks[i] = i + my_smp_master; /* create new group of size size_group */ err=MPI_Group_incl(wgroup_handle, size_group, ranks, &group_handle); if (err != MPI_SUCCESS) { fprintf(stdout,"util_micdevs: MPI_Group_incl failed\n"); GA_Error("util_micdevs error", 0L); fflush(stdout); } /* Create new new communicator for the newly created group */ err=MPI_Comm_create(MPI_COMM_WORLD, group_handle, &group_comm); if (err != MPI_SUCCESS) { fprintf(stdout,"util_micdevs: MPI_Comm_group failed\n"); GA_Error("util_micdevs error", 0L); } err= MPI_Bcast(&num_mic_devs, 1, MPI_INT, 0, group_comm); if (err != MPI_SUCCESS) { fprintf(stdout,"util_mics: MPI_Bcast failed\n"); fflush(stdout); GA_Error("util_mic_get_num_devices error", 0L); } /*flush group and comm*/ 
err=MPI_Group_free(&group_handle); if (err != MPI_SUCCESS) { fprintf(stdout,"util_micdevs: MPI_Group_free failed\n"); GA_Error("util_micdevs error", 0L); } err=MPI_Comm_free(&group_comm); if (err != MPI_SUCCESS) { fprintf(stdout,"util_micdevs: MPI_Comm_free failed\n"); GA_Error("util_micdevs error", 0L); } mic_get_num_initialized = 1; return num_mic_devs; } }