void omp_report_mask(){
   int nthrds, thrd;              // Thread info
   int ncpus, nel_set;
   static int **proc_mask;
   int i, j, ierr;
   char *dummy;

   thrd   = omp_get_thread_num();
   nthrds = omp_get_num_threads();
   ncpus  = (int) sysconf(_SC_NPROCESSORS_ONLN);

   if(omp_get_num_procs() != ncpus){
      printf("ERROR: ncpus_by_omp=%d, ncpus_sched=%d\n", omp_get_num_procs(), ncpus);
      exit(1);
   }

   #pragma omp single
   {
      proc_mask = malloc(sizeof(int*)*nthrds);
      for(i=0;i<nthrds;i++) proc_mask[i] = malloc(sizeof(int)*ncpus);
      for(i=0;i<nthrds;i++) for(j=0;j<ncpus;j++) proc_mask[i][j] = 0;
   }

   ierr = boundto(&nel_set, proc_mask[thrd]);

   #pragma omp barrier
   #pragma omp single
   {
      print_mask(1, dummy, 0, 0, 0, ncpus, nthrds, 1, proc_mask[thrd]);     // print header
      for(thrd=0; thrd<nthrds; thrd++){
         print_mask(0, dummy, 0, thrd, 0, ncpus, nthrds, 1, proc_mask[thrd]);
      }
   }
}
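As a usage sketch (a hypothetical driver, not part of the listing above), omp_report_mask() is meant to be called by every thread of a parallel region in a pure OpenMP code, with boundto() and print_mask() assumed to be compiled and linked alongside it:

// Hypothetical test driver for omp_report_mask(); assumes boundto() and
// print_mask() are linked in with the routine above.
#include <omp.h>
void omp_report_mask(void);

int main(void){
   #pragma omp parallel
   {
      omp_report_mask();   // each thread records its affinity mask; one thread prints the table
   }
   return 0;
}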
int hybrid_report_mask(){

   // General
   int i, j, ierr;
   int id, rid, tid;
   int in_mpi, in_omp;
   int thrd, nthrds;
   int ncpus, nel_set;

   // Mask storage
   static int **omp_proc_mask;
   static int  *omp_mask_pac;
   char *dummy;

   // MPI-specific variables
   int rank, nranks;
   MPI_Request *request;
   MPI_Status  *status;
   static int   multi_node = 0;
   static char *all_names;
   static int   max_name_len;
   int  name_len;
   char proc_name[MPI_MAX_PROCESSOR_NAME];

   char l, p;
   int  tpc;                       // hwthreads/core

   Maskopts opts;
   // Get print speed, fast or slow (f|s); listing by cores or SMT threads (c|s)
   p = opts.get_p();
   l = opts.get_l();

   tpc = get_threads_per_node();

   // In MPI and in a parallel region?
   MPI_Initialized(&in_mpi);
   in_omp = omp_in_parallel();

   if(in_mpi == 0){
      printf("ERROR: ***** Must call hybrid_report_mask() in MPI program. ***** \n");
      exit(1);
   }

   // Get rank number & number of ranks via MPI
   MPI_Comm_rank(MPI_COMM_WORLD, &rank);
   MPI_Comm_size(MPI_COMM_WORLD, &nranks);

   if(in_omp == 0){
      if(rank == 0){
         printf(" ***** When using 1 thread, Intel OpenMP MAY report "
                "\"not in a parallel region\" (Uh!) ***** \n");
         printf(" ***** Each row will only have a rank number (no \"0\" thread_id). \n");
         printf("WARNING: ***** Unspecified results if hybrid_report_mask "
                "not called in parallel region of MPI code section. ***** \n");
      }
   }

   thrd   = omp_get_thread_num();     // Thread id
   nthrds = omp_get_num_threads();    // Number of threads

   // Get number of cpus (this gives no. of cpu_ids in /proc/cpuinfo)
   ncpus = (int) sysconf(_SC_NPROCESSORS_ONLN);

   // Working only with MPI processes (masters)
   #pragma omp master
   {
      // Get a list of nodes from all ranks.
      MPI_Get_processor_name(proc_name, &name_len);
      MPI_Allreduce(&name_len, &max_name_len, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);
      all_names = (char *) malloc(sizeof(int*)*nranks*(max_name_len+1));
      MPI_Gather(proc_name, max_name_len+1, MPI_CHAR,
                 all_names, max_name_len+1, MPI_CHAR, 0, MPI_COMM_WORLD);

      // If multiple nodes, make multi_node non-zero.
      if(rank == 0){
         for(id=0; id<nranks; id++){
            if( strcmp(&all_names[id*(max_name_len+1)], &all_names[0]) ) multi_node++;
         }
      }

      // Create shared storage for masks (only master allocates)
      omp_proc_mask = (int **) malloc(sizeof(int*)*nthrds);
      for(i=0;i<nthrds;i++) omp_proc_mask[i] = (int *) malloc(sizeof(int)*ncpus);
      for(i=0;i<nthrds;i++) for(j=0;j<ncpus;j++) omp_proc_mask[i][j] = 0;
   }
   #pragma omp barrier

   #pragma omp critical      // (boundto may not be thread safe)
   ierr = boundto(&nel_set, omp_proc_mask[thrd]);
   #pragma omp barrier

   #pragma omp master
   {
      // Packing space for MPI send/recv of all ranks' thread masks
      omp_mask_pac = (int *) malloc(sizeof(int)*nranks*nthrds*ncpus);

      if(rank == 0){
         request = (MPI_Request *) malloc(sizeof(MPI_Request)*nranks);
         status  = (MPI_Status  *) malloc(sizeof(MPI_Status )*nranks);

         print_mask(1, dummy, multi_node, 0, 0, ncpus, nranks, nthrds,
                    omp_proc_mask[0], tpc, l);                             // print header
         fflush(stdout);

         for(tid=0; tid<nthrds; tid++){
            print_mask(0, &all_names[tid*(max_name_len+1)], multi_node, 0, tid,
                       ncpus, nranks, nthrds, omp_proc_mask[tid], tpc, l);
         }
         fflush(stdout);

         // Receive the other ranks' packed mask arrays
         for(rid=1; rid<nranks; rid++){
            MPI_Irecv(&omp_mask_pac[rid*nthrds*ncpus], nthrds*ncpus, MPI_INT,
                      rid, 99, MPI_COMM_WORLD, &request[rid-1]);
         }
         MPI_Waitall(nranks-1, &request[0], &status[0]);

         // Print for each rank
         for(rid=1; rid<nranks; rid++){
            for(tid=0; tid<nthrds; tid++){
               print_mask(0, &all_names[tid*(max_name_len+1)], multi_node, rid, tid,
                          ncpus, nranks, nthrds,
                          &omp_mask_pac[rid*nthrds*ncpus + tid*ncpus], tpc, l);
               if(p == 's') ierr = usleep(300000);
            }
         }

         if(nranks*nthrds > 50)
            print_mask(2, dummy, multi_node, 0, 0, ncpus, nranks, nthrds,
                       omp_proc_mask[0], tpc, l);                          // print header again for long listings
         fflush(stdout);
      }       // end root printing
      else{   // all non-root ranks
         // Pack up this rank's mask arrays (Uh, should have made one array from beginning!)
         for(tid=0; tid<nthrds; tid++){
            for(id=0; id<ncpus; id++) omp_mask_pac[(tid*ncpus)+id] = omp_proc_mask[tid][id];
            if(p == 's') ierr = usleep(300000);
         }
         // Send to root
         MPI_Send(omp_mask_pac, nthrds*ncpus, MPI_INT, 0, 99, MPI_COMM_WORLD);
      }       // end non-root packing/sending

      // Return allocated space
      for(i=0;i<nthrds;i++) free(omp_proc_mask[i]);
      free(omp_proc_mask);
      free(omp_mask_pac);
      if(rank == 0){ free(request); free(status); }
      free(all_names);
   }  // end of master

   #pragma omp barrier   // JIC, so that all threads leave at the same time.
}
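A minimal sketch of how this hybrid routine would be invoked (a hypothetical driver; it assumes boundto(), print_mask(), get_threads_per_node(), and the Maskopts class are linked in, so the unit is compiled as C++). MPI must already be initialized, and the call is made from inside a parallel region so every thread contributes its mask:

// Hypothetical hybrid driver: MPI is initialized first, then every thread of
// the parallel region calls hybrid_report_mask(); rank 0 collects and prints.
#include <mpi.h>
#include <omp.h>
int hybrid_report_mask(void);

int main(int argc, char **argv){
   MPI_Init(&argc, &argv);
   #pragma omp parallel
   {
      hybrid_report_mask();
   }
   MPI_Finalize();
   return 0;
}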
int hybrid_report_mask(void){

   int thrd, nthrds;
   int rank, nranks;
   static int multi_node = 0;
   int ncpus, nel_set;

   static int **omp_proc_mask;
   static int  *omp_mask_pac;
   char *dummy;

   char proc_name[MPI_MAX_PROCESSOR_NAME];
   static char *all_names;
   int name_len;
   static int max_name_len;

   // General
   int i, j, ierr;
   int id, rid, tid;
   int in_mpi, in_omp;

   // Mask storage
   int **proc_mask;
   static int *all_masks = 0;

   MPI_Initialized(&in_mpi);
   in_omp = omp_in_parallel();

   if(in_mpi != 0 && in_omp == 0){

      // Get number of cpus (this gives no. of cpu_ids in /proc/cpuinfo),
      // and get the rank number & number of ranks via MPI.
      ncpus = (int) sysconf(_SC_NPROCESSORS_ONLN);
      MPI_Comm_rank(MPI_COMM_WORLD, &rank);
      MPI_Comm_size(MPI_COMM_WORLD, &nranks);

      // Create and zero out a 2-D array for the mask, proc_mask[rank][ncpus];
      // for simplicity the size is [ncpus][ncpus], thinking ahead for hybrid code.
      // proc_mask could have been a single array (proc_mask[ncpus]); keeping the
      // 2-D form is a hold-over from the OpenMP version that holds everything for
      // all threads.  For MPI, a contiguous collection array (all_masks) is used.
      proc_mask = malloc(sizeof(int*)*ncpus);
      for(i=0;i<ncpus;i++) proc_mask[i] = malloc(sizeof(int)*ncpus);
      for(i=0;i<ncpus;i++) for(j=0;j<ncpus;j++) proc_mask[i][j] = 0;

      all_masks = (int *) malloc(sizeof(int)*ncpus*ncpus);

      // Get the map for this processor
      ierr = boundto(&nel_set, proc_mask[rank]);

      // Gather the information to rank 0
      MPI_Gather(proc_mask[rank], ncpus, MPI_INT,
                 all_masks,       ncpus, MPI_INT, 0, MPI_COMM_WORLD);

      // Get a list of nodes from all ranks.
      MPI_Get_processor_name(proc_name, &name_len);
      MPI_Allreduce(&name_len, &max_name_len, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);
      all_names = malloc(sizeof(int*)*nranks*(max_name_len+1));
      MPI_Gather(proc_name, max_name_len+1, MPI_CHAR,
                 all_names, max_name_len+1, MPI_CHAR, 0, MPI_COMM_WORLD);

      // If multiple nodes, make multi_node not equal to 0.
      if(rank == 0)
         for(id=0; id<nranks; id++){
            if( strcmp(&all_names[id*(max_name_len+1)], &all_names[0]) ) multi_node++;
         }
   }  // End of pure MPI part

   if(in_mpi != 0 && in_omp != 0){

      if(all_masks == 0){
         printf("ERROR: ***** You must call hybrid_report_mask() in a pure MPI region first. ***** \n");
         exit(1);
      }

      thrd   = omp_get_thread_num();
      nthrds = omp_get_num_threads();
      ncpus  = (int) sysconf(_SC_NPROCESSORS_ONLN);

      #pragma omp single
      {
         omp_proc_mask = malloc(sizeof(int*)*nthrds);
         for(i=0;i<nthrds;i++) omp_proc_mask[i] = malloc(sizeof(int)*ncpus);
         for(i=0;i<nthrds;i++) for(j=0;j<ncpus;j++) omp_proc_mask[i][j] = 0;
      }

      #pragma omp critical
      ierr = boundto(&nel_set, omp_proc_mask[thrd]);
      #pragma omp barrier

      MPI_Comm_size(MPI_COMM_WORLD, &nranks);
      MPI_Comm_rank(MPI_COMM_WORLD, &rank);

      #pragma omp master
      {
         // Packing space for MPI send/recv of one rank's thread masks
         omp_mask_pac = (int *) malloc(sizeof(int)*nthrds*ncpus);

         if(rank == 0){
            print_mask(1, dummy, multi_node, 0, 0, ncpus, nthrds, nranks,
                       omp_proc_mask[0]);                                 // print header
            fflush(stdout);

            for(tid=0; tid<nthrds; tid++){
               print_mask(0, &all_names[tid*(max_name_len+1)], multi_node, 0, tid,
                          ncpus, nthrds, nranks, omp_proc_mask[tid]);
            }
            fflush(stdout);

            // Receive the other ranks' packed mask arrays and print them
            for(rid=1; rid<nranks; rid++){
               MPI_Recv(omp_mask_pac, nthrds*ncpus, MPI_INT, rid, 99,
                        MPI_COMM_WORLD, MPI_STATUS_IGNORE);
               for(tid=0; tid<nthrds; tid++){
                  print_mask(0, &all_names[rid*(max_name_len+1)], multi_node, rid, tid,
                             ncpus, nthrds, nranks, &omp_mask_pac[tid*ncpus]);
               }
               fflush(stdout);
            }  // rank loop
         }      // end root printing
         else{  // all non-root ranks send to root
            // Pack up this rank's mask arrays (Uh, should have made one array from beginning!)
            for(tid=0; tid<nthrds; tid++){
               for(id=0; id<ncpus; id++) omp_mask_pac[(tid*ncpus)+id] = omp_proc_mask[tid][id];
            }
            // Send to root
            MPI_Send(omp_mask_pac, nthrds*ncpus, MPI_INT, 0, 99, MPI_COMM_WORLD);
         }  // end non-root sending

         MPI_Barrier(MPI_COMM_WORLD);
      }  // end of master

      #pragma omp barrier
   }  // end of OpenMP part
}
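For this second version, the check on all_masks implies a two-phase calling convention. A hypothetical driver (again assuming boundto() and print_mask() are linked in) would call the routine once from the pure MPI part of the code and then again from inside a parallel region:

// Hypothetical two-phase driver: the first call (outside any parallel region)
// runs the pure MPI part that gathers node names and rank masks; the second
// call (inside the parallel region) runs the OpenMP part, and rank 0 prints.
#include <mpi.h>
#include <omp.h>
int hybrid_report_mask(void);

int main(int argc, char **argv){
   MPI_Init(&argc, &argv);

   hybrid_report_mask();        // pure MPI pass (in_omp == 0)

   #pragma omp parallel
   {
      hybrid_report_mask();     // hybrid pass (in_omp != 0)
   }

   MPI_Finalize();
   return 0;
}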