Example #1
void omp_report_mask(){

int nthrds, thrd;  //Thread info

int ncpus, nel_set;
static int ** proc_mask;
int i,j, ierr;
char *dummy = NULL;   // placeholder name argument for print_mask

   thrd   =  omp_get_thread_num();
   nthrds =  omp_get_num_threads();
   ncpus  =  (int) sysconf(_SC_NPROCESSORS_ONLN);
   

   if(omp_get_num_procs() != ncpus){
         printf("ERROR: ncpus_by_omp=%d, ncpus_sched=%d\n",omp_get_num_procs(),ncpus);
         exit(1);
   }


   #pragma omp single
   {                                     // one thread allocates the shared mask storage and zeroes it
      proc_mask =                           malloc(sizeof(int*)*nthrds);
      for(i=0;i<nthrds;i++) proc_mask[i] =  malloc(sizeof(int)*ncpus  );
      for(i=0;i<nthrds;i++) for(j=0;j<ncpus;j++) proc_mask[i][j] =0;
   }

   ierr = boundto(&nel_set,proc_mask[thrd]);   // each thread records its own affinity mask

   #pragma omp barrier
   #pragma omp single
   {
          print_mask(1,  dummy,  0,     0,0,   ncpus, nthrds,1, proc_mask[thrd]);  //print header
      for(thrd=0;thrd<nthrds;thrd++){
         print_mask(0,   dummy,  0,  thrd,0,   ncpus, nthrds,1, proc_mask[thrd]);
      }
   }
   
}
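
The function above assumes the project's helper routines boundto() and print_mask() are linked in, and it must be called by every thread of an active parallel region because it relies on omp single and the barrier. A minimal, hypothetical driver (the main() below is not part of the example) could look like this:

// Hypothetical driver for omp_report_mask(); boundto()/print_mask() must be
// provided by the surrounding project at link time.
#include <omp.h>

void omp_report_mask(void);          // defined above

int main(void)
{
   #pragma omp parallel              // every thread participates in the report
   {
      omp_report_mask();
   }
   return 0;
}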
Example #2
int hybrid_report_mask(){

                        // General
int i,j,ierr;
int id, rid,tid;
int in_mpi, in_omp;
int thrd, nthrds;
int ncpus, nel_set;

                        // Mask storage
static int ** omp_proc_mask;
static int  * omp_mask_pac;
char *dummy = NULL;    // placeholder name argument for the header print_mask calls

                       // MPI specific Variables
int rank, nranks;
MPI_Request *request; 
MPI_Status  *status;

static int   multi_node = 0;
static char *all_names;
static int   max_name_len;
     int   name_len;
     char  proc_name[MPI_MAX_PROCESSOR_NAME];

char l,p;
int  tpc;   // hwthreads/core

   Maskopts opts;
                          // get print_speed fast or slow (f|s);   listing cores or SMT (c|s)
   p = opts.get_p();
   l = opts.get_l();

   tpc=get_threads_per_node();
                                         // In MPI and parallel region ?
   MPI_Initialized(&in_mpi);
   in_omp = omp_in_parallel();
   if(in_mpi == 0){
     printf("ERROR: ***** Must call hybrid_report_mask() in MPI program. ***** \n");
     exit(1);
   }

                                        // Get rank number & no of ranks via MPI
   MPI_Comm_rank(MPI_COMM_WORLD, &rank);
   MPI_Comm_size(MPI_COMM_WORLD, &nranks);

   if(in_omp == 0){
     if(rank == 0){
        printf("         ***** When using 1 thread, Intel OpenMP MAY report "
                              "\"not in a parallel region\" (Uh!)***** \n");
        printf("         ***** Each row will only have a rank number (no \"0\" thread_id). \n");
        printf("WARNING: ***** Unspecified results if hybrid_report_mask "
                              "not called in parallel region of MPI code section. ***** \n");
     }
   }
 
   thrd  =  omp_get_thread_num();        // thread id
   nthrds =  omp_get_num_threads();      // Number of Threads
                                         // Get number of cpus (this gives no. 
                                         // of cpu_ids in /proc/cpuinfo)
   ncpus = (int) sysconf(_SC_NPROCESSORS_ONLN);
 
 
                                  // Working only with MPI processes (masters)
   #pragma omp master
   {
                                  // Get a list of nodes from all ranks.
     MPI_Get_processor_name(proc_name,&name_len);
     MPI_Allreduce(&name_len, &max_name_len, 1,MPI_INT, MPI_MAX, MPI_COMM_WORLD);
     all_names = (char *) malloc(sizeof(char)*nranks*(max_name_len+1));
     MPI_Gather( proc_name, max_name_len+1 , MPI_CHAR,
                 all_names, max_name_len+1, MPI_CHAR,
                 0, MPI_COMM_WORLD);
  
                                  // If multiple nodes, make multi_node non-zero.
     if(rank == 0){
       for(id=0;id<nranks;id++){
         if( strcmp(&all_names[id*(max_name_len+1)],&all_names[0]) ) multi_node++; }

     }
   
                                  // Create shared storage for masks (only master allocates)
 
     omp_proc_mask =                          (int **) malloc(sizeof(int*)*nthrds);
     for(i=0;i<nthrds;i++) omp_proc_mask[i] = (int * ) malloc(sizeof(int )*ncpus  );
     for(i=0;i<nthrds;i++) for(j=0;j<ncpus;j++) omp_proc_mask[i][j] =0;
 
   }
   #pragma omp barrier

   #pragma omp critical           // (boundto -- may not be thread safe)
     ierr = boundto(&nel_set,omp_proc_mask[thrd]);
   #pragma omp barrier
 
   #pragma omp master
   {
 
     omp_mask_pac = (int *) malloc(sizeof(int)*nranks*nthrds*ncpus);  // need packing space for mpi send/recv

     if(rank == 0){
       request = (MPI_Request *) malloc(sizeof(MPI_Request)*nranks);
       status  = (MPI_Status  *) malloc(sizeof(MPI_Status )*nranks);

       print_mask(1,  dummy,  multi_node,     0, 0,   ncpus, nranks,nthrds, omp_proc_mask[0],tpc,l);  //print header 
       fflush(stdout);

       for(tid=0;tid<nthrds;tid++){
          print_mask(0,  &all_names[0], multi_node,  0,tid,   ncpus, nranks,nthrds, omp_proc_mask[tid],tpc,l);  // rank 0's threads use rank 0's node name
       }
       fflush(stdout);
         
       for(rid=1;rid<nranks;rid++){ // Receive other rank's packed mask arrays
         MPI_Irecv(&omp_mask_pac[rid*nthrds*ncpus], nthrds*ncpus, MPI_INT, rid, 99, MPI_COMM_WORLD, &request[rid-1]);
       } 

       MPI_Waitall(nranks-1,&request[0],&status[0]);

       for(rid=1;rid<nranks;rid++){ // Print for each rank
            for(tid=0;tid<nthrds;tid++){
               print_mask(0,  &all_names[rid*(max_name_len+1)], multi_node,  rid,tid,   ncpus, nranks,nthrds, &omp_mask_pac[rid*nthrds*ncpus + tid*ncpus],tpc,l);
               if(p == 's') ierr=usleep(300000);
            }
       }

       if(nranks*nthrds > 50)
          print_mask(2,  dummy,  multi_node,     0, 0,   ncpus, nranks,nthrds, omp_proc_mask[0],tpc,l);  //print header 

       fflush(stdout);

     }   // end root printing

     else{  //all non-root ranks

                                       // Pack up the ranks' mask arrays (Uh, should have made one array from beginning!) 
          for(   tid=0;tid<nthrds;tid++){
             for( id=0; id<ncpus;  id++)  omp_mask_pac[(tid*ncpus)+id] = omp_proc_mask[tid][id];
             if(p == 's') ierr=usleep(300000);
          }
                                       // Send to root
          MPI_Send(omp_mask_pac, nthrds*ncpus, MPI_INT, 0, 99, MPI_COMM_WORLD);
               

     } // end non-root printing

                                  // Free allocated space

     for(i=0;i<nthrds;i++) free(omp_proc_mask[i]);
                           free(omp_proc_mask);
     free(omp_mask_pac);
     if(rank == 0 ){ free(request); free(status);}
     free(all_names);

   } // end of Master

   #pragma omp barrier            // JIC: so that all threads leave at the same time.

   return 0;
}
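
This version must be called from inside a parallel region of an initialized MPI program (it checks both conditions and warns or exits otherwise). A hypothetical driver, assuming boundto(), print_mask(), get_threads_per_node() and the Maskopts class are supplied by the surrounding project:

// Hypothetical hybrid driver for the hybrid_report_mask() version above.
#include <mpi.h>
#include <omp.h>

int hybrid_report_mask(void);        // defined above

int main(int argc, char **argv)
{
   int provided;
   MPI_Init_thread(&argc, &argv, MPI_THREAD_FUNNELED, &provided);

   #pragma omp parallel              // call from every thread of the parallel region
   {
      hybrid_report_mask();
   }

   MPI_Finalize();
   return 0;
}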
int hybrid_report_mask(void){

int thrd, nthrds;
int rank, nranks;

static int multi_node = 0;

int ncpus, nel_set;

static int ** omp_proc_mask;
static int  * omp_mask_pac;
char *dummy = NULL;    // placeholder name argument for the header print_mask call


       char   proc_name[MPI_MAX_PROCESSOR_NAME];
static char * all_names;
       int    name_len;
static int    max_name_len;

                          // General
int i,j,ierr;
int id, rid,tid;
int in_mpi, in_omp;

                          // Mask storage
       int ** proc_mask;
static int  * all_masks=0;


   MPI_Initialized(&in_mpi);
   in_omp = omp_in_parallel();

   if(in_mpi != 0 && in_omp == 0){

                     // Get number of cpus (this gives no. of cpu_ids in /proc/cpuinfo)
                     // Get rank number & no of ranks via MPI
     ncpus = (int) sysconf(_SC_NPROCESSORS_ONLN);
     MPI_Comm_rank(MPI_COMM_WORLD, &rank);
     MPI_Comm_size(MPI_COMM_WORLD, &nranks);
  
                     // Create a 2-D array for mask
                     // proc_mask[rank][ncpus] -- for simplicity, size is [ncpus][ncpus]
                     // Thinking ahead for hybrid code.
  
                     // zero out proc_mask[ncpus][ncpus]  
                     // I could have made proc_mask a single array (proc_mask[ncpus]); but didn't
                     // This is a hold-over from the openmp version that holds everything for all threads.
                     // For MPI I made a contiguous collection array (all_masks).
     proc_mask =          malloc(sizeof(int*)*ncpus);
     for(i=0;i<ncpus;i++) proc_mask[i] =  malloc(sizeof(int)*ncpus);
     for(i=0;i<ncpus;i++) for(j=0;j<ncpus;j++) proc_mask[i][j] =0;
     all_masks =  (int *) malloc(sizeof(int)*ncpus*ncpus);
  
                               // get map for this processor
     ierr=boundto(&nel_set,proc_mask[rank]);

                                  // Gather information to rank 0
  
     MPI_Gather( proc_mask[rank], ncpus, MPI_INT,
                 all_masks,    ncpus, MPI_INT,
                 0, MPI_COMM_WORLD);
  
  
                                  // Get a list of nodes from all ranks.
     MPI_Get_processor_name(proc_name,&name_len);
     MPI_Allreduce(&name_len, &max_name_len, 1,MPI_INT, MPI_MAX, MPI_COMM_WORLD);
     all_names = (char *) malloc(sizeof(char)*nranks*(max_name_len+1));
     MPI_Gather( proc_name, max_name_len+1 , MPI_CHAR,
                 all_names, max_name_len+1, MPI_CHAR,
                 0, MPI_COMM_WORLD);
  
                                  // If multiple nodes, make multi_node non-zero.
     if(rank == 0)
        for(id=0;id<nranks;id++){
           if( strcmp(&all_names[id*(max_name_len+1)],&all_names[0]) ) multi_node++;
        }

   }  // End of Pure MPI part


   if(in_mpi != 0 && in_omp != 0){

 
       if(all_masks == 0) {
          printf("ERROR: ***** You must call hybrid_report_mask() in a Pure MPI region first. ***** \n");
          exit(1);
       }
   
        thrd  =  omp_get_thread_num();
       nthrds =  omp_get_num_threads();
       ncpus  =  (int) sysconf(_SC_NPROCESSORS_ONLN);
    
    
       #pragma omp single
       {
          omp_proc_mask =                           malloc(sizeof(int*)*nthrds);
          for(i=0;i<nthrds;i++) omp_proc_mask[i] =  malloc(sizeof(int)*ncpus  );
          for(i=0;i<nthrds;i++) for(j=0;j<ncpus;j++) omp_proc_mask[i][j] =0;
       }
    
       #pragma omp critical
       ierr = boundto(&nel_set,omp_proc_mask[thrd]);
 
       #pragma omp barrier
       MPI_Comm_size(MPI_COMM_WORLD, &nranks);
       MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 
       #pragma omp master
       {
 
          omp_mask_pac = (int *) malloc(sizeof(int)*nthrds*ncpus);  // packing space (nthrds x ncpus) for MPI send/recv
 
          if(rank == 0){
 
               print_mask(1,  dummy,  multi_node,     0, 0,   ncpus, nthrds,nranks, omp_proc_mask[0]);  //print header 
               fflush(stdout);
   
               for(tid=0;tid<nthrds;tid++){
                  print_mask(0,  &all_names[0], multi_node,  0,tid,   ncpus, nthrds,nranks, omp_proc_mask[tid]);  // rank 0's threads use rank 0's node name
               }
               fflush(stdout);
 
            for(rid=1;rid<nranks;rid++){
                                            // Receive other rank's packed mask arrays
               MPI_Recv(omp_mask_pac, nthrds*ncpus, MPI_INT, rid, 99, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
 
               for(tid=0;tid<nthrds;tid++){
                  print_mask(0,  &all_names[rid*(max_name_len+1)], multi_node,  rid,tid,   ncpus, nthrds,nranks, &omp_mask_pac[tid*ncpus]);
               }
               fflush(stdout);
 
            } // rank loop
          }   // end root printing
 
          else{  //all other ranks
                                            // Each non-root rank packs its masks and sends them to root once.

                                            // Pack up this rank's mask arrays (Uh, should have made one array from beginning!) 
             for(   tid=0;tid<nthrds;tid++){
                for( id=0; id<ncpus;  id++)  omp_mask_pac[(tid*ncpus)+id] = omp_proc_mask[tid][id];
             }
                                            // Send to root
             MPI_Send(omp_mask_pac, nthrds*ncpus, MPI_INT, 0, 99, MPI_COMM_WORLD);

 
 
          } // end non-root printing

          MPI_Barrier(MPI_COMM_WORLD);
 
 
       } // end of Master
       #pragma omp barrier
 
   } // end of OpenMP  part
   return 0;
}
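
Unlike the previous version, this variant is designed to be called twice: first from the pure MPI part of the program, where it gathers the per-rank masks and node names into static arrays, and then from inside a parallel region, where it prints the per-rank/per-thread report. A hypothetical driver showing that call pattern (again assuming boundto() and print_mask() are linked in):

// Hypothetical two-phase driver for the variant above.
#include <mpi.h>
#include <omp.h>

int hybrid_report_mask(void);        // variant defined above

int main(int argc, char **argv)
{
   int provided;
   MPI_Init_thread(&argc, &argv, MPI_THREAD_FUNNELED, &provided);

   hybrid_report_mask();             // phase 1: pure MPI region -- gather masks and node names

   #pragma omp parallel
   {
      hybrid_report_mask();          // phase 2: inside parallel region -- print the report
   }

   MPI_Finalize();
   return 0;
}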