void cp_transpose_bck_prim_dvr(double *c,int *icoef_form,double *c_temp, int nstate,int nstate_max,int ncoef, int nstate_proc,int nstate_proc_max, int nstate_proc_min, int nstate_ncoef_proc_max, int nstate_ncoef_proc, int nkf1,int nkf2,int nkf3, int num_proc,int myid,MPI_Comm comm) /*======================================================================*/ /* Begin Routine */ { /*begin routine */ /*======================================================================*/ /* Local variable declarations */ #include "../typ_defs/typ_mask.h" int nfull,i,j; int idv,irm; int irems,iremc,irem,ig,is,ioff; int ioff_c,iproc,iii; int sendcounts,recvcounts; int icount; int *ioffv,*inum; /*========================================================================*/ /* Incoming variable declarations */ /* */ /* nstate = Total number of states */ /* nstate_proc = number of states on this processor */ /* nstate_proc_max = maximum number of states on any processor */ /* ncoef = Total number of coefficients in a state */ /* nstate_ncoef_proc = Number of coefficients in a state on this processor*/ /* in the transposed data */ /* nstate_ncoef_proc_max = Maximum number of coefficients in a state on */ /* any processesor in the transposed data */ /* nstate_max = nstate_proc_max*num_proc */ /* c = nstate_proc x ncoef array of coefficients */ /* c_temp = transposed data stored as nstate x ncoef_proc_max */ /* ct_temp = scratch space to help make transposed data */ /* nscr_size = size of scratch nstate_ncoef_proc_max*nstate_max */ /*========================================================================*/ /* 0) Check the forms */ if((*icoef_form) !=1){ printf("@@@@@@@@@@@@@@@@@@@@_ERROR_@@@@@@@@@@@@@@@@@@@@\n"); printf("The coefficients must be in transposed form\n"); printf("on state processor %d in cp_transpose_bck_prim \n",myid); printf("@@@@@@@@@@@@@@@@@@@@_ERROR_@@@@@@@@@@@@@@@@@@@@\n"); fflush(stdout);exit(1); }/*endif*/ *icoef_form = 0; 
/*========================================================================*/ /*========================================================================*/ /* I) Internal rearrangement of coeff data */ irems = (nstate % num_proc); iremc = ((nkf2*nkf3)% num_proc); irem = MAX(irems,iremc); ioffv = (int *) cmalloc(num_proc*sizeof(int))-1; inum = (int *) cmalloc(num_proc*sizeof(int))-1; if((irems != 0)||(iremc != 0)){ ioffv[1] = 0; for(i=1; i<= irems; i++){ ioffv[i+1] = ioffv[i] + nstate_proc_max*nstate_ncoef_proc; inum[i] = nstate_proc_max*nstate_ncoef_proc; } for(i=irems+1; i < num_proc; i++){ ioffv[i+1] = ioffv[i] + nstate_proc_min*nstate_ncoef_proc; inum[i] = nstate_proc_min*nstate_ncoef_proc; } inum[num_proc] = nstate_proc_min*nstate_ncoef_proc; /* 1) copy data into temp array */ for(i=1;i <= (num_proc*nstate_proc_max*nstate_ncoef_proc_max);i++){ c_temp[i] = 0.00; } for(i=1; i<= num_proc; i++){ ioff = (i-1)*nstate_proc_max*nstate_ncoef_proc_max; for(j=1; j <= inum[i]; j++){ c_temp[(ioff+j)] = c[(ioffv[i]+j)]; } } /* 2) copy back into c */ nfull = nstate_ncoef_proc_max*nstate_max; for(ig=1;ig<=nfull;ig++){c[ig] = c_temp[ig];} }/*endif: remainder */ /*======================================================================*/ /* II) Send the transformed position data */ sendcounts = nstate_ncoef_proc_max*nstate_proc_max; recvcounts = nstate_ncoef_proc_max*nstate_proc_max; Alltoall(&c[1],sendcounts,MPI_DOUBLE,&c_temp[1],recvcounts, MPI_DOUBLE,comm); /*=======================================================================*/ /* III) Extract the transformed position data */ idv = (nkf2*nkf3)/num_proc; irm = (nkf2*nkf3)%num_proc; for(i = 0; i < num_proc; i++){ inum[i+1] = ( i < irm ? 
(idv+1)*nkf1 : idv*nkf1); } for(i=1;i<=nstate_ncoef_proc_max*nstate_max;i++){c[i]=0.0;} for(is=1; is <= nstate_proc; is++){ icount = 0; ioff_c = (is-1)*ncoef; for(iproc=1;iproc<=num_proc;iproc++){ ioff = (is-1)*inum[iproc] + (iproc-1)*nstate_proc_max*nstate_ncoef_proc_max; for(i=1;i<= inum[iproc];i++){ icount++; c[ioff_c+icount] = c_temp[ioff+i]; }/*endfor*/ }/*endfor*/ } cfree(&ioffv[1]); cfree(&inum[1]); /*========================================================================*/ }/*end routine*/
int main(int argc, char**argv){ int num_ranks, rank, split_num_ranks, split_rank; int outer_ranks, inner_ranks; int new_comm_id; int msg_size, loops; int slurm_id, run_index; MPI_Comm split_comm; FILE * timings; //Parse options char c; while ((c = getopt (argc, argv, "s:r:l:i:")) != -1){ switch (c) { case 's': sscanf(optarg, "%d", &msg_size); break; case 'r': sscanf(optarg, "%d", &inner_ranks); break; case 'l': sscanf(optarg, "%d", &loops); break; case 'i': sscanf(optarg, "%d", &run_index); break; default: printf("Unrecognized option: %c\n", optopt); break; } if(c != 's' && c != 'i' && c != 'l' && c != 'r' ){break;} } printf("Successfully parsed options as: \n"); printf("\tmsg_size: %d, inner_ranks: %d, loops: %d, run_index: %d\n", msg_size, inner_ranks, loops, run_index); //Open timings.out for writing timings = fopen("timings.out", "a"); if(timings == NULL){ printf("Error: cannot open timings.out\n"); } //Start MPI, get num_ranks MPI_Init(NULL, NULL); MPI_Comm_size(MPI_COMM_WORLD, &num_ranks); if(num_ranks == 0){ printf("MPI_Comm_size failure\n"); exit(1); } //Calculate comm sizes outer_ranks = num_ranks - inner_ranks; if( (outer_ranks < 0 || inner_ranks < 0) && (rank == 0) ){ printf("Error: bad comm sizes. 
They should be positive\n"); } //Get global rank MPI_Comm_rank(MPI_COMM_WORLD, &rank); int * splitter = (int*)malloc(sizeof(int)*num_ranks); for(int i = inner_ranks; i < num_ranks; i++) splitter[i] = OUTER_COMM; for(int i = 0; i < inner_ranks; i++) splitter[i] = INNER_COMM; //split communicator MPI_Comm_split(MPI_COMM_WORLD, splitter[rank], 1, &split_comm); MPI_Comm_size(split_comm, &split_num_ranks); MPI_Comm_rank(split_comm, &split_rank); MPI_Barrier(MPI_COMM_WORLD); //run the inner communicator as a warm-up, seems to reduce variance if(splitter[rank] == INNER_COMM){ Alltoall(split_comm, split_num_ranks, split_rank, msg_size, loops); } MPI_Barrier(MPI_COMM_WORLD); //start network counters region 1 MPI_Pcontrol(1); //run the inside alone, as a baseline float run1; if(splitter[rank] == INNER_COMM){ run1 = Alltoall(split_comm, split_num_ranks, split_rank, msg_size, loops); } MPI_Barrier(MPI_COMM_WORLD); //start network counters region 2 MPI_Pcontrol(2); //run both communicators float run2; if(splitter[rank] == INNER_COMM){ run2 = Alltoall(split_comm, split_num_ranks, split_rank, msg_size, loops); }else{ Alltoall(split_comm, split_num_ranks, split_rank, msg_size, loops); } //stop network counters MPI_Pcontrol(0); //print timings if(splitter[rank] == INNER_COMM && split_rank==0) fprintf(timings, "%d,%f,%f\n", run_index, run1, run2); //free(recv); free(splitter); MPI_Finalize(); exit(0); }
/*==========================================================================*/
/* cp_state_gvec_trans_bck                                                  */
/*                                                                          */
/* Back-transposes coefficient data held in c_temp and writes the result    */
/* into c as nstate_proc consecutive states of ncoef coefficients each.     */
/*                                                                          */
/* Arguments (all arrays are indexed from 1):                               */
/*   c                     = output, nstate_proc x ncoef coefficients       */
/*   c_temp                = in: transposed data; also the receive buffer   */
/*   ct_temp               = scratch space used as the send buffer          */
/*   nstate                = total number of states                         */
/*   nstate_max            = nstate_proc_max*num_proc                       */
/*   ncoef                 = total number of coefficients in a state        */
/*   nstate_proc           = number of states on this processor             */
/*   nstate_proc_max       = maximum number of states on any processor      */
/*   nstate_ncoef_proc_max = maximum number of coefficients of a state on   */
/*                           any processor in the transposed data           */
/*   nstate_ncoef_proc     = number of coefficients of a state on this      */
/*                           processor in the transposed data (not read)    */
/*   num_proc, myid, comm  = communicator size, this rank, communicator     */
/*==========================================================================*/
void cp_state_gvec_trans_bck(double *c,double *c_temp,double *ct_temp,
                             int nstate,int nstate_max,int ncoef,
                             int nstate_proc,int nstate_proc_max,
                             int nstate_ncoef_proc_max,int nstate_ncoef_proc,
                             int num_proc,int myid,MPI_Comm comm)
{
  double *send_buf, *recv_buf;
  int state_rem, coef_rem;          /* remainders of the two splits        */
  int full_len, short_len;          /* sizes of full and shortened blocks  */
  int total;
  int nshort;                       /* shortened blocks copied so far      */
  int src_off, dst_off;
  int ig, is, iproc, iii;
  int src_base, dst_base, filled, ncoef_now;
  int sendcounts, recvcounts;

  /*------------------------------------------------------------------------*/
  /* I) Spread the packed blocks of c_temp into equally sized slots         */

  state_rem = nstate % num_proc;

  if (state_rem == 0) {

    /* A) every processor holds the same number of states: plain copy */
    full_len = nstate_ncoef_proc_max * nstate;
    for (ig = 1; ig <= full_len; ig++) {
      ct_temp[ig] = c_temp[ig];
    }

  } else {

    /* A) the first state_rem blocks are full-sized and already aligned */
    full_len = nstate_proc_max * nstate_ncoef_proc_max * state_rem;
    for (ig = 1; ig <= full_len; ig++) {
      ct_temp[ig] = c_temp[ig];
    }

    /* B) the remaining blocks hold one state less; each is moved to the   */
    /*    start of a full-sized slot                                       */
    short_len = (nstate_proc_max - 1) * nstate_ncoef_proc_max;
    nshort = 0;
    for (iproc = state_rem + 1; iproc <= num_proc; iproc++) {
      src_off = full_len + nshort * short_len;
      dst_off = full_len + nshort * (short_len + nstate_ncoef_proc_max);
      for (ig = 1; ig <= short_len; ig++) {
        ct_temp[ig + dst_off] = c_temp[ig + src_off];
      }
      nshort++;
    } /* endfor */

  } /* endif: remainder */

#ifdef DEBUG
  if (myid == 0) {
    printf("1st Step: This is ct_temp\n");
  } /* endif */
  for (is = 1; is <= num_proc; is++) {
    if (myid == is - 1) {
      for (ig = 1; ig <= nstate_ncoef_proc_max*nstate; ig++) {
        printf("%d %d %g\n", ig, is, ct_temp[ig]);
      } /* endfor */
    } /* endif */
    scanf("%d", &ig);
    Barrier(comm);
  } /* endfor */
#endif

  /*------------------------------------------------------------------------*/
  /* II) Exchange the slots between all processors                          */

  sendcounts = nstate_ncoef_proc_max * nstate_proc_max;
  recvcounts = nstate_ncoef_proc_max * nstate_proc_max;

  /* Step past the unused element 0 of the 1-based arrays */
  send_buf = ct_temp + 1;
  recv_buf = c_temp + 1;
  Alltoall(send_buf, sendcounts, MPI_DOUBLE,
           recv_buf, recvcounts, MPI_DOUBLE, comm);

#ifdef DEBUG
  if (myid == 0) {
    printf("2nd step: This is c_temp\n");
  } /* endif */
  for (is = 1; is <= num_proc; is++) {
    if (myid == is - 1) {
      for (ig = 1; ig <= nstate_ncoef_proc_max*nstate_max; ig++) {
        printf("%d %d %g\n", ig, is, c_temp[ig]);
      } /* endfor */
    } /* endif */
    scanf("%d", &ig);
    Barrier(comm);
  } /* endfor */
#endif

  /*------------------------------------------------------------------------*/
  /* III) Assemble each local state from the pieces sent by every processor */

  total = nstate_ncoef_proc_max * nstate_max;
  for (ig = 1; ig <= total; ig++) {
    c[ig] = 0.0;
  }

  coef_rem = ncoef % num_proc;
  for (is = 1; is <= nstate_proc; is++) {
    filled   = 0;
    dst_base = (is - 1) * ncoef;
    for (iproc = 1; iproc <= num_proc; iproc++) {
      src_base = (is - 1) * nstate_ncoef_proc_max
               + (iproc - 1) * (nstate_proc_max * nstate_ncoef_proc_max);

      /* Processors past the first coef_rem carry one coefficient less */
      ncoef_now = nstate_ncoef_proc_max;
      if (coef_rem > 0 && iproc > coef_rem) {
        ncoef_now--;
      }

      for (ig = 1; ig <= ncoef_now; ig++) {
        c[dst_base + filled + ig] = c_temp[src_base + ig];
      } /* endfor */
      filled += ncoef_now;
    } /* endfor: processors */
  } /* endfor: states */

#ifdef DEBUG
  if (myid == 0) {
    printf("Last step: This is c\n");
  } /* endif */
  for (iproc = 1; iproc <= num_proc; iproc++) {
    if (myid == iproc - 1) {
      for (is = 1; is <= nstate_proc; is++) {
        dst_base = (is - 1) * ncoef;
        for (ig = 1; ig <= ncoef; ig++) {
          printf("%d %d %d %g\n", iproc, is, ig, c[(ig + dst_base)]);
        } /* endfor */
      } /* endfor */
    } /* endif */
    scanf("%d", &iii);
    Barrier(comm);
  } /* endfor */
#endif

/*==========================================================================*/
} /* end routine */
int main(int argc, char**argv){ int num_ranks, rank, split_num_ranks, split_rank; int outer_ranks, inner_ranks; int new_comm_id; int msg_size, loops; int slurm_id, run_index; MPI_Comm split_comm; FILE * timings, * configs; int assignment; int custom; char c; while ((c = getopt (argc, argv, "s:r:l:i:ac:")) != -1){ switch (c) { case 's': sscanf(optarg, "%d", &msg_size); break; case 'r': sscanf(optarg, "%d", &inner_ranks); break; case 'l': sscanf(optarg, "%d", &loops); break; case 'i': sscanf(optarg, "%d", &run_index); break; case 'a': sscanf(optarg, "%d", &assignment); assignment = 0; break; case 'c': sscanf(optarg, "%d", &custom); break; default: printf("Unrecognized option: %c\n", optopt); break; } if(c != 's' && c != 'i' && c != 'l' && c != 'r' ){break;} } timings = fopen("timings.out", "a"); char configs_buf[128] = {0}; sprintf(configs_buf, "config-%d.out", run_index); configs = fopen(configs_buf, "a"); MPI_Init(NULL, NULL); MPI_Comm_size(MPI_COMM_WORLD, &num_ranks); if(num_ranks == 0){ printf("MPI_Comm_size failure\n"); exit(1); } outer_ranks = num_ranks - inner_ranks; MPI_Comm_rank(MPI_COMM_WORLD, &rank); //get node names char name[MPI_MAX_PROCESSOR_NAME] = {0}; char * recv = (char*)calloc(MPI_MAX_PROCESSOR_NAME*num_ranks, sizeof(char)); int proc_len; MPI_Get_processor_name(name, &proc_len); name[proc_len] = 0; MPI_Gather(name, MPI_MAX_PROCESSOR_NAME, MPI_CHAR, recv, MPI_MAX_PROCESSOR_NAME, MPI_CHAR, 0, MPI_COMM_WORLD); int * splitter = (int*)malloc(sizeof(int)*num_ranks); for(int i = 0; i < num_ranks; i++) splitter[i] = OUTER_COMM; if(!custom){ if(rank == 0){ if(assignment == RANDOM){ int num_assigned = 0; while(num_assigned < inner_ranks){ int val = rand() % num_ranks; if(splitter[val] == INNER_COMM){ continue; }else{ splitter[val] = INNER_COMM; num_assigned += 1; } } }else if(assignment == APLANES){ for(int i = 0; i < num_ranks; i++){ int dims[5] = {0}; get_dim(recv + i*MPI_MAX_PROCESSOR_NAME, dims); if (dims[0] == 0 || dims[0] == 2){ splitter[i] = 
INNER_COMM; } } }else if(assignment == APLANES_COARSE){ for(int i = 0; i < num_ranks; i++){ int dims[5] = {0}; get_dim(recv + i*MPI_MAX_PROCESSOR_NAME, dims); if (dims[0] == 0 || dims[0] == 1){ splitter[i] = INNER_COMM; } } }else if(assignment == BPLANES){ for(int i = 0; i < num_ranks; i++){ int dims[5] = {0}; get_dim(recv + i*MPI_MAX_PROCESSOR_NAME, dims); if (dims[1] == 0 || dims[1] == 2){ splitter[i] = INNER_COMM; } } }else if(assignment == CPLANES){ for(int i = 0; i < num_ranks; i++){ int dims[5] = {0}; get_dim(recv + i*MPI_MAX_PROCESSOR_NAME, dims); if (dims[2] == 0 || dims[2] == 2){ splitter[i] = INNER_COMM; } } }else if(assignment == DPLANES){ for(int i = 0; i < num_ranks; i++){ int dims[5] = {0}; get_dim(recv + i*MPI_MAX_PROCESSOR_NAME, dims); if (dims[3] == 0 || dims[3] == 2){ splitter[i] = INNER_COMM; } } }else if(assignment == EPLANES){ for(int i = 0; i < num_ranks; i++){ int dims[5] = {0}; get_dim(recv + i*MPI_MAX_PROCESSOR_NAME, dims); if (dims[4] == 0){ splitter[i] = INNER_COMM; } } }else if(assignment == SQUAREAB1){ for(int i = 0; i < num_ranks; i++){ int dims[5] = {0}; get_dim(recv + i*MPI_MAX_PROCESSOR_NAME, dims); if (((dims[0] == 0 || dims[0] == 1) && (dims[1] == 0 || dims[1] == 1)) || ((dims[0] == 2 || dims[0] == 3) && (dims[1] == 2 || dims[1] == 3))){ splitter[i] = INNER_COMM; } } }else if(assignment == SQUAREAB2){ for(int i = 0; i < num_ranks; i++){ int dims[5] = {0}; get_dim(recv + i*MPI_MAX_PROCESSOR_NAME, dims); if (((dims[0] == 0 || dims[0] == 2) && (dims[1] == 0 || dims[1] == 2)) || ((dims[0] == 1 || dims[0] == 3) && (dims[1] == 1 || dims[1] == 3))){ splitter[i] = INNER_COMM; } } }else if(assignment == ALTERABC_NONE){ for(int i = 0; i < num_ranks; i++){ int dims[5] = {0}; get_dim(recv + i*MPI_MAX_PROCESSOR_NAME, dims); if (((dims[0] == 0 || dims[0] == 1) && (dims[1] == 0 || dims[1] == 1) && (dims[2] == 0 || dims[2] == 1)) || ((dims[0] == 2 || dims[0] == 3) && (dims[1] == 2 || dims[1] == 3) && (dims[2] == 0 || dims[2] == 1)) || ((dims[0] 
== 0 || dims[0] == 1) && (dims[1] == 2 || dims[1] == 3) && (dims[2] == 2 || dims[2] == 3)) || ((dims[0] == 2 || dims[0] == 3) && (dims[1] == 0 || dims[1] == 1) && (dims[2] == 2 || dims[2] == 3))) { splitter[i] = INNER_COMM; } } }else if(assignment == ALTERABC_ALL){ for(int i = 0; i < num_ranks; i++){ int dims[5] = {0}; get_dim(recv + i*MPI_MAX_PROCESSOR_NAME, dims); if (((dims[0] == 0 || dims[0] == 2) && (dims[1] == 0 || dims[1] == 2) && (dims[2] == 0 || dims[2] == 2)) || ((dims[0] == 1 || dims[0] == 3) && (dims[1] == 1 || dims[1] == 3) && (dims[2] == 0 || dims[2] == 2)) || ((dims[0] == 0 || dims[0] == 2) && (dims[1] == 1 || dims[1] == 3) && (dims[2] == 1 || dims[2] == 3)) || ((dims[0] == 1 || dims[0] == 3) && (dims[1] == 0 || dims[1] == 2) && (dims[2] == 1 || dims[2] == 3))) { splitter[i] = INNER_COMM; } } } } }else{ //using custon mapping in map.out for(int i = 0; i < num_ranks/2; i++){ splitter[i] = INNER_COMM; } } MPI_Bcast(splitter, num_ranks, MPI_INT, 0, MPI_COMM_WORLD); //split communicator MPI_Comm_split(MPI_COMM_WORLD, splitter[rank], 1, &split_comm); MPI_Comm_size(split_comm, &split_num_ranks); MPI_Comm_rank(split_comm, &split_rank); MPI_Barrier(MPI_COMM_WORLD); //print names to file if(rank == 0){ fprintf(configs,"rank,comm,node\n"); for(int i = 0; i < num_ranks; i++){ fprintf(configs,"%d,%d,%s\n", i, splitter[i], recv + i*MPI_MAX_PROCESSOR_NAME); } } //run the inner communicator as a warm-up, seems to reduce variance if(splitter[rank] == INNER_COMM){ Alltoall(split_comm, split_num_ranks, split_rank, msg_size, loops); } MPI_Barrier(MPI_COMM_WORLD); //run the inside alone, as a baseline //start network counters region 1 MPI_Pcontrol(1); float run1; if(splitter[rank] == INNER_COMM){ run1 = Alltoall(split_comm, split_num_ranks, split_rank, msg_size, loops); } MPI_Barrier(MPI_COMM_WORLD); //start network counters region 2 MPI_Pcontrol(2); //run both communicators float run2; if(splitter[rank] == INNER_COMM){ run2 = Alltoall(split_comm, split_num_ranks, 
split_rank, msg_size, loops); }else{ Alltoall(split_comm, split_num_ranks, split_rank, msg_size, loops); } //stop network counters MPI_Pcontrol(0); //print timings if(splitter[rank] == INNER_COMM && split_rank==0) fprintf(timings, "%d,%f,%f\n", run_index, run1, run2); //free(recv); free(splitter); MPI_Finalize(); exit(0); }