bool Channel::Receive(Message & message) { // do a bunch of MPI_Test ring.TestBuffers(); int source = MPI_ANY_SOURCE; int tag = MPI_ANY_TAG; MPI_Status status; int flag = 0; /* MPI_Iprobe(source, tag, messagingCommunicator, &flag, &status); */ MPI_Probe(source, tag, messagingCommunicator, &status); flag = 1; probeOperations ++; if(!flag) return false; source = status.MPI_SOURCE; tag = status.MPI_TAG; message.SetTag(tag); MPI_Datatype datatype = MPI_BYTE; int count = 0; MPI_Get_count(&status, datatype, &count); MPI_Recv(receivingBuffer, count, datatype, source, tag, messagingCommunicator, &status); int sourceActor = -1; int destinationActor = -1; memcpy(&sourceActor, receivingBuffer + sourceActorOffset, sizeof(int)); memcpy(&destinationActor, receivingBuffer + destinationActorOffset, sizeof(int)); char * content = message.GetContent(); count -= contentOffset; memcpy(content, receivingBuffer + contentOffset, count * sizeof(char)); message.SetContentSize(count); message.SetSource(sourceActor); message.SetDestination(destinationActor); /* cout << "Channel got a message with tag "; cout << tag << ", "; cout << message.GetSource() << " -> " << source; cout << " " << message.GetDestination(); cout << " -> " << rank << endl; */ messagesReceived++; return true; }
void mpi_get_count_(MPI_Status * status, int* datatype, int *count, int* ierr){ *ierr = MPI_Get_count(status, get_datatype(*datatype), count); }
int main(int argc, const char *argv[]) { // init mpi MPI_Init(NULL,NULL); // get processor standing int rank = 0, world = 0; MPI_Comm_size(MPI_COMM_WORLD, &world); // get rank of this processor MPI_Comm_rank(MPI_COMM_WORLD, &rank); // send some random numbers int number_amount; if(rank == 0) { const int MAX_NUMBERS = 10; int numbers[MAX_NUMBERS]; Random rg; number_amount = rg.getRandom(0,MAX_NUMBERS-1); // make numbers for(int i = 0; i < number_amount; i++) numbers[i] = rg.getRandom(-10,10); // send MPI_Send(numbers,number_amount,MPI_INT,1,0,MPI_COMM_WORLD); printf("0 sends %d numbers to 1\n",number_amount); for(int i = 0; i < number_amount; i++) printf("%d, ", numbers[i]); printf("\n"); }else if(rank == 1) { MPI_Status status; // probe for message and size // MPI_Probe(0,0,MPI_COMM_WORLD,&status); // check status out to find out how many numbers were actually sent //MPI_Get_count(&status, MPI_INT, &number_amount); //int * numbers = new int[number_amount]; // recieve from 0 //MPI_Recv(numbers,number_amount,MPI_INT,0,0,MPI_COMM_WORLD, &status); void * numbers = NULL; MPI_ProbeRecv(&numbers,MPI_INT,0,0,MPI_COMM_WORLD,&status); int number_amount = 0; MPI_Get_count(&status,MPI_INT,&number_amount); printf("1 received %d numbers from 0. Message source = %d, " "tag = %d\n",number_amount, status.MPI_SOURCE, status.MPI_TAG); // print numbers printf("Try to print numbers...\n"); int * tmp = (int *)numbers; for(int i = 0; i < number_amount; i++) printf("%d, ", tmp[i]); printf("\n"); free(numbers); } // end session MPI_Finalize(); }
/* * Implements a blocking receive operation. * mpi_recv(?Source,?Tag,-Data). */ static YAP_Bool mpi_recv(term_t YAP_ARG1,...) { YAP_Term t1 = YAP_Deref(YAP_ARG1), t2 = YAP_Deref(YAP_ARG2), t3 = YAP_Deref(YAP_ARG3), t4; int tag, orig; int len=0; MPI_Status status; //The third argument (data) must be unbound if(!YAP_IsVarTerm(t3)) { return false; } /* The first argument (Source) must be bound to an integer (the rank of the source) or left unbound (i.e. any source is OK) */ if (YAP_IsVarTerm(t1)) orig = MPI_ANY_SOURCE; else if( !YAP_IsIntTerm(t1) ) return false; else orig = YAP_IntOfTerm(t1); /* The second argument must be bound to an integer (the tag) or left unbound (i.e. any tag is OK) */ if (YAP_IsVarTerm(t2)) tag = MPI_ANY_TAG; else if( !YAP_IsIntTerm(t2) ) return false; else tag = YAP_IntOfTerm( t2 ); CONT_TIMER(); // probe for term' size if( MPI_CALL(MPI_Probe( orig, tag, MPI_COMM_WORLD, &status )) != MPI_SUCCESS) { PAUSE_TIMER(); return false; } if( MPI_CALL(MPI_Get_count( &status, MPI_CHAR, &len )) != MPI_SUCCESS || status.MPI_TAG==MPI_UNDEFINED || status.MPI_SOURCE==MPI_UNDEFINED) { PAUSE_TIMER(); return false; } //realloc memory buffer change_buffer_size((size_t)(len+1)); BUFFER_LEN=len; // Already know the source from MPI_Probe() if( orig == MPI_ANY_SOURCE ) { orig = status.MPI_SOURCE; if( !YAP_Unify(t1, YAP_MkIntTerm(orig))) { PAUSE_TIMER(); return false; } } // Already know the tag from MPI_Probe() if( tag == MPI_ANY_TAG ) { tag = status.MPI_TAG; if( !YAP_Unify(t2, YAP_MkIntTerm(status.MPI_TAG))) { PAUSE_TIMER(); return false; } } // Receive the message as a string if( MPI_CALL(MPI_Recv( BUFFER_PTR, BUFFER_LEN, MPI_CHAR, orig, tag, MPI_COMM_WORLD, &status )) != MPI_SUCCESS ) { /* Getting in here should never happen; it means that the first package (containing size) was sent properly, but there was a glitch with the actual content! */ PAUSE_TIMER(); return false; } #ifdef DEBUG write_msg(__FUNCTION__,__FILE__,__LINE__,"%s(%s,%u, MPI_CHAR,%d,%d)\n",__FUNCTION__,BUFFER_PTR, BUFFER_LEN, orig, tag); #endif MSG_RECV(BUFFER_LEN); t4=string2term(BUFFER_PTR,&BUFFER_LEN); PAUSE_TIMER(); return(YAP_Unify(YAP_ARG3,t4)); }
/* Master distributes work to slaves */ void do_MPImaster_cluster(WorkPtr work) { #ifdef MPI FILE * checkfile; int i,k,nlen,tranche,client,maxlen,err,rlen,maxload,minload; int bound[2]; int checkpoint; int round=0; int w=(SEQELTWIDTH/2); int *last_sent, *last_got; MPI_Status status; int seq_data[2]; maxlen = MPIBUFRECSZ*num_seqs; buffer = (int32_t *) calloc(maxlen,sizeof(int32_t)); last_sent = (int *) calloc(numprocs,sizeof(int)); last_got = (int *) calloc(numprocs,sizeof(int)); bzero(last_sent,sizeof(int)*numprocs); bzero(last_got,sizeof(int)*numprocs); seq_data[0]=num_seqs; seq_data[1]=data_size; mpierr(MPI_Bcast(seq_data,2,MPI_INT,0,MPI_COMM_WORLD)); mpierr(MPI_Bcast(seqInfo,2*num_seqs,MPI_INT,0,MPI_COMM_WORLD)); mpierr(MPI_Bcast(data,data_size,MPI_SHORT,0,MPI_COMM_WORLD)); /* divide work up */ tranche = (num_seqs+8)/16/numprocs; /* Now wait for requests from slaves */ for(i=0; i<num_seqs; i=i+tranche) { round++; if (round < prog_opts.restore) continue; checkpoint = 0; bound[0]=i; bound[1]=MIN(num_seqs,i+tranche); if (prog_opts.checkpoint) bound[1]=-bound[1]; //printf("Master waits for client answer <%d,%d>\n",bound[0],bound[1]); merr =MPI_Recv(&client, 1, MPI_INT, MPI_ANY_SOURCE, WORKTAG, MPI_COMM_WORLD,&status); mpierr(merr); //printf("Master gets note from client %d\n",client); if (client < 0) { // client wants to send checkpoint client = -client; checkpoint = 1; } // The round previously sent to the client is done last_got[client] = last_sent[client]; // Record the current round send to the client last_sent[client]=round; // DBg printf("Master sends new work to client %d\n",client); merr =MPI_Send(bound, 2, MPI_INT, client, WORKTAG, MPI_COMM_WORLD); mpierr(merr); if (prog_opts.checkpoint) { if (checkpoint) { printf("Master to receive checkpoint\n"); err = MPI_Recv(buffer, maxlen, MPI_INT, MPI_ANY_SOURCE, ANSTAG, MPI_COMM_WORLD, &status); mpierr(err); MPI_Get_count(&status, MPI_INT, &rlen); MergeSlaveClusterTable(buffer,rlen); checkfile = fopen(prog_opts.checkpoint,"w"); for(k=1; k<numprocs; k++) fprintf(checkfile,"%d\n",last_got[k]); show_clusters(checkfile); fclose(checkfile); } printf("Slave %d sent tranche %d of %d\n", client,round,(num_seqs+1)/tranche); } } bound[0]=-1; for(i=1; i< numprocs; i++) { // Tell them no more work merr = MPI_Recv(&client, 1, MPI_INT, MPI_ANY_SOURCE, WORKTAG, MPI_COMM_WORLD,&status); //printf("Client %d told to finish\n",client); mpierr(merr); if (client < 0) client = -client; mpierr(MPI_Send(bound, 2, MPI_INT, client, WORKTAG, MPI_COMM_WORLD)); err = MPI_Recv(buffer, maxlen, MPI_INT, MPI_ANY_SOURCE, ANSTAG, MPI_COMM_WORLD, &status); mpierr(err); MPI_Get_count(&status, MPI_INT, &rlen); MergeSlaveClusterTable(buffer,rlen); } #endif }
static PetscErrorCode PCSetUp_Redistribute(PC pc) { PC_Redistribute *red = (PC_Redistribute*)pc->data; PetscErrorCode ierr; MPI_Comm comm; PetscInt rstart,rend,i,nz,cnt,*rows,ncnt,dcnt,*drows; PetscLayout map,nmap; PetscMPIInt size,imdex,tag,n; PetscInt *source = PETSC_NULL; PetscMPIInt *nprocs = PETSC_NULL,nrecvs; PetscInt j,nsends; PetscInt *owner = PETSC_NULL,*starts = PETSC_NULL,count,slen; PetscInt *rvalues,*svalues,recvtotal; PetscMPIInt *onodes1,*olengths1; MPI_Request *send_waits = PETSC_NULL,*recv_waits = PETSC_NULL; MPI_Status recv_status,*send_status; Vec tvec,diag; Mat tmat; const PetscScalar *d; PetscFunctionBegin; if (pc->setupcalled) { ierr = KSPGetOperators(red->ksp,PETSC_NULL,&tmat,PETSC_NULL);CHKERRQ(ierr); ierr = MatGetSubMatrix(pc->pmat,red->is,red->is,MAT_REUSE_MATRIX,&tmat);CHKERRQ(ierr); ierr = KSPSetOperators(red->ksp,tmat,tmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); } else { PetscInt NN; ierr = PetscObjectGetComm((PetscObject)pc,&comm);CHKERRQ(ierr); ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr); ierr = PetscObjectGetNewTag((PetscObject)pc,&tag);CHKERRQ(ierr); /* count non-diagonal rows on process */ ierr = MatGetOwnershipRange(pc->mat,&rstart,&rend);CHKERRQ(ierr); cnt = 0; for (i=rstart; i<rend; i++) { ierr = MatGetRow(pc->mat,i,&nz,PETSC_NULL,PETSC_NULL);CHKERRQ(ierr); if (nz > 1) cnt++; ierr = MatRestoreRow(pc->mat,i,&nz,PETSC_NULL,PETSC_NULL);CHKERRQ(ierr); } ierr = PetscMalloc(cnt*sizeof(PetscInt),&rows);CHKERRQ(ierr); ierr = PetscMalloc((rend - rstart - cnt)*sizeof(PetscInt),&drows);CHKERRQ(ierr); /* list non-diagonal rows on process */ cnt = 0; dcnt = 0; for (i=rstart; i<rend; i++) { ierr = MatGetRow(pc->mat,i,&nz,PETSC_NULL,PETSC_NULL);CHKERRQ(ierr); if (nz > 1) rows[cnt++] = i; else drows[dcnt++] = i - rstart; ierr = MatRestoreRow(pc->mat,i,&nz,PETSC_NULL,PETSC_NULL);CHKERRQ(ierr); } /* create PetscLayout for non-diagonal rows on each process */ ierr = PetscLayoutCreate(comm,&map);CHKERRQ(ierr); ierr = PetscLayoutSetLocalSize(map,cnt);CHKERRQ(ierr); ierr = PetscLayoutSetBlockSize(map,1);CHKERRQ(ierr); ierr = PetscLayoutSetUp(map);CHKERRQ(ierr); rstart = map->rstart; rend = map->rend; /* create PetscLayout for load-balanced non-diagonal rows on each process */ ierr = PetscLayoutCreate(comm,&nmap);CHKERRQ(ierr); ierr = MPI_Allreduce(&cnt,&ncnt,1,MPIU_INT,MPI_SUM,comm);CHKERRQ(ierr); ierr = PetscLayoutSetSize(nmap,ncnt);CHKERRQ(ierr); ierr = PetscLayoutSetBlockSize(nmap,1);CHKERRQ(ierr); ierr = PetscLayoutSetUp(nmap);CHKERRQ(ierr); ierr = MatGetSize(pc->pmat,&NN,PETSC_NULL);CHKERRQ(ierr); ierr = PetscInfo2(pc,"Number of diagonal rows eliminated %d, percentage eliminated %g\n",NN-ncnt,((PetscReal)(NN-ncnt))/((PetscReal)(NN)));CHKERRQ(ierr); /* this code is taken from VecScatterCreate_PtoS() Determines what rows need to be moved where to load balance the non-diagonal rows */ /* count number of contributors to each processor */ ierr = PetscMalloc2(size,PetscMPIInt,&nprocs,cnt,PetscInt,&owner);CHKERRQ(ierr); ierr = PetscMemzero(nprocs,size*sizeof(PetscMPIInt));CHKERRQ(ierr); j = 0; nsends = 0; for (i=rstart; i<rend; i++) { if (i < nmap->range[j]) j = 0; for (; j<size; j++) { if (i < nmap->range[j+1]) { if (!nprocs[j]++) nsends++; owner[i-rstart] = j; break; } } } /* inform other processors of number of messages and max length*/ ierr = PetscGatherNumberOfMessages(comm,PETSC_NULL,nprocs,&nrecvs);CHKERRQ(ierr); ierr = PetscGatherMessageLengths(comm,nsends,nrecvs,nprocs,&onodes1,&olengths1);CHKERRQ(ierr); ierr = PetscSortMPIIntWithArray(nrecvs,onodes1,olengths1);CHKERRQ(ierr); recvtotal = 0; for (i=0; i<nrecvs; i++) recvtotal += olengths1[i]; /* post receives: rvalues - rows I will own; count - nu */ ierr = PetscMalloc3(recvtotal,PetscInt,&rvalues,nrecvs,PetscInt,&source,nrecvs,MPI_Request,&recv_waits);CHKERRQ(ierr); count = 0; for (i=0; i<nrecvs; i++) { ierr = MPI_Irecv((rvalues+count),olengths1[i],MPIU_INT,onodes1[i],tag,comm,recv_waits+i);CHKERRQ(ierr); count += olengths1[i]; } /* do sends: 1) starts[i] gives the starting index in svalues for stuff going to the ith processor */ ierr = PetscMalloc3(cnt,PetscInt,&svalues,nsends,MPI_Request,&send_waits,size,PetscInt,&starts);CHKERRQ(ierr); starts[0] = 0; for (i=1; i<size; i++) { starts[i] = starts[i-1] + nprocs[i-1];} for (i=0; i<cnt; i++) { svalues[starts[owner[i]]++] = rows[i]; } for (i=0; i<cnt; i++) rows[i] = rows[i] - rstart; red->drows = drows; red->dcnt = dcnt; ierr = PetscFree(rows);CHKERRQ(ierr); starts[0] = 0; for (i=1; i<size; i++) { starts[i] = starts[i-1] + nprocs[i-1];} count = 0; for (i=0; i<size; i++) { if (nprocs[i]) { ierr = MPI_Isend(svalues+starts[i],nprocs[i],MPIU_INT,i,tag,comm,send_waits+count++);CHKERRQ(ierr); } } /* wait on receives */ count = nrecvs; slen = 0; while (count) { ierr = MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);CHKERRQ(ierr); /* unpack receives into our local space */ ierr = MPI_Get_count(&recv_status,MPIU_INT,&n);CHKERRQ(ierr); slen += n; count--; } if (slen != recvtotal) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Total message lengths %D not expected %D",slen,recvtotal); ierr = ISCreateGeneral(comm,slen,rvalues,PETSC_COPY_VALUES,&red->is);CHKERRQ(ierr); /* free up all work space */ ierr = PetscFree(olengths1);CHKERRQ(ierr); ierr = PetscFree(onodes1);CHKERRQ(ierr); ierr = PetscFree3(rvalues,source,recv_waits);CHKERRQ(ierr); ierr = PetscFree2(nprocs,owner);CHKERRQ(ierr); if (nsends) { /* wait on sends */ ierr = PetscMalloc(nsends*sizeof(MPI_Status),&send_status);CHKERRQ(ierr); ierr = MPI_Waitall(nsends,send_waits,send_status);CHKERRQ(ierr); ierr = PetscFree(send_status);CHKERRQ(ierr); } ierr = PetscFree3(svalues,send_waits,starts);CHKERRQ(ierr); ierr = PetscLayoutDestroy(&map);CHKERRQ(ierr); ierr = PetscLayoutDestroy(&nmap);CHKERRQ(ierr); ierr = VecCreateMPI(comm,slen,PETSC_DETERMINE,&red->b);CHKERRQ(ierr); ierr = VecDuplicate(red->b,&red->x);CHKERRQ(ierr); ierr = MatGetVecs(pc->pmat,&tvec,PETSC_NULL);CHKERRQ(ierr); ierr = VecScatterCreate(tvec,red->is,red->b,PETSC_NULL,&red->scatter);CHKERRQ(ierr); ierr = VecDestroy(&tvec);CHKERRQ(ierr); ierr = MatGetSubMatrix(pc->pmat,red->is,red->is,MAT_INITIAL_MATRIX,&tmat);CHKERRQ(ierr); ierr = KSPSetOperators(red->ksp,tmat,tmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); ierr = MatDestroy(&tmat);CHKERRQ(ierr); } /* get diagonal portion of matrix */ ierr = PetscMalloc(red->dcnt*sizeof(PetscScalar),&red->diag);CHKERRQ(ierr); ierr = MatGetVecs(pc->pmat,&diag,PETSC_NULL);CHKERRQ(ierr); ierr = MatGetDiagonal(pc->pmat,diag);CHKERRQ(ierr); ierr = VecGetArrayRead(diag,&d);CHKERRQ(ierr); for (i=0; i<red->dcnt; i++) { red->diag[i] = 1.0/d[red->drows[i]]; } ierr = VecRestoreArrayRead(diag,&d);CHKERRQ(ierr); ierr = VecDestroy(&diag);CHKERRQ(ierr); ierr = KSPSetUp(red->ksp);CHKERRQ(ierr); PetscFunctionReturn(0); }
static int scr_swap_files_copy( int have_outgoing, const char* file_send, scr_meta* meta_send, int rank_send, uLong* crc32_send, int have_incoming, const char* file_recv, scr_meta* meta_recv, int rank_recv, uLong* crc32_recv, MPI_Comm comm) { int rc = SCR_SUCCESS; MPI_Request request[2]; MPI_Status status[2]; /* allocate MPI send buffer */ char *buf_send = NULL; if (have_outgoing) { buf_send = (char*) scr_align_malloc(scr_mpi_buf_size, scr_page_size); if (buf_send == NULL) { scr_abort(-1, "Allocating memory: malloc(%ld) errno=%d %s @ %s:%d", scr_mpi_buf_size, errno, strerror(errno), __FILE__, __LINE__ ); return SCR_FAILURE; } } /* allocate MPI recv buffer */ char *buf_recv = NULL; if (have_incoming) { buf_recv = (char*) scr_align_malloc(scr_mpi_buf_size, scr_page_size); if (buf_recv == NULL) { scr_abort(-1, "Allocating memory: malloc(%ld) errno=%d %s @ %s:%d", scr_mpi_buf_size, errno, strerror(errno), __FILE__, __LINE__ ); return SCR_FAILURE; } } /* open the file to send: read-only mode */ int fd_send = -1; if (have_outgoing) { fd_send = scr_open(file_send, O_RDONLY); if (fd_send < 0) { scr_abort(-1, "Opening file for send: scr_open(%s, O_RDONLY) errno=%d %s @ %s:%d", file_send, errno, strerror(errno), __FILE__, __LINE__ ); } } /* open the file to recv: truncate, write-only mode */ int fd_recv = -1; if (have_incoming) { mode_t mode_file = scr_getmode(1, 1, 0); fd_recv = scr_open(file_recv, O_WRONLY | O_CREAT | O_TRUNC, mode_file); if (fd_recv < 0) { scr_abort(-1, "Opening file for recv: scr_open(%s, O_WRONLY | O_CREAT | O_TRUNC, ...) errno=%d %s @ %s:%d", file_recv, errno, strerror(errno), __FILE__, __LINE__ ); } } /* exchange file chunks */ int nread, nwrite; int sending = 0; if (have_outgoing) { sending = 1; } int receiving = 0; if (have_incoming) { receiving = 1; } while (sending || receiving) { /* if we are still receiving a file, post a receive */ if (receiving) { MPI_Irecv(buf_recv, scr_mpi_buf_size, MPI_BYTE, rank_recv, 0, comm, &request[0]); } /* if we are still sending a file, read a chunk, send it, and wait */ if (sending) { nread = scr_read(file_send, fd_send, buf_send, scr_mpi_buf_size); if (scr_crc_on_copy && nread > 0) { *crc32_send = crc32(*crc32_send, (const Bytef*) buf_send, (uInt) nread); } if (nread < 0) { nread = 0; } MPI_Isend(buf_send, nread, MPI_BYTE, rank_send, 0, comm, &request[1]); MPI_Wait(&request[1], &status[1]); if (nread < scr_mpi_buf_size) { sending = 0; } } /* if we are still receiving a file, * wait on our receive to complete and write the data */ if (receiving) { MPI_Wait(&request[0], &status[0]); MPI_Get_count(&status[0], MPI_BYTE, &nwrite); if (scr_crc_on_copy && nwrite > 0) { *crc32_recv = crc32(*crc32_recv, (const Bytef*) buf_recv, (uInt) nwrite); } scr_write(file_recv, fd_recv, buf_recv, nwrite); if (nwrite < scr_mpi_buf_size) { receiving = 0; } } } /* close the files */ if (have_outgoing) { scr_close(file_send, fd_send); } if (have_incoming) { scr_close(file_recv, fd_recv); } /* set crc field on our file if it hasn't been set already */ if (scr_crc_on_copy && have_outgoing) { uLong meta_send_crc; if (scr_meta_get_crc32(meta_send, &meta_send_crc) != SCR_SUCCESS) { scr_meta_set_crc32(meta_send, *crc32_send); } else { /* TODO: we could check that the crc on the sent file matches and take some action if not */ } } /* free the MPI buffers */ scr_align_free(&buf_recv); scr_align_free(&buf_send); return rc; }
int set_up_BD ( int * DESCD, double * Dmat, CSRdouble& BT_i, CSRdouble& B_j, CSRdouble& Btsparse ) { // Read-in of matrices X, Z and T from file (filename[X,Z,T]) // X and Z are read in entrely by every process // T is read in strip by strip (number of rows in each process is at maximum = blocksize) // D is constructed directly in a distributed way // B is first assembled sparse in root process and afterwards the necessary parts // for constructing the distributed Schur complement are sent to each process FILE *fT; int ni, i,j, info; int *DESCT; double *Tblock, *temp; int nTblocks, nstrips, pTblocks, stripcols, lld_T, pcol, colcur,rowcur; CSRdouble Xtsparse, Ztsparse,XtT_sparse,ZtT_sparse,XtT_temp, ZtT_temp; Xtsparse.loadFromFile ( filenameX ); Ztsparse.loadFromFile ( filenameZ ); Xtsparse.transposeIt ( 1 ); Ztsparse.transposeIt ( 1 ); XtT_sparse.allocate ( m,k,0 ); ZtT_sparse.allocate ( l,k,0 ); pcol= * ( position+1 ); // Matrix T is read in by strips of size (blocksize * *(dims+1), k) // Strips of T are read in row-wise and thus it is as if we store strips of T' (transpose) column-wise with dimensions (k, blocksize * *(dims+1)) // However we must then also transpose the process grid to distribute T' correctly // number of strips in which we divide matrix T' nstrips= n % ( blocksize * * ( dims+1 ) ) ==0 ? n / ( blocksize * * ( dims+1 ) ) : ( n / ( blocksize * * ( dims+1 ) ) ) +1; //the number of columns of T' included in each strip stripcols= blocksize * * ( dims+1 ); //number of blocks necessary to store complete column of T' nTblocks= k%blocksize==0 ? k/blocksize : k/blocksize +1; //number of blocks necessary in this process to store complete column of T' pTblocks= ( nTblocks - *position ) % *dims == 0 ? ( nTblocks- *position ) / *dims : ( nTblocks- *position ) / *dims +1; pTblocks= pTblocks <1? 1:pTblocks; //local leading dimension of the strip of T' (different from process to process) lld_T=pTblocks*blocksize; // Initialisation of descriptor of strips of matrix T' DESCT= ( int* ) malloc ( DLEN_ * sizeof ( int ) ); if ( DESCT==NULL ) { printf ( "unable to allocate memory for descriptor for Z\n" ); return -1; } // strip of T (k,stripcols) is distributed across ICTXT2D starting in process (0,0) in blocks of size (blocksize,blocksize) // the local leading dimension in this process is lld_T descinit_ ( DESCT, &k, &stripcols, &blocksize, &blocksize, &i_zero, &i_zero, &ICTXT2D, &lld_T, &info ); if ( info!=0 ) { printf ( "Descriptor of matrix Z returns info: %d\n",info ); return info; } // Allocation of memory for the strip of T' in all processes Tblock= ( double* ) calloc ( pTblocks*blocksize*blocksize, sizeof ( double ) ); if ( Tblock==NULL ) { printf ( "Error in allocating memory for a strip of Z in processor (%d,%d)",*position,* ( position+1 ) ); return -1; } // Initialisation of matrix D (all diagonal elements of D equal to lambda) temp=Dmat; for ( i=0,rowcur=0,colcur=0; i<Dblocks; ++i, ++colcur, ++rowcur ) { if ( rowcur==*dims ) { rowcur=0; temp += blocksize; } if ( colcur==* ( dims+1 ) ) { colcur=0; temp += blocksize*lld_D; } if ( *position==rowcur && * ( position+1 ) == colcur ) { for ( j=0; j<blocksize; ++j ) { * ( temp + j * lld_D +j ) =lambda; } if ( i==Dblocks-1 && Ddim % blocksize != 0 ) { for ( j=blocksize-1; j>= Ddim % blocksize; --j ) { * ( temp + j * lld_D + j ) =0.0; } } } } fT=fopen ( filenameT,"rb" ); if ( fT==NULL ) { printf ( "Error opening file\n" ); return -1; } // Set up of matrix D and B per strip of T' for ( ni=0; ni<nstrips; ++ni ) { if ( ni==nstrips-1 ) { if(Tblock != NULL) free ( Tblock ); Tblock=NULL; Tblock= ( double* ) calloc ( pTblocks*blocksize*blocksize, sizeof ( double ) ); if ( Tblock==NULL ) { printf ( "Error in allocating memory for a strip of Z in processor (%d,%d)\n",*position,* ( position+1 ) ); return -1; } } //Each process only reads in a part of the strip of T' //When k is not a multiple of blocksize, read-in of the last elements of the rows of T is tricky if ( ( nTblocks-1 ) % *dims == *position && k%blocksize !=0 ) { if ( ni==0 ) { info=fseek ( fT, ( long ) ( pcol * blocksize * ( k ) * sizeof ( double ) ),SEEK_SET ); if ( info!=0 ) { printf ( "Error in setting correct begin position for reading Z file\nprocessor (%d,%d), error: %d \n", *position,pcol,info ); return -1; } } else { info=fseek ( fT, ( long ) ( blocksize * ( * ( dims+1 )-1 ) * ( k ) * sizeof ( double ) ),SEEK_CUR ); if ( info!=0 ) { printf ( "Error in setting correct begin position for reading Z file\nprocessor (%d,%d), error: %d \n", *position,pcol,info ); return -1; } } for ( i=0; i<blocksize; ++i ) { info=fseek ( fT, ( long ) ( blocksize * *position * sizeof ( double ) ),SEEK_CUR ); if ( info!=0 ) { printf ( "Error in setting correct begin position for reading Z file\nprocessor (%d,%d), error: %d \n", *position,pcol,info ); return -1; } for ( j=0; j < pTblocks-1; ++j ) { fread ( Tblock + i*pTblocks*blocksize + j*blocksize,sizeof ( double ),blocksize,fT ); info=fseek ( fT, ( long ) ( ( ( *dims ) -1 ) * blocksize * sizeof ( double ) ),SEEK_CUR ); if ( info!=0 ) { printf ( "Error in setting correct begin position for reading Z file\nprocessor (%d,%d), error: %d \n", *position,pcol,info ); return -1; } } fread ( Tblock + i*pTblocks*blocksize + j*blocksize,sizeof ( double ),k%blocksize,fT ); } //Normal read-in of the strips of T from a binary file (each time blocksize elements are read in) } else { if ( ni==0 ) { info=fseek ( fT, ( long ) ( pcol * blocksize * ( k ) * sizeof ( double ) ),SEEK_SET ); if ( info!=0 ) { printf ( "Error in setting correct begin position for reading Z file\nprocessor (%d,%d), error: %d \n", *position,pcol,info ); return -1; } } else { info=fseek ( fT, ( long ) ( blocksize * ( * ( dims+1 )-1 ) * ( k ) * sizeof ( double ) ),SEEK_CUR ); if ( info!=0 ) { printf ( "Error in setting correct begin position for reading Z file\nprocessor (%d,%d), error: %d \n", *position,pcol,info ); return -1; } } for ( i=0; i<blocksize; ++i ) { info=fseek ( fT, ( long ) ( blocksize * *position * sizeof ( double ) ),SEEK_CUR ); if ( info!=0 ) { printf ( "Error in setting correct begin position for reading Z file\nprocessor (%d,%d), error: %d \n", *position,pcol,info ); return -1; } for ( j=0; j < pTblocks-1; ++j ) { fread ( Tblock + i*pTblocks*blocksize + j*blocksize,sizeof ( double ),blocksize,fT ); info=fseek ( fT, ( long ) ( ( * ( dims )-1 ) * blocksize * sizeof ( double ) ),SEEK_CUR ); if ( info!=0 ) { printf ( "Error in setting correct begin position for reading Z file\nprocessor (%d,%d), error: %d \n", *position,pcol,info ); return -1; } } fread ( Tblock + i*pTblocks*blocksize + j*blocksize,sizeof ( double ),blocksize,fT ); info=fseek ( fT, ( long ) ( ( k - blocksize * ( ( pTblocks-1 ) * *dims + *position +1 ) ) * sizeof ( double ) ),SEEK_CUR ); if ( info!=0 ) { printf ( "Error in setting correct begin position for reading Z file\nprocessor (%d,%d), error: %d \n", *position,pcol,info ); return -1; } } } blacs_barrier_ ( &ICTXT2D,"A" ); // End of read-in // Matrix D is the sum of the multiplications of all strips of T' by their transpose // Up unitl now, the entire matrix is stored, not only upper/lower triangular, which is possible since D is symmetric // Be aware, that you akways have to allocate memory for the enitre matrix, even when only dealing with the upper/lower triangular part pdgemm_ ( "N","T",&k,&k,&stripcols,&d_one, Tblock,&i_one, &i_one,DESCT, Tblock,&i_one, &i_one,DESCT, &d_one, Dmat, &i_one, &i_one, DESCD ); //Z'Z //pdsyrk_ ( "U","N",&k,&stripcols,&d_one, Tblock,&i_one, &i_one,DESCT, &d_one, Dmat, &t_plus, &t_plus, DESCD ); // Matrix B consists of X'T and Z'T, since each process only has some parts of T at its disposal, // we need to make sure that the correct columns of Z and X are multiplied with the correct columns of T. for ( i=0; i<pTblocks; ++i ) { XtT_temp.ncols=k; //This function multiplies the correct columns of X' with the blocks of T at the disposal of the process // The result is also stored immediately at the correct positions of X'T. (see src/tools.cpp) XtT_temp.clear(); mult_colsA_colsC ( Xtsparse, Tblock+i*blocksize, lld_T, ( * ( dims+1 ) * ni + pcol ) *blocksize, blocksize, ( *dims * i + *position ) *blocksize, blocksize, XtT_temp, 0 ); if ( XtT_temp.nonzeros>0 ) { if ( XtT_sparse.nonzeros==0 ){ XtT_sparse.clear(); XtT_sparse.make2 ( XtT_temp.nrows,XtT_temp.ncols,XtT_temp.nonzeros,XtT_temp.pRows,XtT_temp.pCols,XtT_temp.pData ); } else { XtT_sparse.addBCSR ( XtT_temp ); } } } //Same as above for calculating Z'T for ( i=0; i<pTblocks; ++i ) { ZtT_temp.ncols=k; ZtT_temp.clear(); mult_colsA_colsC ( Ztsparse, Tblock+i*blocksize, lld_T, ( * ( dims+1 ) * ni + pcol ) *blocksize, blocksize, blocksize * ( *dims * i + *position ), blocksize, ZtT_temp, 0 ); if ( ZtT_temp.nonzeros>0 ) { if ( ZtT_sparse.nonzeros==0 ){ ZtT_sparse.clear(); ZtT_sparse.make2 ( ZtT_temp.nrows,ZtT_temp.ncols,ZtT_temp.nonzeros,ZtT_temp.pRows,ZtT_temp.pCols,ZtT_temp.pData ); } else ZtT_sparse.addBCSR ( ZtT_temp ); } } blacs_barrier_ ( &ICTXT2D,"A" ); } XtT_temp.clear(); ZtT_temp.clear(); Xtsparse.clear(); Ztsparse.clear(); if(DESCT != NULL) free ( DESCT ); DESCT=NULL; if(Tblock != NULL) free ( Tblock ); Tblock=NULL; //printf("T read in\n"); info=fclose ( fT ); if ( info!=0 ) { printf ( "Error in closing open streams" ); return -1; } if(filenameT != NULL) free(filenameT); filenameT=NULL; //Each process only has calculated some parts of B //All parts are collected by the root process (iam==0), which assembles B //Each process then receives BT_i and B_j corresponding to the D_ij available to the process if ( iam!=0 ) { //Each process other than root sends its X' * T and Z' * T to the root process. MPI_Send ( & ( XtT_sparse.nonzeros ),1, MPI_INT,0,iam,MPI_COMM_WORLD ); MPI_Send ( & ( XtT_sparse.pRows[0] ),XtT_sparse.nrows + 1, MPI_INT,0,iam+size,MPI_COMM_WORLD ); MPI_Send ( & ( XtT_sparse.pCols[0] ),XtT_sparse.nonzeros, MPI_INT,0,iam+2*size,MPI_COMM_WORLD ); MPI_Send ( & ( XtT_sparse.pData[0] ),XtT_sparse.nonzeros, MPI_DOUBLE,0,iam+3*size,MPI_COMM_WORLD ); XtT_sparse.clear(); MPI_Send ( & ( ZtT_sparse.nonzeros ),1, MPI_INT,0,iam,MPI_COMM_WORLD ); MPI_Send ( & ( ZtT_sparse.pRows[0] ),ZtT_sparse.nrows + 1, MPI_INT,0,4*size + iam,MPI_COMM_WORLD ); MPI_Send ( & ( ZtT_sparse.pCols[0] ),ZtT_sparse.nonzeros, MPI_INT,0,iam+ 5*size,MPI_COMM_WORLD ); MPI_Send ( & ( ZtT_sparse.pData[0] ),ZtT_sparse.nonzeros, MPI_DOUBLE,0,iam+6*size,MPI_COMM_WORLD ); ZtT_sparse.clear(); //printf("Process %d sent ZtT and XtT\n",iam); // And eventually receives the necessary BT_i and B_j // Blocking sends are used, which is why the order of the receives is critical depending on the coordinates of the process int nonzeroes; if (*position >= pcol) { MPI_Recv ( &nonzeroes,1,MPI_INT,0,iam,MPI_COMM_WORLD,&status ); BT_i.allocate ( blocksize*Drows,m+l,nonzeroes ); MPI_Recv ( & ( BT_i.pRows[0] ),blocksize*Drows + 1, MPI_INT,0,iam + size,MPI_COMM_WORLD,&status ); int count; MPI_Get_count(&status,MPI_INT,&count); BT_i.nrows=count-1; MPI_Recv ( & ( BT_i.pCols[0] ),nonzeroes, MPI_INT,0,iam+2*size,MPI_COMM_WORLD,&status ); MPI_Recv ( & ( BT_i.pData[0] ),nonzeroes, MPI_DOUBLE,0,iam+3*size,MPI_COMM_WORLD,&status ); MPI_Recv ( &nonzeroes,1, MPI_INT,0,iam+4*size,MPI_COMM_WORLD,&status ); B_j.allocate ( blocksize*Dcols,m+l,nonzeroes ); MPI_Recv ( & ( B_j.pRows[0] ),blocksize*Dcols + 1, MPI_INT,0,iam + 5*size,MPI_COMM_WORLD,&status ); MPI_Get_count(&status,MPI_INT,&count); B_j.nrows=count-1; MPI_Recv ( & ( B_j.pCols[0] ),nonzeroes, MPI_INT,0,iam+6*size,MPI_COMM_WORLD,&status ); MPI_Recv ( & ( B_j.pData[0] ),nonzeroes, MPI_DOUBLE,0,iam+7*size,MPI_COMM_WORLD,&status ); //Actually BT_j is sent, so it still needs to be transposed B_j.transposeIt ( 1 ); } else { MPI_Recv ( &nonzeroes,1, MPI_INT,0,iam+4*size,MPI_COMM_WORLD,&status ); B_j.allocate ( blocksize*Dcols,m+l,nonzeroes ); MPI_Recv ( & ( B_j.pRows[0] ),blocksize*Dcols + 1, MPI_INT,0,iam + 5*size,MPI_COMM_WORLD,&status ); int count; MPI_Get_count(&status,MPI_INT,&count); B_j.nrows=count-1; MPI_Recv ( & ( B_j.pCols[0] ),nonzeroes, MPI_INT,0,iam+6*size,MPI_COMM_WORLD,&status ); MPI_Recv ( & ( B_j.pData[0] ),nonzeroes, MPI_DOUBLE,0,iam+7*size,MPI_COMM_WORLD,&status ); B_j.transposeIt ( 1 ); MPI_Recv ( &nonzeroes,1,MPI_INT,0,iam,MPI_COMM_WORLD,&status ); BT_i.allocate ( blocksize*Drows,m+l,nonzeroes ); MPI_Recv ( & ( BT_i.pRows[0] ),blocksize*Drows + 1, MPI_INT,0,iam + size,MPI_COMM_WORLD,&status ); MPI_Get_count(&status,MPI_INT,&count); BT_i.nrows=count-1; MPI_Recv ( & ( BT_i.pCols[0] ),nonzeroes, MPI_INT,0,iam+2*size,MPI_COMM_WORLD,&status ); MPI_Recv ( & ( BT_i.pData[0] ),nonzeroes, MPI_DOUBLE,0,iam+3*size,MPI_COMM_WORLD,&status ); } } else { for ( i=1; i<size; ++i ) { // The root process receives parts of X' * T and Z' * T sequentially from all processes and directly adds them together. int nonzeroes; MPI_Recv ( &nonzeroes,1,MPI_INT,i,i,MPI_COMM_WORLD,&status ); if(nonzeroes>0) { XtT_temp.allocate ( m,k,nonzeroes ); MPI_Recv ( & ( XtT_temp.pRows[0] ),m + 1, MPI_INT,i,i+size,MPI_COMM_WORLD,&status ); MPI_Recv ( & ( XtT_temp.pCols[0] ),nonzeroes, MPI_INT,i,i+2*size,MPI_COMM_WORLD,&status ); MPI_Recv ( & ( XtT_temp.pData[0] ),nonzeroes, MPI_DOUBLE,i,i+3*size,MPI_COMM_WORLD,&status ); XtT_sparse.addBCSR ( XtT_temp ); XtT_temp.clear(); } MPI_Recv ( &nonzeroes,1, MPI_INT,i,i,MPI_COMM_WORLD,&status ); if(nonzeroes>0) { ZtT_temp.allocate ( l,k,nonzeroes ); MPI_Recv ( & ( ZtT_temp.pRows[0] ),l + 1, MPI_INT,i,4*size + i,MPI_COMM_WORLD,&status ); MPI_Recv ( & ( ZtT_temp.pCols[0] ),nonzeroes, MPI_INT,i,i+ 5*size,MPI_COMM_WORLD,&status ); MPI_Recv ( & ( ZtT_temp.pData[0] ),nonzeroes, MPI_DOUBLE,i,i+6*size,MPI_COMM_WORLD,&status ); ZtT_sparse.addBCSR ( ZtT_temp ); ZtT_temp.clear(); } } XtT_sparse.transposeIt ( 1 ); ZtT_sparse.transposeIt ( 1 ); // B' is created by concatening blocks X'T and Z'T create1x2BlockMatrix ( XtT_sparse, ZtT_sparse,Btsparse ); XtT_sparse.clear(); ZtT_sparse.clear(); /*Btsparse.transposeIt(1); Btsparse.writeToFile("B_sparse.csr"); Btsparse.transposeIt(1);*/ // For each process row i BT_i is created which is also sent to processes in column i to become B_j. for ( int rowproc= *dims - 1; rowproc>= 0; --rowproc ) { BT_i.ncols=Btsparse.ncols; BT_i.nrows=0; BT_i.nonzeros=0; int Drows_rowproc; if (rowproc!=0) { Drows_rowproc= ( Dblocks - rowproc ) % *dims == 0 ? ( Dblocks- rowproc ) / *dims : ( Dblocks- rowproc ) / *dims +1; Drows_rowproc= Drows_rowproc<1? 1 : Drows_rowproc; } else Drows_rowproc=Drows; for ( i=0; i<Drows_rowproc; ++i ) { //Each process in row i can hold several blocks of contiguous rows of D for which we need the corresponding rows of B_T // Therefore we use the function extendrows to create BT_i (see src/tools.cpp) BT_i.extendrows ( Btsparse, ( i * *dims + rowproc ) * blocksize,blocksize ); } for ( int colproc= ( rowproc==0 ? 1 : 0 ); colproc < * ( dims+1 ); ++colproc ) { int rankproc; rankproc= blacs_pnum_ (&ICTXT2D, &rowproc,&colproc); MPI_Send ( & ( BT_i.nonzeros ),1, MPI_INT,rankproc,rankproc,MPI_COMM_WORLD ); MPI_Send ( & ( BT_i.pRows[0] ),BT_i.nrows + 1, MPI_INT,rankproc,rankproc+size,MPI_COMM_WORLD ); MPI_Send ( & ( BT_i.pCols[0] ),BT_i.nonzeros, MPI_INT,rankproc,rankproc+2*size,MPI_COMM_WORLD ); MPI_Send ( & ( BT_i.pData[0] ),BT_i.nonzeros, MPI_DOUBLE,rankproc,rankproc+3*size,MPI_COMM_WORLD ); //printf("BT_i's sent to processor %d\n",rankproc); rankproc= blacs_pnum_ (&ICTXT2D, &colproc,&rowproc); MPI_Send ( & ( BT_i.nonzeros ),1, MPI_INT,rankproc,rankproc+4*size,MPI_COMM_WORLD ); MPI_Send ( & ( BT_i.pRows[0] ),BT_i.nrows + 1, MPI_INT,rankproc,rankproc+5*size,MPI_COMM_WORLD ); MPI_Send ( & ( BT_i.pCols[0] ),BT_i.nonzeros, MPI_INT,rankproc,rankproc+6*size,MPI_COMM_WORLD ); MPI_Send ( & ( BT_i.pData[0] ),BT_i.nonzeros, MPI_DOUBLE,rankproc,rankproc+7*size,MPI_COMM_WORLD ); //printf("B_j's sent to processor %d\n",rankproc); } } B_j.make2 ( BT_i.nrows,BT_i.ncols,BT_i.nonzeros,BT_i.pRows,BT_i.pCols,BT_i.pData ); B_j.transposeIt ( 1 ); } return 0; }
/*@C ISLocalToGlobalMappingGetInfo - Gets the neighbor information for each processor and each index shared by more than one processor Collective on ISLocalToGlobalMapping Input Parameters: . mapping - the mapping from local to global indexing Output Parameter: + nproc - number of processors that are connected to this one . proc - neighboring processors . numproc - number of indices for each subdomain (processor) - indices - indices of nodes (in local numbering) shared with neighbors (sorted by global numbering) Level: advanced Concepts: mapping^local to global Fortran Usage: $ ISLocalToGlobalMpngGetInfoSize(ISLocalToGlobalMapping,PetscInt nproc,PetscInt numprocmax,ierr) followed by $ ISLocalToGlobalMappingGetInfo(ISLocalToGlobalMapping,PetscInt nproc, PetscInt procs[nproc],PetscInt numprocs[nproc], PetscInt indices[nproc][numprocmax],ierr) There is no ISLocalToGlobalMappingRestoreInfo() in Fortran. You must make sure that procs[], numprocs[] and indices[][] are large enough arrays, either by allocating them dynamically or defining static ones large enough. .seealso: ISLocalToGlobalMappingDestroy(), ISLocalToGlobalMappingCreateIS(), ISLocalToGlobalMappingCreate(), ISLocalToGlobalMappingRestoreInfo() @*/ PetscErrorCode PETSCVEC_DLLEXPORT ISLocalToGlobalMappingGetInfo(ISLocalToGlobalMapping mapping,PetscInt *nproc,PetscInt *procs[],PetscInt *numprocs[],PetscInt **indices[]) { PetscErrorCode ierr; PetscMPIInt size,rank,tag1,tag2,tag3,*len,*source,imdex; PetscInt i,n = mapping->n,Ng,ng,max = 0,*lindices = mapping->indices; PetscInt *nprocs,*owner,nsends,*sends,j,*starts,nmax,nrecvs,*recvs,proc; PetscInt cnt,scale,*ownedsenders,*nownedsenders,rstart,nowned; PetscInt node,nownedm,nt,*sends2,nsends2,*starts2,*lens2,*dest,nrecvs2,*starts3,*recvs2,k,*bprocs,*tmp; PetscInt first_procs,first_numprocs,*first_indices; MPI_Request *recv_waits,*send_waits; MPI_Status recv_status,*send_status,*recv_statuses; MPI_Comm comm = ((PetscObject)mapping)->comm; PetscTruth debug = PETSC_FALSE; PetscFunctionBegin; PetscValidHeaderSpecific(mapping,IS_LTOGM_COOKIE,1); ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr); ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); if (size == 1) { *nproc = 0; *procs = PETSC_NULL; ierr = PetscMalloc(sizeof(PetscInt),numprocs);CHKERRQ(ierr); (*numprocs)[0] = 0; ierr = PetscMalloc(sizeof(PetscInt*),indices);CHKERRQ(ierr); (*indices)[0] = PETSC_NULL; PetscFunctionReturn(0); } ierr = PetscOptionsGetTruth(PETSC_NULL,"-islocaltoglobalmappinggetinfo_debug",&debug,PETSC_NULL);CHKERRQ(ierr); /* Notes on ISLocalToGlobalMappingGetInfo globally owned node - the nodes that have been assigned to this processor in global numbering, just for this routine. nontrivial globally owned node - node assigned to this processor that is on a subdomain boundary (i.e. is has more than one local owner) locally owned node - node that exists on this processors subdomain nontrivial locally owned node - node that is not in the interior (i.e. has more than one local subdomain */ ierr = PetscObjectGetNewTag((PetscObject)mapping,&tag1);CHKERRQ(ierr); ierr = PetscObjectGetNewTag((PetscObject)mapping,&tag2);CHKERRQ(ierr); ierr = PetscObjectGetNewTag((PetscObject)mapping,&tag3);CHKERRQ(ierr); for (i=0; i<n; i++) { if (lindices[i] > max) max = lindices[i]; } ierr = MPI_Allreduce(&max,&Ng,1,MPIU_INT,MPI_MAX,comm);CHKERRQ(ierr); Ng++; ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr); ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); scale = Ng/size + 1; ng = scale; if (rank == size-1) ng = Ng - scale*(size-1); ng = PetscMax(1,ng); rstart = scale*rank; /* determine ownership ranges of global indices */ ierr = PetscMalloc(2*size*sizeof(PetscInt),&nprocs);CHKERRQ(ierr); ierr = PetscMemzero(nprocs,2*size*sizeof(PetscInt));CHKERRQ(ierr); /* determine owners of each local node */ ierr = PetscMalloc(n*sizeof(PetscInt),&owner);CHKERRQ(ierr); for (i=0; i<n; i++) { proc = lindices[i]/scale; /* processor that globally owns this index */ nprocs[2*proc+1] = 1; /* processor globally owns at least one of ours */ owner[i] = proc; nprocs[2*proc]++; /* count of how many that processor globally owns of ours */ } nsends = 0; for (i=0; i<size; i++) nsends += nprocs[2*i+1]; ierr = PetscInfo1(mapping,"Number of global owners for my local data %d\n",nsends);CHKERRQ(ierr); /* inform other processors of number of messages and max length*/ ierr = PetscMaxSum(comm,nprocs,&nmax,&nrecvs);CHKERRQ(ierr); ierr = PetscInfo1(mapping,"Number of local owners for my global data %d\n",nrecvs);CHKERRQ(ierr); /* post receives for owned rows */ ierr = PetscMalloc((2*nrecvs+1)*(nmax+1)*sizeof(PetscInt),&recvs);CHKERRQ(ierr); ierr = PetscMalloc((nrecvs+1)*sizeof(MPI_Request),&recv_waits);CHKERRQ(ierr); for (i=0; i<nrecvs; i++) { ierr = MPI_Irecv(recvs+2*nmax*i,2*nmax,MPIU_INT,MPI_ANY_SOURCE,tag1,comm,recv_waits+i);CHKERRQ(ierr); } /* pack messages containing lists of local nodes to owners */ ierr = PetscMalloc((2*n+1)*sizeof(PetscInt),&sends);CHKERRQ(ierr); ierr = PetscMalloc((size+1)*sizeof(PetscInt),&starts);CHKERRQ(ierr); starts[0] = 0; for (i=1; i<size; i++) { starts[i] = starts[i-1] + 2*nprocs[2*i-2];} for (i=0; i<n; i++) { sends[starts[owner[i]]++] = lindices[i]; sends[starts[owner[i]]++] = i; } ierr = PetscFree(owner);CHKERRQ(ierr); starts[0] = 0; for (i=1; i<size; i++) { starts[i] = starts[i-1] + 2*nprocs[2*i-2];} /* send the messages */ ierr = PetscMalloc((nsends+1)*sizeof(MPI_Request),&send_waits);CHKERRQ(ierr); ierr = PetscMalloc((nsends+1)*sizeof(PetscInt),&dest);CHKERRQ(ierr); cnt = 0; for (i=0; i<size; i++) { if (nprocs[2*i]) { ierr = MPI_Isend(sends+starts[i],2*nprocs[2*i],MPIU_INT,i,tag1,comm,send_waits+cnt);CHKERRQ(ierr); dest[cnt] = i; cnt++; } } ierr = PetscFree(starts);CHKERRQ(ierr); /* wait on receives */ ierr = PetscMalloc((nrecvs+1)*sizeof(PetscMPIInt),&source);CHKERRQ(ierr); ierr = PetscMalloc((nrecvs+1)*sizeof(PetscMPIInt),&len);CHKERRQ(ierr); cnt = nrecvs; ierr = PetscMalloc((ng+1)*sizeof(PetscInt),&nownedsenders);CHKERRQ(ierr); ierr = PetscMemzero(nownedsenders,ng*sizeof(PetscInt));CHKERRQ(ierr); while (cnt) { ierr = MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);CHKERRQ(ierr); /* unpack receives into our local space */ ierr = MPI_Get_count(&recv_status,MPIU_INT,&len[imdex]);CHKERRQ(ierr); source[imdex] = recv_status.MPI_SOURCE; len[imdex] = len[imdex]/2; /* count how many local owners for each of my global owned indices */ for (i=0; i<len[imdex]; i++) nownedsenders[recvs[2*imdex*nmax+2*i]-rstart]++; cnt--; } ierr = PetscFree(recv_waits);CHKERRQ(ierr); /* count how many globally owned indices are on an edge multiplied by how many processors own them. */ nowned = 0; nownedm = 0; for (i=0; i<ng; i++) { if (nownedsenders[i] > 1) {nownedm += nownedsenders[i]; nowned++;} } /* create single array to contain rank of all local owners of each globally owned index */ ierr = PetscMalloc((nownedm+1)*sizeof(PetscInt),&ownedsenders);CHKERRQ(ierr); ierr = PetscMalloc((ng+1)*sizeof(PetscInt),&starts);CHKERRQ(ierr); starts[0] = 0; for (i=1; i<ng; i++) { if (nownedsenders[i-1] > 1) starts[i] = starts[i-1] + nownedsenders[i-1]; else starts[i] = starts[i-1]; } /* for each nontrival globally owned node list all arriving processors */ for (i=0; i<nrecvs; i++) { for (j=0; j<len[i]; j++) { node = recvs[2*i*nmax+2*j]-rstart; if (nownedsenders[node] > 1) { ownedsenders[starts[node]++] = source[i]; } } } if (debug) { /* ----------------------------------- */ starts[0] = 0; for (i=1; i<ng; i++) { if (nownedsenders[i-1] > 1) starts[i] = starts[i-1] + nownedsenders[i-1]; else starts[i] = starts[i-1]; } for (i=0; i<ng; i++) { if (nownedsenders[i] > 1) { ierr = PetscSynchronizedPrintf(comm,"[%d] global node %d local owner processors: ",rank,i+rstart);CHKERRQ(ierr); for (j=0; j<nownedsenders[i]; j++) { ierr = PetscSynchronizedPrintf(comm,"%d ",ownedsenders[starts[i]+j]);CHKERRQ(ierr); } ierr = PetscSynchronizedPrintf(comm,"\n");CHKERRQ(ierr); } } ierr = PetscSynchronizedFlush(comm);CHKERRQ(ierr); }/* ----------------------------------- */ /* wait on original sends */ if (nsends) { ierr = PetscMalloc(nsends*sizeof(MPI_Status),&send_status);CHKERRQ(ierr); ierr = MPI_Waitall(nsends,send_waits,send_status);CHKERRQ(ierr); ierr = PetscFree(send_status);CHKERRQ(ierr); } ierr = PetscFree(send_waits);CHKERRQ(ierr); ierr = PetscFree(sends);CHKERRQ(ierr); ierr = PetscFree(nprocs);CHKERRQ(ierr); /* pack messages to send back to local owners */ starts[0] = 0; for (i=1; i<ng; i++) { if (nownedsenders[i-1] > 1) starts[i] = starts[i-1] + nownedsenders[i-1]; else starts[i] = starts[i-1]; } nsends2 = nrecvs; ierr = PetscMalloc((nsends2+1)*sizeof(PetscInt),&nprocs);CHKERRQ(ierr); /* length of each message */ for (i=0; i<nrecvs; i++) { nprocs[i] = 1; for (j=0; j<len[i]; j++) { node = recvs[2*i*nmax+2*j]-rstart; if (nownedsenders[node] > 1) { nprocs[i] += 2 + nownedsenders[node]; } } } nt = 0; for (i=0; i<nsends2; i++) nt += nprocs[i]; ierr = PetscMalloc((nt+1)*sizeof(PetscInt),&sends2);CHKERRQ(ierr); ierr = PetscMalloc((nsends2+1)*sizeof(PetscInt),&starts2);CHKERRQ(ierr); starts2[0] = 0; for (i=1; i<nsends2; i++) starts2[i] = starts2[i-1] + nprocs[i-1]; /* Each message is 1 + nprocs[i] long, and consists of (0) the number of nodes being sent back (1) the local node number, (2) the number of processors sharing it, (3) the processors sharing it */ for (i=0; i<nsends2; i++) { cnt = 1; sends2[starts2[i]] = 0; for (j=0; j<len[i]; j++) { node = recvs[2*i*nmax+2*j]-rstart; if (nownedsenders[node] > 1) { sends2[starts2[i]]++; sends2[starts2[i]+cnt++] = recvs[2*i*nmax+2*j+1]; sends2[starts2[i]+cnt++] = nownedsenders[node]; ierr = PetscMemcpy(&sends2[starts2[i]+cnt],&ownedsenders[starts[node]],nownedsenders[node]*sizeof(PetscInt));CHKERRQ(ierr); cnt += nownedsenders[node]; } } } /* receive the message lengths */ nrecvs2 = nsends; ierr = PetscMalloc((nrecvs2+1)*sizeof(PetscInt),&lens2);CHKERRQ(ierr); ierr = PetscMalloc((nrecvs2+1)*sizeof(PetscInt),&starts3);CHKERRQ(ierr); ierr = PetscMalloc((nrecvs2+1)*sizeof(MPI_Request),&recv_waits);CHKERRQ(ierr); for (i=0; i<nrecvs2; i++) { ierr = MPI_Irecv(&lens2[i],1,MPIU_INT,dest[i],tag2,comm,recv_waits+i);CHKERRQ(ierr); } /* send the message lengths */ for (i=0; i<nsends2; i++) { ierr = MPI_Send(&nprocs[i],1,MPIU_INT,source[i],tag2,comm);CHKERRQ(ierr); } /* wait on receives of lens */ if (nrecvs2) { ierr = PetscMalloc(nrecvs2*sizeof(MPI_Status),&recv_statuses);CHKERRQ(ierr); ierr = MPI_Waitall(nrecvs2,recv_waits,recv_statuses);CHKERRQ(ierr); ierr = PetscFree(recv_statuses);CHKERRQ(ierr); } ierr = PetscFree(recv_waits); starts3[0] = 0; nt = 0; for (i=0; i<nrecvs2-1; i++) { starts3[i+1] = starts3[i] + lens2[i]; nt += lens2[i]; } nt += lens2[nrecvs2-1]; ierr = PetscMalloc((nt+1)*sizeof(PetscInt),&recvs2);CHKERRQ(ierr); ierr = PetscMalloc((nrecvs2+1)*sizeof(MPI_Request),&recv_waits);CHKERRQ(ierr); for (i=0; i<nrecvs2; i++) { ierr = MPI_Irecv(recvs2+starts3[i],lens2[i],MPIU_INT,dest[i],tag3,comm,recv_waits+i);CHKERRQ(ierr); } /* send the messages */ ierr = PetscMalloc((nsends2+1)*sizeof(MPI_Request),&send_waits);CHKERRQ(ierr); for (i=0; i<nsends2; i++) { ierr = MPI_Isend(sends2+starts2[i],nprocs[i],MPIU_INT,source[i],tag3,comm,send_waits+i);CHKERRQ(ierr); } /* wait on receives */ if (nrecvs2) { ierr = PetscMalloc(nrecvs2*sizeof(MPI_Status),&recv_statuses);CHKERRQ(ierr); ierr = MPI_Waitall(nrecvs2,recv_waits,recv_statuses);CHKERRQ(ierr); ierr = PetscFree(recv_statuses);CHKERRQ(ierr); } ierr = PetscFree(recv_waits);CHKERRQ(ierr); ierr = PetscFree(nprocs);CHKERRQ(ierr); if (debug) { /* ----------------------------------- */ cnt = 0; for (i=0; i<nrecvs2; i++) { nt = recvs2[cnt++]; for (j=0; j<nt; j++) { ierr = PetscSynchronizedPrintf(comm,"[%d] local node %d number of subdomains %d: ",rank,recvs2[cnt],recvs2[cnt+1]);CHKERRQ(ierr); for (k=0; k<recvs2[cnt+1]; k++) { ierr = PetscSynchronizedPrintf(comm,"%d ",recvs2[cnt+2+k]);CHKERRQ(ierr); } cnt += 2 + recvs2[cnt+1]; ierr = PetscSynchronizedPrintf(comm,"\n");CHKERRQ(ierr); } } ierr = PetscSynchronizedFlush(comm);CHKERRQ(ierr); } /* ----------------------------------- */ /* count number subdomains for each local node */ ierr = PetscMalloc(size*sizeof(PetscInt),&nprocs);CHKERRQ(ierr); ierr = PetscMemzero(nprocs,size*sizeof(PetscInt));CHKERRQ(ierr); cnt = 0; for (i=0; i<nrecvs2; i++) { nt = recvs2[cnt++]; for (j=0; j<nt; j++) { for (k=0; k<recvs2[cnt+1]; k++) { nprocs[recvs2[cnt+2+k]]++; } cnt += 2 + recvs2[cnt+1]; } } nt = 0; for (i=0; i<size; i++) nt += (nprocs[i] > 0); *nproc = nt; ierr = PetscMalloc((nt+1)*sizeof(PetscInt),procs);CHKERRQ(ierr); ierr = PetscMalloc((nt+1)*sizeof(PetscInt),numprocs);CHKERRQ(ierr); ierr = PetscMalloc((nt+1)*sizeof(PetscInt*),indices);CHKERRQ(ierr); ierr = PetscMalloc(size*sizeof(PetscInt),&bprocs);CHKERRQ(ierr); cnt = 0; for (i=0; i<size; i++) { if (nprocs[i] > 0) { bprocs[i] = cnt; (*procs)[cnt] = i; (*numprocs)[cnt] = nprocs[i]; ierr = PetscMalloc(nprocs[i]*sizeof(PetscInt),&(*indices)[cnt]);CHKERRQ(ierr); cnt++; } } /* make the list of subdomains for each nontrivial local node */ ierr = PetscMemzero(*numprocs,nt*sizeof(PetscInt));CHKERRQ(ierr); cnt = 0; for (i=0; i<nrecvs2; i++) { nt = recvs2[cnt++]; for (j=0; j<nt; j++) { for (k=0; k<recvs2[cnt+1]; k++) { (*indices)[bprocs[recvs2[cnt+2+k]]][(*numprocs)[bprocs[recvs2[cnt+2+k]]]++] = recvs2[cnt]; } cnt += 2 + recvs2[cnt+1]; } } ierr = PetscFree(bprocs);CHKERRQ(ierr); ierr = PetscFree(recvs2);CHKERRQ(ierr); /* sort the node indexing by their global numbers */ nt = *nproc; for (i=0; i<nt; i++) { ierr = PetscMalloc(((*numprocs)[i])*sizeof(PetscInt),&tmp);CHKERRQ(ierr); for (j=0; j<(*numprocs)[i]; j++) { tmp[j] = lindices[(*indices)[i][j]]; } ierr = PetscSortIntWithArray((*numprocs)[i],tmp,(*indices)[i]);CHKERRQ(ierr); ierr = PetscFree(tmp);CHKERRQ(ierr); } if (debug) { /* ----------------------------------- */ nt = *nproc; for (i=0; i<nt; i++) { ierr = PetscSynchronizedPrintf(comm,"[%d] subdomain %d number of indices %d: ",rank,(*procs)[i],(*numprocs)[i]);CHKERRQ(ierr); for (j=0; j<(*numprocs)[i]; j++) { ierr = PetscSynchronizedPrintf(comm,"%d ",(*indices)[i][j]);CHKERRQ(ierr); } ierr = PetscSynchronizedPrintf(comm,"\n");CHKERRQ(ierr); } ierr = PetscSynchronizedFlush(comm);CHKERRQ(ierr); } /* ----------------------------------- */ /* wait on sends */ if (nsends2) { ierr = PetscMalloc(nsends2*sizeof(MPI_Status),&send_status);CHKERRQ(ierr); ierr = MPI_Waitall(nsends2,send_waits,send_status);CHKERRQ(ierr); ierr = PetscFree(send_status);CHKERRQ(ierr); } ierr = PetscFree(starts3);CHKERRQ(ierr); ierr = PetscFree(dest);CHKERRQ(ierr); ierr = PetscFree(send_waits);CHKERRQ(ierr); ierr = PetscFree(nownedsenders);CHKERRQ(ierr); ierr = PetscFree(ownedsenders);CHKERRQ(ierr); ierr = PetscFree(starts);CHKERRQ(ierr); ierr = PetscFree(starts2);CHKERRQ(ierr); ierr = PetscFree(lens2);CHKERRQ(ierr); ierr = PetscFree(source);CHKERRQ(ierr); ierr = PetscFree(len);CHKERRQ(ierr); ierr = PetscFree(recvs);CHKERRQ(ierr); ierr = PetscFree(nprocs);CHKERRQ(ierr); ierr = PetscFree(sends2);CHKERRQ(ierr); /* put the information about myself as the first entry in the list */ first_procs = (*procs)[0]; first_numprocs = (*numprocs)[0]; first_indices = (*indices)[0]; for (i=0; i<*nproc; i++) { if ((*procs)[i] == rank) { (*procs)[0] = (*procs)[i]; (*numprocs)[0] = (*numprocs)[i]; (*indices)[0] = (*indices)[i]; (*procs)[i] = first_procs; (*numprocs)[i] = first_numprocs; (*indices)[i] = first_indices; break; } } PetscFunctionReturn(0); }
//M+ void mp( int MinCoreSize, int MaxCoreSize, int SamplingFreq, int NumReplicates, char* OutFilePath, std::string Kernel, vector<int> KernelAccessionIndex, vector<int> AccessionNameList, vector<vector<vector<int> > > ActiveAlleleByPopList, vector<vector<vector<int> > > TargetAlleleByPopList, vector<int> ActiveMaxAllelesList, vector<int> TargetMaxAllelesList, vector<std::string> FullAccessionNameList ) { //PERFORM INITIAL MPI STUFF MPI_Status status; //this struct contains three fields which will contain info about the sender of a received message // MPI_SOURCE, MPI_TAG, MPI_ERROR //MPI::Init (); //Initialize MPI. int nproc = MPI::COMM_WORLD.Get_size ( ); //Get the number of processes. int procid = MPI::COMM_WORLD.Get_rank ( ); //Get the individual process ID. //set up vectors to fill with results //below is a stupid way to calculate the number of rows in the output file, value l (which = V1) //used to monitor progress and as the maximum vector index for shared output vectors int l=0; for (int i=MinCoreSize;i<MaxCoreSize+1;i=i+SamplingFreq) { for (int j=0;j<NumReplicates;j++) { l++; } } double V1 = (double)l; //(MaxCoreSize - MinCoreSize + 1)*NumReplicates; //number of rows in output vectors vector<vector<double> > Results(V1, vector<double>(9)); //will contain numerical results vector<vector<string> > Members(V1); //will contain core set members //***MPI: RECEIVE RESULTS AT MASTER 0 //receive values from any slave, in any order, exiting when the number of 'receives' = the top vector size if ( procid == 0 ) { //set up variables for monitoring progress int percent; //percent of analysis completed int progindex = 0; //index to monitor progress, percent = 100*(progindex/l) //receive and process results from slave processors unsigned int i = 0; while (i<2*(Results.size())) //two receives per row { //probe the incoming message to determine its tag int nchar; //will contain the length of the char array passed with tag=1 int vchar; //will contain the length of the vector passed with tag=0 int tag; //tag of message from sender int source; //procid of sender MPI_Probe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status); //MPI_Get_count(&status, MPI_CHAR, &nchar); //probes the length of the message, saves it in nchar tag = status.MPI_TAG; //the tag defines which kind of comm it is, a vector of stats (0=resvec()) //or a char array describing the members of the core (1=cc) source = status.MPI_SOURCE; //determine the source of the message so that you can define which sender to Recv from. This will avoid an intervening message coming in after the MPI_Probe with a different length, causing a message truncated error. if (tag == 0) { //determine the length of the message tagged 0 MPI_Get_count(&status, MPI_DOUBLE, &vchar); //cout <<" vchar="<<vchar<<" tag="<<tag<<" MPI_SOURCE="<<status.MPI_SOURCE<<" MPI_ERROR="<<status.MPI_ERROR<<"\n"; //receive the vector of results, tagged 0, from: //MPI_Send(&resvec[0], resvec.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD); vector<double> t(10); MPI_Recv(&t[0], vchar, MPI_DOUBLE, source, 0, MPI_COMM_WORLD, &status); //load data from vector received onto Results, row number is last item t[9] for (int j=0;j<9;++j) { Results[ t[9] ][j] = t[j]; } t.clear(); } else if (tag == 1) { //determine the length of the message tagged 1 MPI_Get_count(&status, MPI_CHAR, &nchar); //probes the length of the message, saves it in nchar //cout <<" nchar="<<nchar<<" tag="<<tag<<" MPI_SOURCE="<<status.MPI_SOURCE<<" MPI_ERROR="<<status.MPI_ERROR<<"\n"; //receive the vector<string> of the core set, tagged 1, from: //MPI_Send(&m[0], nchar, MPI_CHAR, 0, 1, MPI_COMM_WORLD); //vector<string> m(nchar); char m[nchar]; MPI_Recv(&m[0], nchar, MPI_CHAR, source, 1, MPI_COMM_WORLD, &status); //load core set onto Members //1. convert char array into a string string mstr(m); //2. split string on delimiter ',<!>,' string delim = ",<!>,"; vector<string> mvec( countSubstring(mstr, delim) ); unsigned int st = 0; std::size_t en = mstr.find(delim); int k = 0; while (en != std::string::npos) { mvec[k] = mstr.substr(st, en-st); st = en + delim.length(); en = mstr.find(delim,st); ++k; } string z = mstr.substr(st); //get row number as last item in mstr int zz = atoi(z.c_str()); //convert string to c-string then to int //3. load onto Members Members[zz] = mvec; //4. clean up memset(m, 0, nchar);; mstr=""; mvec.clear(); } ++i; //display progress progindex = progindex + 1; percent = 100*( progindex/(V1*2) ); //number of rows X 2 repeats needed to complete search printProgBar(percent); } }//***MPI: END MASTER RECEIVE***/ /***MPI: SEND RESULTS FROM SLAVE PROCESSES***/ else if ( procid != 0 ) { unsigned int r; //r = core size, //int nr, RandAcc, b, bsc, plateau; //nr = controller to repeat NumReplicates times int RandAcc, b, bsc, plateau; //nr = controller to repeat NumReplicates times //row = result vector row number, bsc = holds best sub core member, and other indexed accessions //plateau = index of the number of reps in optimization loop with same diversity value double RandomActiveDiversity; double AltRandomActiveDiversity; double StartingRandomActiveDiversity; double StartingAltRandomActiveDiversity; double RandomTargetDiversity; double AltRandomTargetDiversity; double StartingDiversity; double TempAltOptimizedActiveDiversity; double AltOptimizedActiveDiversity; double OptimizedTargetDiversity; double AltOptimizedTargetDiversity; double best; double nnew; vector<vector<vector<int> > > AlleleList; vector<vector<vector<int> > > CoreAlleles; vector<vector<vector<int> > > TdTempList; vector<vector<vector<int> > > BestSubCoreAlleles; std::string Standardize = "yes"; //a run that mimics the MSTRAT approach can be accomplished by setting Standardize="no", and setting up the var file so that each column in the .dat file is treated as a single locus, rather than two (or more) adjacent columns being treated as a single codominant locus. vector<int> AccessionsInCore; vector<int> AccessionsInSubCore; vector<int> BestSubCore; vector<int> BestSubCoreRevSorted; vector<int> TempList; vector<int> TempList2; vector<int> bestcore; vector<std::string> TempListStr; //seed the random number generator for each processor int tt; tt = (time(NULL)); srand ( abs(((tt*181)*((procid-83)*359))%104729) ); //do parallelization so that each rep by core size combo can be //handled by a distinct thread. this involves figuring out the total //number of reps*coresizes taking into account the SamplingFreq int rsteps = 1 + floor( (MaxCoreSize - MinCoreSize) / SamplingFreq ); //number of steps from MinCoreSize to MaxCoreSize //***MPI: figure out where to start and stop loop for each processor int nreps = rsteps*NumReplicates; int count = nreps/(nproc-1); //p-1 assumes a master, i.e. one less processor than total int start = (procid-1) * count; //procid-1 makes you start at 0, assumes master is p0 int stop; if (nreps % (nproc-1) > (procid-1)) { start += procid - 1; stop = start + (count + 1); } else { start += nreps % (nproc-1); stop = start + count; } //iterate thru the relevant rows for (int rnr=start;rnr<stop;++rnr) { r = MinCoreSize + ((rnr / NumReplicates) * SamplingFreq); //int rounds to floor //develop random starting core set //clear AccessionsInCore and set size AccessionsInCore.clear(); AccessionsInCore.resize(r); //add kernel accessions to core, if necessary if (Kernel == "yes") { for (unsigned int i=0;i<KernelAccessionIndex.size();i++) { AccessionsInCore[i] = KernelAccessionIndex[i]; } } //clear TempList and set size TempList.clear(); TempList.resize( AccessionNameList.size() ); //set list of available accessions in TempList, by erasing those already in the core TempList = AccessionNameList; //expunge the kernel accessions, so they are not available for random addition below //KernelAccessionIndex has been reverse sorted so you don't go outside range after automatic resize by .erase for (unsigned int i=0;i<KernelAccessionIndex.size();i++) { b = KernelAccessionIndex[i]; TempList.erase(TempList.begin()+b); } //randomly add accessions until r accessions are in the core. if there is a kernel, include those (done above) //plus additional, randomly selected accessions, until you get r accessions for (unsigned int i=KernelAccessionIndex.size();i<r;i++) { //choose an accession randomly from those available RandAcc = rand() % TempList.size(); //add it to the list AccessionsInCore[i] = TempList[RandAcc]; //remove it from the list of available accessions TempList.erase(TempList.begin()+RandAcc); } //assemble genotypes for random core and calculate diversity //1. put together initial list of active alleles CoreAlleles.clear(); CoreAlleles.resize( AccessionsInCore.size() ); for (unsigned int i=0;i<AccessionsInCore.size();i++) { b = AccessionsInCore[i]; CoreAlleles[i] = ActiveAlleleByPopList[b]; } //2. calculate diversity from random selection at active loci AlleleList.clear(); AlleleList = CoreAlleles; MyCalculateDiversity(AlleleList, ActiveMaxAllelesList, Standardize, RandomActiveDiversity, AltRandomActiveDiversity); //in MyCalculateDiversity, latter two variables are updated as references //save them away in non-updated variables StartingRandomActiveDiversity = RandomActiveDiversity; StartingAltRandomActiveDiversity = AltRandomActiveDiversity; //3. calculate diversity from random selection at target loci AlleleList.clear(); AlleleList.resize( AccessionsInCore.size() ); for (unsigned int j=0;j<AccessionsInCore.size();j++) { b = AccessionsInCore[j]; AlleleList[j] = TargetAlleleByPopList[b]; } MyCalculateDiversity(AlleleList, TargetMaxAllelesList, Standardize, RandomTargetDiversity, AltRandomTargetDiversity); //BEGIN OPTIMIZATION StartingDiversity = 0; //this is the diversity recovered during the prior iteration. plateau = 0; //count of the number of times you have found the best value, evaluates when you are //stuck on a plateau, assuming acceptance criterion allows downhill steps //this is the iterations step, now an indefinite loop that is broken when //no improvement is made during the course of the optimization algorithm //If r = kernel size = MinCoreSize then do no optimization but still calculate all variables. if (KernelAccessionIndex.size() == r) { //assemble genotypes for core //1. put together initial list CoreAlleles.clear(); CoreAlleles.resize(r); for (unsigned int i=0;i<r;i++) { b = AccessionsInCore[i]; CoreAlleles[i] = ActiveAlleleByPopList[b]; } AlleleList = CoreAlleles; MyCalculateDiversity(AlleleList, ActiveMaxAllelesList, Standardize, RandomActiveDiversity, AltRandomActiveDiversity); best = RandomActiveDiversity; //best is equivalent to OptimizedActiveDiversity AltOptimizedActiveDiversity = AltRandomActiveDiversity; } else { //do optimization while ( true ) { //assemble genotypes for core //1. put together initial list CoreAlleles.clear(); CoreAlleles.resize(r); for (unsigned int i=0;i<r;i++) { b = AccessionsInCore[i]; CoreAlleles[i] = ActiveAlleleByPopList[b]; } //2. go through all possible subsets of size r-1, one at a time, noting which is best. //If there is a kernel, do not swap out any of those accessions (they are retained as the //first KernelAccessionIndex.size() items in CoreAlleles). Accomplished by starting for loop //at KernelAccessionIndex.size(). best=0; for (unsigned int i=KernelAccessionIndex.size();i<CoreAlleles.size();i++) { //remove each item consecutively from the list of all populations in the core AlleleList.clear(); TdTempList.clear(); TdTempList = CoreAlleles; //swap to temporary vector TdTempList.erase( TdTempList.begin() + i); AlleleList = TdTempList; TempList2.clear(); TempList2 = AccessionsInCore; TempList2.erase(TempList2.begin() + i); AccessionsInSubCore = TempList2; /*Data structure for SubCoreAlleles: SubCore 1..r Population 1..(r-1) AlleleArray 1..NumLoci --3. fuse alleles from the same locus into a single array, for all accessions, for the current subcore --4. assemble a list of diversity (M) for each locus separately --5. standardize the M values to the maximum possible number of alleles at that locus, and add them up to get final estimate of standardized allelic diversity in the core. then divide by the number of loci to get a number that is comparable across data sets. --5.5. simultaneous to the calculation, keep track of which subcore is best */ MyCalculateDiversity(AlleleList, ActiveMaxAllelesList, Standardize, RandomActiveDiversity, AltRandomActiveDiversity); nnew = RandomActiveDiversity; if (nnew >= best) // >= allows sideways movement during hill climbing { best = nnew; BestSubCore.clear(); BestSubCore = AccessionsInSubCore; BestSubCoreAlleles.clear(); BestSubCoreAlleles = AlleleList; } } //for loop cycles thru all subcores //reverse sort BestSubCore to support easy assembly of pared TempList below BestSubCoreRevSorted = BestSubCore; std::sort(BestSubCoreRevSorted.begin(), BestSubCoreRevSorted.end(), std::greater<int>()); /* 6. take the subcore with greatest diversity and consecutively add each possible additional accession from the base collection. find the core of size r (not r-1 subcore) that has the greatest diversity. suppress the IDs of those accessions found in the BestSubCore from the list of all accessions to get a list of remaining accessions.*/ TempList = AccessionNameList; for (unsigned int k=0;k<BestSubCoreRevSorted.size();k++) { bsc = BestSubCoreRevSorted[k]; TempList.erase( TempList.begin() + bsc ); } //shuffle the list of remaining accessions, so addition order is not predictable std::random_shuffle (TempList.begin(), TempList.end()); //add each remaining accession consecutively, calculate diversity, test //whether it is better than the prior one best = 0; for (unsigned int k=0;k<TempList.size();k++) { bsc = TempList[k]; //define the core TempList2 = BestSubCore; TempList2.resize( TempList2.size() + 1 ); //TempList2.push_back(i); TempList2[TempList2.size()-1] = bsc; //add new accession to last vector element AccessionsInCore = TempList2; //assemble the allelelist for the core TdTempList = BestSubCoreAlleles; TdTempList.resize( TdTempList.size() + 1 ); //TdTempList.push_back( ActiveAlleleByPopList[i] ); TdTempList[TdTempList.size()-1] = ActiveAlleleByPopList[bsc]; AlleleList = TdTempList; //calculate diversity MyCalculateDiversity(AlleleList, ActiveMaxAllelesList, Standardize, nnew, TempAltOptimizedActiveDiversity); //test whether current diversity is higher than the best diversity found so far if (nnew >= best) // >= allows sideways movement during hill climbing { best = nnew; bestcore = AccessionsInCore; //save the alternative diversity value for the best core AltOptimizedActiveDiversity = TempAltOptimizedActiveDiversity; } } AccessionsInCore = bestcore; //define starting variable for next MSTRAT iteration //if there has been no improvement from the prior iteration, you have reached // the plateau and should exit the repeat if (best == StartingDiversity) { plateau++; if (plateau > 0) break; } //update starting value and repeat else if (best > StartingDiversity) StartingDiversity = best; } //while(true) endless loop } //7. Calculate diversity at target loci //assemble the target loci allelelist for the accessions in the best core AlleleList.clear(); AlleleList.resize( AccessionsInCore.size() ); for (unsigned int j=0;j<AccessionsInCore.size();j++) { b = AccessionsInCore[j]; AlleleList[j] = TargetAlleleByPopList[b]; } //calculate diversity at target loci based upon the optimized core selection MyCalculateDiversity(AlleleList, TargetMaxAllelesList, Standardize, OptimizedTargetDiversity, AltOptimizedTargetDiversity); //8. Assemble stats for optimized core and add to output vectors //create a list of accession names from the list of accession ID's in AccessionsInCore sort( AccessionsInCore.begin(), AccessionsInCore.end() ); TempListStr.clear(); TempListStr.resize(r); for (unsigned int i=0;i<AccessionsInCore.size();i++) { b = AccessionsInCore[i]; TempListStr[i] = FullAccessionNameList[b]; } /***MPI: BUILD & SEND RESULTS VECTOR***/ //load the variables onto the results vectors //no need to calculate row number, it is the same as rnr, formula saved because it might be useful later //row = ((r - MinCoreSize)*NumReplicates) + nr - ( (NumReplicates*(SamplingFreq-1))*( (r-MinCoreSize)/SamplingFreq ) ); // (r - MinCoreSize)*NumReplicates) + nr specifies row number if SamplingFreq=1 // (NumReplicates*(SamplingFreq-1)) specifies a step value to correct when SamplingFreq>1 // ( (r-MinCoreSize)/SamplingFreq ) specifies the replicate on core size, accounting for SamplingFreq // see file Calculation of row value.xlsx for development of the 'row' index //put results 0-8 into a vector, resvec, return row as last item vector<double> resvec(10); resvec[0] = double(r); resvec[1] = StartingRandomActiveDiversity;//RandomActiveDiversity; resvec[2] = best; //equivalent to OptimizedActiveDiversity resvec[3] = RandomTargetDiversity; resvec[4] = OptimizedTargetDiversity; resvec[5] = StartingAltRandomActiveDiversity;//AltRandomActiveDiversity; resvec[6] = AltOptimizedActiveDiversity; resvec[7] = AltRandomTargetDiversity; resvec[8] = AltOptimizedTargetDiversity; resvec[9] = double(rnr); //cout<<"MPI_Rank="<<MPI_Rank<<" //send result vector to master 0, send row number, rnr, as last element. //message is tagged as 0 //here you are pointing to the first element, then returning resvec.size() doubles- //worth of memory from that starting location. MPI_Send(&resvec[0], resvec.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD); /***MPI: END BUILD & SEND RESULTS VECTOR***/ /***MPI: BUILD & SEND MEMBERS VECTOR***/ //add row number as last item in TempListStr TempListStr.resize(TempListStr.size()+1); stringstream ss; ss << rnr; //convert int to stringstream to string TempListStr[ TempListStr.size() - 1 ] = ss.str(); //convert vector<string> to a single, ',<!>,' delimited, string string concat; for (unsigned int i=0;i<TempListStr.size();++i) { concat += TempListStr[i]; //add vector element if (i<TempListStr.size()-1) concat += ",<!>,"; //add delimiter, except for last item } //convert the string to a char array char cc[concat.size()+1]; strcpy(cc, concat.c_str()); //send the char array to master0 tagged as 1 //tagged as 1 to distinguish from result vector send MPI_Send(&cc, sizeof(cc), MPI_CHAR, 0, 1, MPI_COMM_WORLD); } //end for loop over rows } //***MPI: END SEND /*MPI: MASTER 0 WRITES OUTPUT*/ if ( procid == 0 ) { //set up file stream for output file ofstream output; output.open(OutFilePath); output.close(); //quick open close done to clear any existing file each time program is run output.open(OutFilePath, ios::out | ios::app); //open file in append mode output << "core size random reference diversity optimized reference diversity random target diversity optimized target diversity alt random reference diversity alt optimized reference diversity alt random target diversity alt optimized target diversity core members" << "\n"; //write out results row by row for (int i=0;i<V1;i++) { //write variables output << Results[i][0] << " " << Results[i][1] << " " << Results[i][2] << " " << Results[i][3] << " " << Results[i][4] << " " << Results[i][5] << " " << Results[i][6] << " " << Results[i][7] << " " << Results[i][8] << " " << "("; //write Accessions retained for (unsigned int j=0;j<Members[i].size();j++) { if ( j==(Members[i].size() - 1) ) { //add trailing parentheses and move to next row output << Members[i][j] << ")\n"; } else { output << Members[i][j] << ","; } } } //wrap up write step output.close(); } /***MPI: END MASTER WRITE***/ //Terminate MPI. //MPI::Finalize ( ); }
void update_particle_ghosts( int** ll_addr, int*** hoc, pp3mg_particle** particles_addr, double x, double y, double z, int m, int n, int o, int ghosts, int* n_stored_particles, int* max_particles, int m_start, int m_end, int n_start, int n_end, int o_start, int o_end, MPI_Comm mpi_comm_cart ) { /* Local variables */ pp3mg_particle* particles; int* ll; /* Variables for MPI */ int mpi_self; int mpi_left, mpi_right, mpi_lower, mpi_upper, mpi_back, mpi_front; int mpi_count; int mpi_blockcounts[2]; MPI_Aint mpi_offsets[2]; MPI_Request mpi_req[2]; MPI_Status mpi_stat[2]; MPI_Datatype mpi_type_particle, mpi_oldtypes[2]; /* Other variables */ int count, p, start; int i, j, k; /* Initializing MPI variables */ MPI_Comm_rank( mpi_comm_cart, &mpi_self ); MPI_Cart_shift( mpi_comm_cart, 0, 1, &mpi_left, &mpi_right ); MPI_Cart_shift( mpi_comm_cart, 1, 1, &mpi_lower, &mpi_upper ); MPI_Cart_shift( mpi_comm_cart, 2, 1, &mpi_back, &mpi_front ); mpi_req[0] = MPI_REQUEST_NULL; mpi_req[1] = MPI_REQUEST_NULL; /* Creating particle type for MPI */ mpi_blockcounts[0] = 8; mpi_offsets[0] = 0; mpi_oldtypes[0] = MPI_DOUBLE; MPI_Type_struct( 1, mpi_blockcounts, mpi_offsets, mpi_oldtypes, &mpi_type_particle ); MPI_Type_commit( &mpi_type_particle ); particles = *particles_addr; ll = *ll_addr; /* -------------------------------------------------------------------------- * * Sending particles to right neighbor * * -------------------------------------------------------------------------- */ count = 0; for( i = m_end-m_start+1; i <= m_end-m_start+ghosts; i++ ) for( j = 0; j <= n_end-n_start+2*ghosts; j++ ) for( k = 0; k <= o_end-o_start+2*ghosts; k++ ){ p = hoc[i][j][k]; while( p >= 0 ){ count++; p = ll[p]; } } #ifdef DEBUG printf("Rank %d: n_stored_particles = %d, max_particles = %d, count = %d\n",mpi_self,*n_stored_particles,*max_particles,count); #endif start = *max_particles - count; while( start <= ( *n_stored_particles + 27*count ) ) { *max_particles = *max_particles*2; *particles_addr = (pp3mg_particle*) realloc(particles,*max_particles*sizeof(pp3mg_particle)); *ll_addr = (int*) realloc(ll,*max_particles*sizeof(int)); if (*particles_addr == NULL || *ll_addr == NULL) { printf("Realloc failed!"); exit(1); } else { particles = *particles_addr; ll = *ll_addr; for (int i=*max_particles>>1; i<*max_particles; i++) ll[i] = -1; start = *max_particles - count; #ifdef DEBUG printf("Rank %d: Reallocated. Now max_particles = %d\n",mpi_self,*max_particles); #endif } } count = 0; /* If at one end, shift particles */ if( m_end == (m-1) ){ for( i = m_end-m_start+1; i <= m_end-m_start+ghosts; i++ ) for( j = 0; j <= n_end-n_start+2*ghosts; j++ ) for( k = 0; k <= o_end-o_start+2*ghosts; k++ ){ p = hoc[i][j][k]; while( p >= 0 ){ particles[start+count] = particles[p]; particles[start+count].x = particles[p].x - x; count++; p = ll[p]; } } } else{ for( i = m_end-m_start+1; i <= m_end-m_start+ghosts; i++ ) for( j = 0; j <= n_end-n_start+2*ghosts; j++ ) for( k = 0; k <= o_end-o_start+2*ghosts; k++ ){ p = hoc[i][j][k]; while( p >= 0 ){ particles[start+count] = particles[p]; count++; p = ll[p]; } } } if( mpi_right == mpi_self ){ /* if( periodic ){ */ for( p = 0; p < count; p++ ){ particles[*n_stored_particles+p] = particles[start+p]; } *n_stored_particles += count; /* } */ } else{ MPI_Isend( &particles[start], count, mpi_type_particle, mpi_right, 1, mpi_comm_cart, &mpi_req[0] ); MPI_Irecv( &particles[*n_stored_particles], *max_particles-*n_stored_particles, mpi_type_particle, mpi_left, 1, mpi_comm_cart, &mpi_req[1] ); MPI_Waitall( 2, mpi_req, mpi_stat ); MPI_Get_count( &mpi_stat[1], mpi_type_particle, &mpi_count ); count = mpi_count; *n_stored_particles += count; } if( *n_stored_particles >= start ) { printf("Buffer too small!\n"); exit(1); } /* Updating linked list */ if( count > 0 ) update_linked_list( ll, hoc, particles,*n_stored_particles-count, *n_stored_particles, x, y, z, m, n, o, m_start-ghosts, m_end+ghosts, n_start-ghosts, n_end+ghosts, o_start-ghosts, o_end+ghosts ); /* -------------------------------------------------------------------------- * * Sending particles to left neighbor * * -------------------------------------------------------------------------- */ count = 0; for( i = ghosts; i <= 2*ghosts-1; i++ ) for( j = 0; j <= n_end-n_start+2*ghosts; j++ ) for( k = 0; k <= o_end-o_start+2*ghosts; k++ ){ p = hoc[i][j][k]; while( p >= 0 ){ count++; p = ll[p]; } } #ifdef DEBUG printf("Rank %d: n_stored_particles = %d, max_particles = %d, count = %d\n",mpi_self,*n_stored_particles,*max_particles,count); #endif start = *max_particles - count; while( start <= ( *n_stored_particles + 27*count ) ) { *max_particles = *max_particles*2; *particles_addr = (pp3mg_particle*) realloc(particles,*max_particles*sizeof(pp3mg_particle)); *ll_addr = (int*) realloc(ll,*max_particles*sizeof(int)); if (*particles_addr == NULL || *ll_addr == NULL) { printf("Realloc failed!"); exit(1); } else { particles = *particles_addr; ll = *ll_addr; for (int i=*max_particles>>1; i<*max_particles; i++) ll[i] = -1; start = *max_particles - count; #ifdef DEBUG printf("Rank %d: Reallocated. Now max_particles = %d\n",mpi_self,*max_particles); #endif } } count = 0; /* If at one end, shift particles */ if( m_start == 0 ){ for( i = ghosts; i <= 2*ghosts-1; i++ ) for( j = 0; j <= n_end-n_start+2*ghosts; j++ ) for( k = 0; k <= o_end-o_start+2*ghosts; k++ ){ p = hoc[i][j][k]; while( p >= 0 ){ particles[start+count].x = particles[p].x + x; particles[start+count].y = particles[p].y; particles[start+count].z = particles[p].z; particles[start+count].q = particles[p].q; particles[start+count].e = particles[p].e; particles[start+count].fx = particles[p].fx; particles[start+count].fy = particles[p].fy; particles[start+count].fz = particles[p].fz; count++; p = ll[p]; } } }else{ for( i = ghosts; i <= 2*ghosts-1; i++ ) for( j = 0; j <= n_end-n_start+2*ghosts; j++ ) for( k = 0; k <= o_end-o_start+2*ghosts; k++ ){ p = hoc[i][j][k]; while( p >= 0 ){ particles[start+count].x = particles[p].x; particles[start+count].y = particles[p].y; particles[start+count].z = particles[p].z; particles[start+count].q = particles[p].q; particles[start+count].e = particles[p].e; particles[start+count].fx = particles[p].fx; particles[start+count].fy = particles[p].fy; particles[start+count].fz = particles[p].fz; count++; p = ll[p]; } } } if( mpi_left == mpi_self ){ /* if( periodic ){ */ for( p = 0; p < count; p++ ){ particles[*n_stored_particles+p].x = particles[start+p].x; particles[*n_stored_particles+p].y = particles[start+p].y; particles[*n_stored_particles+p].z = particles[start+p].z; particles[*n_stored_particles+p].q = particles[start+p].q; particles[*n_stored_particles+p].e = particles[start+p].e; particles[*n_stored_particles+p].fx = particles[start+p].fx; particles[*n_stored_particles+p].fy = particles[start+p].fy; particles[*n_stored_particles+p].fz = particles[start+p].fz; } *n_stored_particles += count; /* } */ } else{ MPI_Isend( &particles[start], count, mpi_type_particle, mpi_left, 1, mpi_comm_cart, &mpi_req[0] ); MPI_Irecv( &particles[*n_stored_particles], *max_particles-*n_stored_particles, mpi_type_particle, mpi_right, 1, mpi_comm_cart, &mpi_req[1] ); MPI_Waitall( 2, mpi_req, mpi_stat ); MPI_Get_count( &mpi_stat[1], mpi_type_particle, &mpi_count ); count = mpi_count; *n_stored_particles += count; } if( *n_stored_particles >= start ) { printf("Buffer too small!\n"); exit(1); } /* Updating linked list */ if( count > 0 ) update_linked_list( ll, hoc, particles,*n_stored_particles-count, *n_stored_particles, x, y, z, m, n, o, m_start-ghosts, m_end+ghosts, n_start-ghosts, n_end+ghosts, o_start-ghosts, o_end+ghosts ); /* -------------------------------------------------------------------------- * * Sending particles to upper neighbor * * -------------------------------------------------------------------------- */ count = 0; for( i = 0; i <= m_end-m_start+2*ghosts; i++ ) for( j = n_end-n_start+1; j <= n_end-n_start+ghosts; j++ ) for( k = 0; k <= o_end-o_start+2*ghosts; k++ ){ p = hoc[i][j][k]; while( p >= 0 ){ count++; p = ll[p]; } } #ifdef DEBUG printf("Rank %d: n_stored_particles = %d, max_particles = %d, count = %d\n",mpi_self,*n_stored_particles,*max_particles,count); #endif start = *max_particles - count; while( start <= ( *n_stored_particles + 27*count ) ) { *max_particles = *max_particles*2; *particles_addr = (pp3mg_particle*) realloc(particles,*max_particles*sizeof(pp3mg_particle)); *ll_addr = (int*) realloc(ll,*max_particles*sizeof(int)); if (*particles_addr == NULL || *ll_addr == NULL) { printf("Realloc failed!"); exit(1); } else { particles = *particles_addr; ll = *ll_addr; for (int i=*max_particles>>1; i<*max_particles; i++) ll[i] = -1; start = *max_particles - count; #ifdef DEBUG printf("Rank %d: Reallocated. Now max_particles = %d\n",mpi_self,*max_particles); #endif } } count = 0; /* If at one end, shift particles */ if( n_end == (n-1) ){ for( i = 0; i <= m_end-m_start+2*ghosts; i++ ) for( j = n_end-n_start+1; j <= n_end-n_start+ghosts; j++ ) for( k = 0; k <= o_end-o_start+2*ghosts; k++ ){ p = hoc[i][j][k]; while( p >= 0 ){ particles[start+count].x = particles[p].x; particles[start+count].y = particles[p].y - y; particles[start+count].z = particles[p].z; particles[start+count].q = particles[p].q; particles[start+count].e = particles[p].e; particles[start+count].fx = particles[p].fx; particles[start+count].fy = particles[p].fy; particles[start+count].fz = particles[p].fz; count++; p = ll[p]; } } }else{ for( i = 0; i <= m_end-m_start+2*ghosts; i++ ) for( j = n_end-n_start+1; j <= n_end-n_start+ghosts; j++ ) for( k = 0; k <= o_end-o_start+2*ghosts; k++ ){ p = hoc[i][j][k]; while( p >= 0 ){ particles[start+count].x = particles[p].x; particles[start+count].y = particles[p].y; particles[start+count].z = particles[p].z; particles[start+count].q = particles[p].q; particles[start+count].e = particles[p].e; particles[start+count].fx = particles[p].fx; particles[start+count].fy = particles[p].fy; particles[start+count].fz = particles[p].fz; count++; p = ll[p]; } } } if( mpi_upper == mpi_self ){ /* if( periodic ){ */ for( p = 0; p < count; p++ ){ particles[*n_stored_particles+p].x = particles[start+p].x; particles[*n_stored_particles+p].y = particles[start+p].y; particles[*n_stored_particles+p].z = particles[start+p].z; particles[*n_stored_particles+p].q = particles[start+p].q; particles[*n_stored_particles+p].e = particles[start+p].e; particles[*n_stored_particles+p].fx = particles[start+p].fx; particles[*n_stored_particles+p].fy = particles[start+p].fy; particles[*n_stored_particles+p].fz = particles[start+p].fz; } *n_stored_particles += count; /* } */ } else{ MPI_Isend( &particles[start], count, mpi_type_particle, mpi_upper, 1, mpi_comm_cart, &mpi_req[0] ); MPI_Irecv( &particles[*n_stored_particles], *max_particles-*n_stored_particles, mpi_type_particle, mpi_lower, 1, mpi_comm_cart, &mpi_req[1] ); MPI_Waitall( 2, mpi_req, mpi_stat ); MPI_Get_count( &mpi_stat[1], mpi_type_particle, &mpi_count ); count = mpi_count; *n_stored_particles += count; } if( *n_stored_particles >= start ) { printf("Buffer too small!\n"); exit(1); } /* Updating linked list */ if( count > 0 ) update_linked_list( ll, hoc, particles,*n_stored_particles-count, *n_stored_particles, x, y, z, m, n, o, m_start-ghosts, m_end+ghosts, n_start-ghosts, n_end+ghosts, o_start-ghosts, o_end+ghosts ); /* -------------------------------------------------------------------------- * * Sending particles to lower neighbor * * -------------------------------------------------------------------------- */ count = 0; for( i = 0; i <= m_end-m_start+2*ghosts; i++ ) for( j = ghosts; j <= 2*ghosts-1; j++ ) for( k = 0; k <= o_end-o_start+2*ghosts; k++ ){ p = hoc[i][j][k]; while( p >= 0 ){ count++; p = ll[p]; } } #ifdef DEBUG printf("Rank %d: n_stored_particles = %d, max_particles = %d, count = %d\n",mpi_self,*n_stored_particles,*max_particles,count); #endif start = *max_particles - count; while( start <= ( *n_stored_particles + 27*count ) ) { *max_particles = *max_particles*2; *particles_addr = (pp3mg_particle*) realloc(particles,*max_particles*sizeof(pp3mg_particle)); *ll_addr = (int*) realloc(ll,*max_particles*sizeof(int)); if (*particles_addr == NULL || *ll_addr == NULL) { printf("Realloc failed!"); exit(1); } else { particles = *particles_addr; ll = *ll_addr; for (int i=*max_particles>>1; i<*max_particles; i++) ll[i] = -1; start = *max_particles - count; #ifdef DEBUG printf("Rank %d: Reallocated. Now max_particles = %d\n",mpi_self,*max_particles); #endif } } count = 0; /* If at one end, shift particles */ if( n_start == 0 ){ for( i = 0; i <= m_end-m_start+2*ghosts; i++ ) for( j = ghosts; j <= 2*ghosts-1; j++ ) for( k = 0; k <= o_end-o_start+2*ghosts; k++ ){ p = hoc[i][j][k]; while( p >= 0 ){ particles[start+count].x = particles[p].x; particles[start+count].y = particles[p].y + y; particles[start+count].z = particles[p].z; particles[start+count].q = particles[p].q; particles[start+count].e = particles[p].e; particles[start+count].fx = particles[p].fx; particles[start+count].fy = particles[p].fy; particles[start+count].fz = particles[p].fz; count++; p = ll[p]; } } }else{ for( i = 0; i <= m_end-m_start+2*ghosts; i++ ) for( j = ghosts; j <= 2*ghosts-1; j++ ) for( k = 0; k <= o_end-o_start+2*ghosts; k++ ){ p = hoc[i][j][k]; while( p >= 0 ){ particles[start+count].x = particles[p].x; particles[start+count].y = particles[p].y; particles[start+count].z = particles[p].z; particles[start+count].q = particles[p].q; particles[start+count].e = particles[p].e; particles[start+count].fx = particles[p].fx; particles[start+count].fy = particles[p].fy; particles[start+count].fz = particles[p].fz; count++; p = ll[p]; } } } if( mpi_lower == mpi_self ){ /* if( periodic ){ */ for( p = 0; p < count; p++ ){ particles[*n_stored_particles+p].x = particles[start+p].x; particles[*n_stored_particles+p].y = particles[start+p].y; particles[*n_stored_particles+p].z = particles[start+p].z; particles[*n_stored_particles+p].q = particles[start+p].q; particles[*n_stored_particles+p].e = particles[start+p].e; particles[*n_stored_particles+p].fx = particles[start+p].fx; particles[*n_stored_particles+p].fy = particles[start+p].fy; particles[*n_stored_particles+p].fz = particles[start+p].fz; } *n_stored_particles += count; /* } */ } else{ MPI_Isend( &particles[start], count, mpi_type_particle, mpi_lower, 1, mpi_comm_cart, &mpi_req[0] ); MPI_Irecv( &particles[*n_stored_particles], *max_particles-*n_stored_particles, mpi_type_particle, mpi_upper, 1, mpi_comm_cart, &mpi_req[1] ); MPI_Waitall( 2, mpi_req, mpi_stat ); MPI_Get_count( &mpi_stat[1], mpi_type_particle, &mpi_count ); count = mpi_count; *n_stored_particles += count; } if( *n_stored_particles >= start ) { printf("Buffer too small!\n"); exit(1); } /* Updating linked list */ if( count > 0 ) update_linked_list( ll, hoc, particles,*n_stored_particles-count, *n_stored_particles, x, y, z, m, n, o, m_start-ghosts, m_end+ghosts, n_start-ghosts, n_end+ghosts, o_start-ghosts, o_end+ghosts ); /* -------------------------------------------------------------------------- * * Sending particles to front neighbor * * -------------------------------------------------------------------------- */ count = 0; for( i = 0; i <= m_end-m_start+2*ghosts; i++ ) for( j = 0; j <= n_end-n_start+2*ghosts; j++ ) for( k = o_end-o_start+1; k <= o_end-o_start+ghosts; k++ ){ p = hoc[i][j][k]; while( p >= 0 ){ count++; p = ll[p]; } } #ifdef DEBUG printf("Rank %d: n_stored_particles = %d, max_particles = %d, count = %d\n",mpi_self,*n_stored_particles,*max_particles,count); #endif start = *max_particles - count; while( start <= ( *n_stored_particles + 27*count ) ) { *max_particles = *max_particles*2; *particles_addr = (pp3mg_particle*) realloc(particles,*max_particles*sizeof(pp3mg_particle)); *ll_addr = (int*) realloc(ll,*max_particles*sizeof(int)); if (*particles_addr == NULL || *ll_addr == NULL) { printf("Realloc failed!"); exit(1); } else { particles = *particles_addr; ll = *ll_addr; for (int i=*max_particles>>1; i<*max_particles; i++) ll[i] = -1; start = *max_particles - count; #ifdef DEBUG printf("Rank %d: Reallocated. Now max_particles = %d\n",mpi_self,*max_particles); #endif } } count = 0; /* If at one end, shift particles */ if( o_end == (o-1) ){ for( i = 0; i <= m_end-m_start+2*ghosts; i++ ) for( j = 0; j <= n_end-n_start+2*ghosts; j++ ) for( k = o_end-o_start+1; k <= o_end-o_start+ghosts; k++ ){ p = hoc[i][j][k]; while( p >= 0 ){ particles[start+count].x = particles[p].x; particles[start+count].y = particles[p].y; particles[start+count].z = particles[p].z - z; particles[start+count].q = particles[p].q; particles[start+count].e = particles[p].e; particles[start+count].fx = particles[p].fx; particles[start+count].fy = particles[p].fy; particles[start+count].fz = particles[p].fz; count++; p = ll[p]; } } }else{ for( i = 0; i <= m_end-m_start+2*ghosts; i++ ) for( j = 0; j <= n_end-n_start+2*ghosts; j++ ) for( k = o_end-o_start+1; k <= o_end-o_start+ghosts; k++ ){ p = hoc[i][j][k]; while( p >= 0 ){ particles[start+count].x = particles[p].x; particles[start+count].y = particles[p].y; particles[start+count].z = particles[p].z; particles[start+count].q = particles[p].q; particles[start+count].e = particles[p].e; particles[start+count].fx = particles[p].fx; particles[start+count].fy = particles[p].fy; particles[start+count].fz = particles[p].fz; count++; p = ll[p]; } } } if( mpi_front == mpi_self ){ /* if( periodic ){ */ for( p = 0; p < count; p++ ){ particles[*n_stored_particles+p].x = particles[start+p].x; particles[*n_stored_particles+p].y = particles[start+p].y; particles[*n_stored_particles+p].z = particles[start+p].z; particles[*n_stored_particles+p].q = particles[start+p].q; particles[*n_stored_particles+p].e = particles[start+p].e; particles[*n_stored_particles+p].fx = particles[start+p].fx; particles[*n_stored_particles+p].fy = particles[start+p].fy; particles[*n_stored_particles+p].fz = particles[start+p].fz; } *n_stored_particles += count; /* } */ } else{ MPI_Isend( &particles[start], count, mpi_type_particle, mpi_front, 1, mpi_comm_cart, &mpi_req[0] ); MPI_Irecv( &particles[*n_stored_particles], *max_particles-*n_stored_particles, mpi_type_particle, mpi_back, 1, mpi_comm_cart, &mpi_req[1] ); MPI_Waitall( 2, mpi_req, mpi_stat ); MPI_Get_count( &mpi_stat[1], mpi_type_particle, &mpi_count ); count = mpi_count; *n_stored_particles += count; } if( *n_stored_particles >= start ) { printf("Buffer too small!\n"); exit(1); } /* Updating linked list */ if( count > 0 ) update_linked_list( ll, hoc, particles,*n_stored_particles-count, *n_stored_particles, x, y, z, m, n, o, m_start-ghosts, m_end+ghosts, n_start-ghosts, n_end+ghosts, o_start-ghosts, o_end+ghosts ); /* -------------------------------------------------------------------------- * * Sending particles to back neighbor * * -------------------------------------------------------------------------- */ count = 0; for( i = 0; i <= m_end-m_start+2*ghosts; i++ ) for( j = 0; j <= n_end-n_start+2*ghosts; j++ ) for( k = ghosts; k <= 2*ghosts-1; k++ ){ p = hoc[i][j][k]; while( p >= 0 ){ count++; p = ll[p]; } } #ifdef DEBUG printf("Rank %d: n_stored_particles = %d, max_particles = %d, count = %d\n",mpi_self,*n_stored_particles,*max_particles,count); #endif start = *max_particles - count; while( start <= ( *n_stored_particles + 27*count ) ) { *max_particles = *max_particles*2; *particles_addr = (pp3mg_particle*) realloc(particles,*max_particles*sizeof(pp3mg_particle)); *ll_addr = (int*) realloc(ll,*max_particles*sizeof(int)); if (*particles_addr == NULL || *ll_addr == NULL) { printf("Realloc failed!"); exit(1); } else { particles = *particles_addr; ll = *ll_addr; for (int i=*max_particles>>1; i<*max_particles; i++) ll[i] = -1; start = *max_particles - count; #ifdef DEBUG printf("Rank %d: Reallocated. Now max_particles = %d\n",mpi_self,*max_particles); #endif } } count = 0; /* If at one end, shift particles */ if( o_start == 0 ){ for( i = 0; i <= m_end-m_start+2*ghosts; i++ ) for( j = 0; j <= n_end-n_start+2*ghosts; j++ ) for( k = ghosts; k <= 2*ghosts-1; k++ ){ p = hoc[i][j][k]; while( p >= 0 ){ particles[start+count].x = particles[p].x; particles[start+count].y = particles[p].y; particles[start+count].z = particles[p].z + z; particles[start+count].q = particles[p].q; particles[start+count].e = particles[p].e; particles[start+count].fx = particles[p].fx; particles[start+count].fy = particles[p].fy; particles[start+count].fz = particles[p].fz; count++; p = ll[p]; } } } else{ for( i = 0; i <= m_end-m_start+2*ghosts; i++ ) for( j = 0; j <= n_end-n_start+2*ghosts; j++ ) for( k = ghosts; k <= 2*ghosts-1; k++ ){ p = hoc[i][j][k]; while( p >= 0 ){ particles[start+count].x = particles[p].x; particles[start+count].y = particles[p].y; particles[start+count].z = particles[p].z; particles[start+count].q = particles[p].q; particles[start+count].e = particles[p].e; particles[start+count].fx = particles[p].fx; particles[start+count].fy = particles[p].fy; particles[start+count].fz = particles[p].fz; count++; p = ll[p]; } } } if( mpi_back == mpi_self ){ /* if( periodic ){ */ for( p = 0; p < count; p++ ){ particles[*n_stored_particles+p].x = particles[start+p].x; particles[*n_stored_particles+p].y = particles[start+p].y; particles[*n_stored_particles+p].z = particles[start+p].z; particles[*n_stored_particles+p].q = particles[start+p].q; particles[*n_stored_particles+p].e = particles[start+p].e; particles[*n_stored_particles+p].fx = particles[start+p].fx; particles[*n_stored_particles+p].fy = particles[start+p].fy; particles[*n_stored_particles+p].fz = particles[start+p].fz; } *n_stored_particles += count; /* } */ }else{ MPI_Isend( &particles[start], count, mpi_type_particle, mpi_back, 1, mpi_comm_cart, &mpi_req[0] ); MPI_Irecv( &particles[*n_stored_particles], *max_particles-*n_stored_particles, mpi_type_particle, mpi_front, 1, mpi_comm_cart, &mpi_req[1] ); MPI_Waitall( 2, mpi_req, mpi_stat ); MPI_Get_count( &mpi_stat[1], mpi_type_particle, &mpi_count ); count = mpi_count; *n_stored_particles += count; } if( *n_stored_particles >= start ) { printf("Buffer too small!\n"); exit(1); } /* Updating linked list */ if( count > 0 ) update_linked_list( ll, hoc, particles,*n_stored_particles-count, *n_stored_particles, x, y, z, m, n, o, m_start-ghosts, m_end+ghosts, n_start-ghosts, n_end+ghosts, o_start-ghosts, o_end+ghosts ); MPI_Type_free(&mpi_type_particle); }
/* mpi_master() * The MPI version of hmmbuild. * Follows standard pattern for a master/worker load-balanced MPI program (J1/78-79). * * A master can only return if it's successful. * Errors in an MPI master come in two classes: recoverable and nonrecoverable. * * Recoverable errors include all worker-side errors, and any * master-side error that do not affect MPI communication. Error * messages from recoverable messages are delayed until we've cleanly * shut down the workers. * * Unrecoverable errors are master-side errors that may affect MPI * communication, meaning we cannot count on being able to reach the * workers and shut them down. Unrecoverable errors result in immediate * p7_Fail()'s, which will cause MPI to shut down the worker processes * uncleanly. */ static void mpi_master(const ESL_GETOPTS *go, struct cfg_s *cfg) { int xstatus = eslOK; /* changes from OK on recoverable error */ int status; int have_work = TRUE; /* TRUE while alignments remain */ int nproc_working = 0; /* number of worker processes working, up to nproc-1 */ int wi; /* rank of next worker to get an alignment to work on */ char *buf = NULL; /* input/output buffer, for packed MPI messages */ int bn = 0; ESL_MSA *msa = NULL; P7_HMM *hmm = NULL; P7_BG *bg = NULL; ESL_MSA **msalist = NULL; ESL_MSA *postmsa = NULL; int *msaidx = NULL; char errmsg[eslERRBUFSIZE]; MPI_Status mpistatus; int n; int pos; double entropy; /* Master initialization: including, figure out the alphabet type. * If any failure occurs, delay printing error message until we've shut down workers. */ if (xstatus == eslOK) { if ((status = init_master_cfg(go, cfg, errmsg)) != eslOK) xstatus = status; } if (xstatus == eslOK) { bn = 4096; if ((buf = malloc(sizeof(char) * bn)) == NULL) { sprintf(errmsg, "allocation failed"); xstatus = eslEMEM; } } if (xstatus == eslOK) { if ((msalist = malloc(sizeof(ESL_MSA *) * cfg->nproc)) == NULL) { sprintf(errmsg, "allocation failed"); xstatus = eslEMEM; } } if (xstatus == eslOK) { if ((msaidx = malloc(sizeof(int) * cfg->nproc)) == NULL) { sprintf(errmsg, "allocation failed"); xstatus = eslEMEM; } } MPI_Bcast(&xstatus, 1, MPI_INT, 0, MPI_COMM_WORLD); if (xstatus != eslOK) { MPI_Finalize(); p7_Fail(errmsg); } ESL_DPRINTF1(("MPI master is initialized\n")); bg = p7_bg_Create(cfg->abc); for (wi = 0; wi < cfg->nproc; wi++) { msalist[wi] = NULL; msaidx[wi] = 0; } /* Worker initialization: * Because we've already successfully initialized the master before we start * initializing the workers, we don't expect worker initialization to fail; * so we just receive a quick OK/error code reply from each worker to be sure, * and don't worry about an informative message. */ MPI_Bcast(&(cfg->abc->type), 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Reduce(&xstatus, &status, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD); if (status != eslOK) { MPI_Finalize(); p7_Fail("One or more MPI worker processes failed to initialize."); } ESL_DPRINTF1(("%d workers are initialized\n", cfg->nproc-1)); /* Main loop: combining load workers, send/receive, clear workers loops; * also, catch error states and die later, after clean shutdown of workers. * * When a recoverable error occurs, have_work = FALSE, xstatus != * eslOK, and errmsg is set to an informative message. No more * errmsg's can be received after the first one. We wait for all the * workers to clear their work units, then send them shutdown signals, * then finally print our errmsg and exit. * * Unrecoverable errors just crash us out with p7_Fail(). */ wi = 1; while (have_work || nproc_working) { if (have_work) { if ((status = esl_msa_Read(cfg->afp, &msa)) == eslOK) { cfg->nali++; ESL_DPRINTF1(("MPI master read MSA %s\n", msa->name == NULL? "" : msa->name)); } else { have_work = FALSE; if (status == eslEFORMAT) { xstatus = eslEFORMAT; snprintf(errmsg, eslERRBUFSIZE, "Alignment file parse error:\n%s\n", cfg->afp->errbuf); } else if (status == eslEINVAL) { xstatus = eslEFORMAT; snprintf(errmsg, eslERRBUFSIZE, "Alignment file parse error:\n%s\n", cfg->afp->errbuf); } else if (status != eslEOF) { xstatus = status; snprintf(errmsg, eslERRBUFSIZE, "Alignment file read unexpectedly failed with code %d\n", status); } ESL_DPRINTF1(("MPI master has run out of MSAs (having read %d)\n", cfg->nali)); } } if ((have_work && nproc_working == cfg->nproc-1) || (!have_work && nproc_working > 0)) { if (MPI_Probe(MPI_ANY_SOURCE, 0, MPI_COMM_WORLD, &mpistatus) != 0) { MPI_Finalize(); p7_Fail("mpi probe failed"); } if (MPI_Get_count(&mpistatus, MPI_PACKED, &n) != 0) { MPI_Finalize(); p7_Fail("mpi get count failed"); } wi = mpistatus.MPI_SOURCE; ESL_DPRINTF1(("MPI master sees a result of %d bytes from worker %d\n", n, wi)); if (n > bn) { if ((buf = realloc(buf, sizeof(char) * n)) == NULL) p7_Fail("reallocation failed"); bn = n; } if (MPI_Recv(buf, bn, MPI_PACKED, wi, 0, MPI_COMM_WORLD, &mpistatus) != 0) { MPI_Finalize(); p7_Fail("mpi recv failed"); } ESL_DPRINTF1(("MPI master has received the buffer\n")); /* If we're in a recoverable error state, we're only clearing worker results; * just receive them, don't unpack them or print them. * But if our xstatus is OK, go ahead and process the result buffer. */ if (xstatus == eslOK) { pos = 0; if (MPI_Unpack(buf, bn, &pos, &xstatus, 1, MPI_INT, MPI_COMM_WORLD) != 0) { MPI_Finalize(); p7_Fail("mpi unpack failed");} if (xstatus == eslOK) /* worker reported success. Get the HMM. */ { ESL_DPRINTF1(("MPI master sees that the result buffer contains an HMM\n")); if (p7_hmm_MPIUnpack(buf, bn, &pos, MPI_COMM_WORLD, &(cfg->abc), &hmm) != eslOK) { MPI_Finalize(); p7_Fail("HMM unpack failed"); } ESL_DPRINTF1(("MPI master has unpacked the HMM\n")); if (cfg->postmsafile != NULL) { if (esl_msa_MPIUnpack(cfg->abc, buf, bn, &pos, MPI_COMM_WORLD, &postmsa) != eslOK) { MPI_Finalize(); p7_Fail("postmsa unpack failed");} } entropy = p7_MeanMatchRelativeEntropy(hmm, bg); if ((status = output_result(cfg, errmsg, msaidx[wi], msalist[wi], hmm, postmsa, entropy)) != eslOK) xstatus = status; esl_msa_Destroy(postmsa); postmsa = NULL; p7_hmm_Destroy(hmm); hmm = NULL; } else /* worker reported an error. Get the errmsg. */ { if (MPI_Unpack(buf, bn, &pos, errmsg, eslERRBUFSIZE, MPI_CHAR, MPI_COMM_WORLD) != 0) { MPI_Finalize(); p7_Fail("mpi unpack of errmsg failed"); } ESL_DPRINTF1(("MPI master sees that the result buffer contains an error message\n")); } } esl_msa_Destroy(msalist[wi]); msalist[wi] = NULL; msaidx[wi] = 0; nproc_working--; } if (have_work) { ESL_DPRINTF1(("MPI master is sending MSA %s to worker %d\n", msa->name == NULL ? "":msa->name, wi)); if (esl_msa_MPISend(msa, wi, 0, MPI_COMM_WORLD, &buf, &bn) != eslOK) p7_Fail("MPI msa send failed"); msalist[wi] = msa; msaidx[wi] = cfg->nali; /* 1..N for N alignments in the MSA database */ msa = NULL; wi++; nproc_working++; } } /* On success or recoverable errors: * Shut down workers cleanly. */ ESL_DPRINTF1(("MPI master is done. Shutting down all the workers cleanly\n")); for (wi = 1; wi < cfg->nproc; wi++) if (esl_msa_MPISend(NULL, wi, 0, MPI_COMM_WORLD, &buf, &bn) != eslOK) p7_Fail("MPI msa send failed"); free(buf); free(msaidx); free(msalist); p7_bg_Destroy(bg); if (xstatus != eslOK) { MPI_Finalize(); p7_Fail(errmsg); } else return; }
void master(const struct fracInfo info) { int ntasks, dest, msgsize; struct fracData *work = malloc(sizeof(*work)); MPI_Status status; int rowsTaken = 0; MPI_Comm_size(MPI_COMM_WORLD, &ntasks); size_t size = sizeof(unsigned char) * (unsigned long)info.nCols * (unsigned long)info.nRows; unsigned char *fractal = (unsigned char*)malloc(size); if(!fractal) { printf("fractal allocation failed, %lu bytes\n", size); exit(1); } // Allocate buffer int membersize, emptysize, fullsize; int position; char *buffer; MPI_Pack_size(1, MPI_INT, MPI_COMM_WORLD, &membersize); emptysize = membersize; MPI_Pack_size(1, MPI_INT, MPI_COMM_WORLD, &membersize); emptysize += membersize; MPI_Pack_size(get_max_work_size(&info), MPI_UNSIGNED_CHAR, MPI_COMM_WORLD, &membersize); fullsize = emptysize + membersize; buffer = malloc(fullsize); if(!buffer) { printf("buffer allocation failed, %d bytes\n",fullsize); exit(1); } // Send initial data for (dest = 1; dest < ntasks; dest++) { //Get next work item get_work(&info,&rowsTaken,work); //pack and send work position = 0; MPI_Pack(&work->startRow,1,MPI_INT,buffer,emptysize,&position,MPI_COMM_WORLD); MPI_Pack(&work->nRows,1,MPI_INT,buffer,emptysize,&position,MPI_COMM_WORLD); MPI_Send(buffer, position, MPI_PACKED, dest, WORKTAG, MPI_COMM_WORLD); } printf("sent initial work\n"); //Get next work item get_work(&info,&rowsTaken,work); int startRow, nRows; while(work->nRows) { // Recieve and unpack work MPI_Recv(buffer, fullsize, MPI_PACKED, MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status); position = 0; MPI_Get_count(&status, MPI_PACKED, &msgsize); MPI_Unpack(buffer, msgsize, &position, &startRow,1,MPI_INT,MPI_COMM_WORLD); MPI_Unpack(buffer, msgsize, &position, &nRows,1,MPI_INT,MPI_COMM_WORLD); MPI_Unpack(buffer, msgsize, &position, fractal+((unsigned long)startRow*info.nCols), nRows*info.nCols, MPI_UNSIGNED_CHAR, MPI_COMM_WORLD); //pack and send work position = 0; MPI_Pack(&work->startRow,1,MPI_INT,buffer,emptysize,&position,MPI_COMM_WORLD); MPI_Pack(&work->nRows,1,MPI_INT,buffer,emptysize,&position,MPI_COMM_WORLD); MPI_Send(buffer, position, MPI_PACKED, status.MPI_SOURCE, WORKTAG, MPI_COMM_WORLD); //Get next work item get_work(&info,&rowsTaken,work); if(status.MPI_SOURCE==1) printf("%d\n",work->startRow); } // Recieve all remaining work for (dest = 1; dest < ntasks; dest++) { // Recieve and unpack work MPI_Recv(buffer, fullsize, MPI_PACKED, MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status); position = 0; MPI_Get_count(&status, MPI_PACKED, &msgsize); MPI_Unpack(buffer, msgsize, &position, &startRow,1,MPI_INT,MPI_COMM_WORLD); MPI_Unpack(buffer, msgsize, &position, &nRows,1,MPI_INT,MPI_COMM_WORLD); // unpack pixel data MPI_Unpack(buffer, msgsize, &position, fractal+((unsigned long)startRow*info.nCols), nRows*info.nCols, MPI_UNSIGNED_CHAR, MPI_COMM_WORLD); // Kill slaves MPI_Send(0,0,MPI_INT,dest,DIETAG,MPI_COMM_WORLD); } free(work); free(buffer); //Save image as TIFF unsigned int nx = info.nCols; unsigned int ny = info.nRows; char fileName[] = "/home/pi/Mandelbrot/Mandelbrot.tiff"; TIFF *out = TIFFOpen(fileName, "w"); uint32 tileDim = 256; tsize_t tileBytes = tileDim*tileDim*sizeof(char); unsigned char *buf = (unsigned char *)_TIFFmalloc(tileBytes); char description[1024]; snprintf(description, sizeof(description),"xStart:%f yStart:%f spacing:%f AAx:%d",info.xStart,info.yStart,info.spacing,info.AA); TIFFSetField(out, TIFFTAG_IMAGEDESCRIPTION, description); TIFFSetField(out, TIFFTAG_IMAGEWIDTH, (uint32) nx); TIFFSetField(out, TIFFTAG_IMAGELENGTH, (uint32) ny); TIFFSetField(out, TIFFTAG_ORIENTATION, ORIENTATION_TOPLEFT); TIFFSetField(out, TIFFTAG_SAMPLESPERPIXEL, 1); TIFFSetField(out, TIFFTAG_BITSPERSAMPLE, 8); TIFFSetField(out, TIFFTAG_PLANARCONFIG, PLANARCONFIG_CONTIG); TIFFSetField(out, TIFFTAG_PHOTOMETRIC, PHOTOMETRIC_MINISBLACK); TIFFSetField(out, TIFFTAG_COMPRESSION, COMPRESSION_LZW); TIFFSetField(out, TIFFTAG_TILEWIDTH, tileDim); TIFFSetField(out, TIFFTAG_TILELENGTH, tileDim); // TIFFSetField(out, TIFFTAG_PREDICTOR, PREDICTOR_HORIZONTAL); // TIFFSetField(out, TIFFTAG_XRESOLUTION, resolution); // TIFFSetField(out, TIFFTAG_YRESOLUTION, resolution); // TIFFSetField(out, TIFFTAG_RESOLUTIONUNIT, RESUNIT_INCH); unsigned long x,y,i,j; unsigned long tileStart; // Iterate through and write tiles for(y=0; y<ny; y+=tileDim) { for(x=0; x<nx; x+=tileDim) { // Fill tile with fractal data tileStart = y*nx+x; for(i=0; i<tileDim; i++) { for(j=0; j<tileDim; j++) { if(x+j < nx && y+i < ny) buf[i*tileDim+j] = fractal[(y+i)*nx+(x+j)]; else buf[i*tileDim+j] = (unsigned char)0; } } TIFFWriteTile(out, buf, x, y, 0, 0); } } TIFFClose(out); _TIFFfree(buf); free(fractal); }
int IrcvBuf(intbuf_t* bufs, intbuf_t serversbuf, int* w, MPI_Comm comm, int* tag, int* size){ // if there is a send request pending, // receive UPDATE_END and UPDATE_MSG // else // if there's processing pending // receive UPDATE_END, UPDATE_MSG, NEWIDS_END // else // receive UPDATE_END, UPDATE_MSG, NEWIDS_END, NEWIDS_MSG intbuf_t bb = NULL; MPI_Status status; int flag; // Warning(info,"$$$$$$$$$ (W%d) SP: %d PP: %d ", mpi_me, send_pending, bufnewids_pending); if (send_pending) { MPI_Iprobe( MPI_ANY_SOURCE, UPDATE_END, comm, &flag, &status ); if (!flag) MPI_Iprobe( MPI_ANY_SOURCE, UPDATE_MSG, comm, &flag, &status ); // if (!flag) printf("\n...%d probing...", mpi_me); } else { if (bufnewids_pending) { MPI_Iprobe( MPI_ANY_SOURCE, UPDATE_END, comm, &flag, &status ); if (!flag) MPI_Iprobe( MPI_ANY_SOURCE, UPDATE_MSG, comm, &flag, &status ); if (!flag) MPI_Iprobe( MPI_ANY_SOURCE, NEWIDS_END, comm, &flag, &status ); // if (!flag) printf("\n...%d PROBING...", mpi_me); } else { MPI_Probe(MPI_ANY_SOURCE,MPI_ANY_TAG,comm, &status); //MPI_Iprobe(MPI_ANY_SOURCE,MPI_ANY_TAG,comm, &flag, &status); flag = 1; } }; if (!flag) return 0; MPI_Get_count(&status, MPI_INT, size); *w = status.MPI_SOURCE; *tag = status.MPI_TAG; if ((*tag == NEWIDS_MSG) || (*tag == NEWIDS_END)) bb = serversbuf; else bb = bufs[*w]; MPI_Recv(bb->b, *size, MPI_INT, status.MPI_SOURCE, *tag, comm, &status); bb->index = 0; bb->size = *size; #ifdef DEBUG Warning(info,"\n %d IrcvBuf %s (size %d) from %d", mpi_me, tname[*tag], *size, *w); #endif return 1; }
int main(int argc, char **argv) { double sum_total_timer, total_timer = 0.0; double sum_gather_timer, gather_timer = 0.0; double sum_mpi_timer, mpi_timer = 0.0; double curr_time; double output_time; double dt = 0.0; double local_max_norm = 0.1; double max_norm = 0; int steps; int* fish_off; int* n_fish_split; MPI_Init (&argc, &argv); #ifdef TRACE_WITH_VAMPIR VT_symdef(TRACE_LOCAL_COMP, "Local computation", "Computation"); VT_symdef(TRACE_FISH_GATHER, "Gathering to 0", "Communication"); VT_symdef(TRACE_MAX_NORM, "Collecting max norm", "Communication"); VT_symdef(TRACE_OUTPUT, "Output", "Output"); #endif MPI_Comm_size (comm, &n_proc); MPI_Comm_rank (comm, &rank); make_fishtype (&fishtype); get_options(argc, argv); srand48(clock()); //MPI_Allreduce (&local_max_norm, &max_norm, 1, MPI_DOUBLE, MPI_MAX, comm); //printf("local_max_norm = %g, max_norm = %g\n", local_max_norm, max_norm); #ifdef TRACE_WITH_VAMPIR VT_traceoff(); #endif if (output_filename) { outputp = 1; if (0 == rank) { output_fp = fopen(output_filename, "w"); if (output_fp == NULL) { printf("Could not open %s for output\n", output_filename); exit(1); } fprintf(output_fp, "n_fish: %d\n", n_fish); } } fish_off = malloc ( (n_proc+1) * sizeof(int) ); n_fish_split = malloc ( (n_proc) * sizeof(int) ); //split each fish to different processors. //fish_off: offset index of the fish in that processor //n_fish_split is the # of fish in each processor //ALL FUNCTIONALITY OF split_fish SHOULD BE DONE AFTER init_fish //split_fish (n_proc, fish_off, n_fish_split); //n_local_fish = n_fish_split[rank]; /* All fish are generated on proc 0 to ensure same random numbers. (Yes, the circle case could be parallelized. Feel free to do it.) */ //split physical box sizes row = (int)sqrt((double)n_proc); column = n_proc/row; double rowSep = WALL_SEP/row; double columnSep = WALL_SEP/column; int rowIndex = rank / column; int columnIndex = rank % column; topBound = rowSep * rowIndex; bottomBound = topBound + rowSep; leftBound = columnSep * columnIndex; rightBound = leftBound + columnSep; assert(n_proc % row == 0); // Add n_proc # of arrays each holding ID of local fishes fish_t fishProc[n_proc][n_fish]; int n_fish_proc[n_proc]; int k; for (k = 0; k < n_proc; k++) n_fish_proc[k] = 0; ////////////////////////////////// init_fish (rank, fish_off, n_fish_split, row, column, fishProc, n_fish_proc); // distribute initial conditions to all processes if (rank == 0) { local_fish = fishProc[0]; n_local_fish = n_fish_proc[0]; // Functionality of MPI_Scatterv is done here with Isends //MPI_Request request[n_proc-1]; int mesTag = 0; MPI_Request *req; for (k = 1; k < n_proc; ++k) { //printf("n_fish_proc[%d], %d\n", k, n_fish_proc[k]); MPI_Isend(fishProc[k], n_fish_proc[k], fishtype, k, mesTag, comm, req); } } else { MPI_Status status; // Processors of rank != 0 receives. MPI_Recv( local_fish, n_fish, fishtype, 0, MPI_ANY_TAG, comm, &status); MPI_Get_count(&status, fishtype, &n_local_fish); } printf("rank[%d], n_local_fish = %d\n", rank, n_local_fish); ///* //MPI_Scatterv (fish, n_fish_split, fish_off, fishtype, // local_fish, n_local_fish, fishtype, // 0, comm); //*/ #ifdef TRACE_WITH_VAMPIR tracingp = 1; VT_traceon(); #endif start_mpi_timer(&total_timer); for (output_time = 0.0, curr_time = 0.0, steps = 0; curr_time <= end_time && steps < max_steps; curr_time += dt, ++steps) { #ifdef TRACE_WITH_VAMPIR if (steps >= STEPS_TO_TRACE) { tracingp = 0; VT_traceoff(); } #endif trace_begin(TRACE_FISH_GATHER); start_mpi_timer (&gather_timer); start_mpi_timer (&mpi_timer); /* Pull in all the fish. Obviously, this is not a good idea. You will be greatly expanding this one line... However, feel free to waste memory when producing output. If you're dumping fish to a file, go ahead and do an Allgatherv _in the output steps_ if you want. Or you could pipeline dumping the fish. MPI_Allgatherv (local_fish, n_local_fish, fishtype, fish, n_fish_split, fish_off, fishtype, comm); */ //MPI_Request* sendReq, recvReq; // Set aside buffer for fish received from other processes. /* for (j = 0; j < NUM_NEIGHBOR; ++j) { //FIXME: which neighbors does not exist? if (rankNeighbor[j] >= 0) { MPI_Isend(local_fish, n_local_fish, fishtype, rankNeighbor[j], MPI_ANY_TAG, comm, &sendReqArray); MPI_Irecv(impact_fish, n_fish, fishtype, rankNeighbor[NUM_NEIGHBOR - j], MPI_ANY_TAG, comm, &sendReqArray); MPI_Wait(recvReq, MPI_STATUS_IGNORE); interact_fish_mpi(local_fish, n_local_fish, impact_fish, sizeof(impact_fish)); } } */ // get migrate fish // send migrate fish // receive migrate fish // update local fish // get impact fish // send impact fish // receive impact fish // interact impact fish // interact local fish // move MPI_Request sendReqArray[NUM_NEIGHBOR]; MPI_Request recvReqArray[NUM_NEIGHBOR]; fish_t receive_impact_fish[NUM_NEIGHBOR][n_fish]; int n_receive_impact_fish[NUM_NEIGHBOR]; fish_t receive_migrate_fish[NUM_NEIGHBOR][n_fish]; int n_receive_migrate_fish[NUM_NEIGHBOR]; int n_send_impact_fish[NUM_NEIGHBOR]; fish_t* send_impact_fish[NUM_NEIGHBOR]; int n_send_migrate_fish[NUM_NEIGHBOR]; fish_t* send_migrate_fish[NUM_NEIGHBOR]; get_interacting_fish( local_fish, n_local_fish, send_migrate_fish, n_send_migrate_fish, 1); int tmp; for (tmp = 0; tmp < NUM_NEIGHBOR; tmp++) { printf("rank[%d], iter[%d] ------- get [%d] migrate fish for neig[%d]. \n", rank, iter, n_send_migrate_fish[tmp], tmp); } Isend_receive_fish(send_migrate_fish, n_send_migrate_fish, receive_migrate_fish, n_fish, sendReqArray, recvReqArray); wait_for_fish(recvReqArray, n_receive_migrate_fish); // FIXME: Have not implement update on local fish. //update_local_fish(); get_interacting_fish(local_fish, n_local_fish, send_impact_fish, n_send_impact_fish, 0); for (tmp = 0; tmp < NUM_NEIGHBOR; tmp++) { printf("rank[%d], iter[%d] ------- get [%d] impact fish for neig[%d]. \n", rank, iter, n_send_impact_fish[tmp], tmp); } Isend_receive_fish(send_impact_fish, n_send_impact_fish, receive_impact_fish, n_fish, sendReqArray, recvReqArray); wait_for_fish(recvReqArray, n_receive_impact_fish); int index; for (index = 0; index < NUM_NEIGHBOR; index++) { if (n_receive_impact_fish[index] > 0) { interact_fish_mpi(local_fish, n_local_fish, receive_impact_fish[index], n_receive_impact_fish[index]); } } //*/ // make sure we are sending and receiving the same # msg. //assert(dbg == 0); // While waiting, interact with fish in its own pocket first printf("rank[%d], iter[%d] ------- interact [%d] local fishes\n", rank, iter, n_local_fish); interact_fish_mpi(local_fish, n_local_fish, local_fish, n_local_fish); printf("rank[%d], iter[%d] ------- finished interact local fish\n", rank, iter); stop_mpi_timer (&gather_timer); stop_mpi_timer (&mpi_timer); trace_end(TRACE_FISH_GATHER); /* We only output once every output_interval time unit, at most. Without that restriction, we can easily create a huge output file. Printing a record for ten fish takes about 300 bytes, so for every 1000 steps, we could dump 300K of info. Now scale the number of fish by 1000... */ trace_begin(TRACE_OUTPUT); if (outputp && curr_time >= output_time) { if (0 == rank) output_fish (output_fp, curr_time, dt, fish, n_fish); output_time = curr_time + output_interval; } trace_end(TRACE_OUTPUT); trace_begin (TRACE_LOCAL_COMP); //interact_fish (local_fish, n_local_fish, fish, n_fish); local_max_norm = compute_norm (local_fish, n_local_fish); trace_end (TRACE_LOCAL_COMP); trace_begin (TRACE_MAX_NORM); start_mpi_timer (&mpi_timer); printf("rank[%d], iter[%d] ------- Allreduce max_norm, \n", rank, iter); MPI_Allreduce (&local_max_norm, &max_norm, 1, MPI_DOUBLE, MPI_MAX, comm); printf("rank[%d], iter[%d] ------- local_max_norm: %g, max_norm: %g\n", local_max_norm, max_norm); stop_mpi_timer (&mpi_timer); trace_end (TRACE_MAX_NORM); trace_begin (TRACE_LOCAL_COMP); dt = max_norm_change / max_norm; dt = f_max(dt, min_dt); dt = f_min(dt, max_dt); printf("rank[%d], iter[%d] ------- moving [%d] local_fish, \n", rank, iter, n_local_fish); move_fish(local_fish, n_local_fish, dt); printf("rank[%d], iter[%d] ------- finished moving.\n", rank, iter); trace_end (TRACE_LOCAL_COMP); iter++; } stop_mpi_timer(&total_timer); #ifdef TRACE_WITH_VAMPIR VT_traceoff(); #endif if (outputp) { MPI_Allgatherv (local_fish, n_local_fish, fishtype, fish, n_fish_split, fish_off, fishtype, comm); if (0 == rank) { output_fish (output_fp, curr_time, dt, fish, n_fish); printf("\tEnded at %g (%g), %d (%d) steps\n", curr_time, end_time, steps, max_steps); } } printf("rank[%d], ------- 39, \n", rank); MPI_Reduce (&total_timer, &sum_total_timer, 1, MPI_DOUBLE, MPI_SUM, 0, comm); printf("rank[%d], ------- 40, \n", rank); MPI_Reduce (&gather_timer, &sum_gather_timer, 1, MPI_DOUBLE, MPI_SUM, 0, comm); printf("rank[%d], ------- 41, \n", rank); MPI_Reduce (&mpi_timer, &sum_mpi_timer, 1, MPI_DOUBLE, MPI_SUM, 0, comm); printf("rank[%d], ------- 42, \n", rank); if (0 == rank) { printf("Number of PEs: %d\n" "Time taken on 0: %g (avg. %g)\n" "Time in gathers on 0: %g (avg %g)\n" "Time in MPI on 0: %g (avg %g)\n", n_proc, total_timer, sum_total_timer / n_proc, gather_timer, sum_gather_timer / n_proc, mpi_timer, sum_mpi_timer / n_proc); } printf("rank[%d], ------- 43, \n", rank); MPI_Barrier (comm); printf("rank[%d], ------- 44, \n", rank); MPI_Finalize (); printf("rank[%d], ------- done!!, \n", rank); return 0; }
void RepartitionFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(Level& currentLevel) const { FactoryMonitor m(*this, "Build", currentLevel); const Teuchos::ParameterList & pL = GetParameterList(); // Access parameters here to make sure that we set the parameter entry flag to "used" even in case of short-circuit evaluation. // TODO (JG): I don't really know if we want to do this. const int startLevel = pL.get<int> ("repartition: start level"); const LO minRowsPerProcessor = pL.get<LO> ("repartition: min rows per proc"); const double nonzeroImbalance = pL.get<double>("repartition: max imbalance"); const bool remapPartitions = pL.get<bool> ("repartition: remap parts"); // TODO: We only need a CrsGraph. This class does not have to be templated on Scalar types. RCP<Matrix> A = Get< RCP<Matrix> >(currentLevel, "A"); // ====================================================================================================== // Determine whether partitioning is needed // ====================================================================================================== // NOTE: most tests include some global communication, which is why we currently only do tests until we make // a decision on whether to repartition. However, there is value in knowing how "close" we are to having to // rebalance an operator. So, it would probably be beneficial to do and report *all* tests. // Test1: skip repartitioning if current level is less than the specified minimum level for repartitioning if (currentLevel.GetLevelID() < startLevel) { GetOStream(Statistics0) << "Repartitioning? NO:" << "\n current level = " << Teuchos::toString(currentLevel.GetLevelID()) << ", first level where repartitioning can happen is " + Teuchos::toString(startLevel) << std::endl; Set<RCP<const Import> >(currentLevel, "Importer", Teuchos::null); return; } RCP<const Map> rowMap = A->getRowMap(); // NOTE: Teuchos::MPIComm::duplicate() calls MPI_Bcast inside, so this is // a synchronization point. However, as we do MueLu_sumAll afterwards anyway, it // does not matter. RCP<const Teuchos::Comm<int> > origComm = rowMap->getComm(); RCP<const Teuchos::Comm<int> > comm = origComm->duplicate(); // Test 2: check whether A is actually distributed, i.e. more than one processor owns part of A // TODO: this global communication can be avoided if we store the information with the matrix (it is known when matrix is created) // TODO: further improvements could be achieved when we use subcommunicator for the active set. Then we only need to check its size { int numActiveProcesses = 0; MueLu_sumAll(comm, Teuchos::as<int>((A->getNodeNumRows() > 0) ? 1 : 0), numActiveProcesses); if (numActiveProcesses == 1) { GetOStream(Statistics0) << "Repartitioning? NO:" << "\n # processes with rows = " << Teuchos::toString(numActiveProcesses) << std::endl; Set<RCP<const Import> >(currentLevel, "Importer", Teuchos::null); return; } } bool test3 = false, test4 = false; std::string msg3, msg4; // Test3: check whether number of rows on any processor satisfies the minimum number of rows requirement // NOTE: Test2 ensures that repartitionning is not done when there is only one processor (it may or may not satisfy Test3) if (minRowsPerProcessor > 0) { LO numMyRows = Teuchos::as<LO>(A->getNodeNumRows()), minNumRows, LOMAX = Teuchos::OrdinalTraits<LO>::max(); LO haveFewRows = (numMyRows < minRowsPerProcessor ? 1 : 0), numWithFewRows = 0; MueLu_sumAll(comm, haveFewRows, numWithFewRows); MueLu_minAll(comm, (numMyRows > 0 ? numMyRows : LOMAX), minNumRows); // TODO: we could change it to repartition only if the number of processors with numRows < minNumRows is larger than some // percentage of the total number. This way, we won't repartition if 2 out of 1000 processors don't have enough elements. // I'm thinking maybe 20% threshold. To implement, simply add " && numWithFewRows < .2*numProcs" to the if statement. if (numWithFewRows > 0) test3 = true; msg3 = "\n min # rows per proc = " + Teuchos::toString(minNumRows) + ", min allowable = " + Teuchos::toString(minRowsPerProcessor); } // Test4: check whether the balance in the number of nonzeros per processor is greater than threshold if (!test3) { GO minNnz, maxNnz, numMyNnz = Teuchos::as<GO>(A->getNodeNumEntries()); MueLu_maxAll(comm, numMyNnz, maxNnz); MueLu_minAll(comm, (numMyNnz > 0 ? numMyNnz : maxNnz), minNnz); // min nnz over all active processors double imbalance = Teuchos::as<double>(maxNnz)/minNnz; if (imbalance > nonzeroImbalance) test4 = true; msg4 = "\n nonzero imbalance = " + Teuchos::toString(imbalance) + ", max allowable = " + Teuchos::toString(nonzeroImbalance); } if (!test3 && !test4) { GetOStream(Statistics0) << "Repartitioning? NO:" << msg3 + msg4 << std::endl; Set<RCP<const Import> >(currentLevel, "Importer", Teuchos::null); return; } GetOStream(Statistics0) << "Repartitioning? YES:" << msg3 + msg4 << std::endl; GO indexBase = rowMap->getIndexBase(); Xpetra::UnderlyingLib lib = rowMap->lib(); int myRank = comm->getRank(); int numProcs = comm->getSize(); RCP<const Teuchos::MpiComm<int> > tmpic = rcp_dynamic_cast<const Teuchos::MpiComm<int> >(comm); TEUCHOS_TEST_FOR_EXCEPTION(tmpic == Teuchos::null, Exceptions::RuntimeError, "Cannot cast base Teuchos::Comm to Teuchos::MpiComm object."); RCP<const Teuchos::OpaqueWrapper<MPI_Comm> > rawMpiComm = tmpic->getRawMpiComm(); // ====================================================================================================== // Calculate number of partitions // ====================================================================================================== // FIXME Quick way to figure out how many partitions there should be (same algorithm as ML) // FIXME Should take into account nnz? Perhaps only when user is using min #nnz per row threshold. GO numPartitions; if (currentLevel.IsAvailable("number of partitions")) { numPartitions = currentLevel.Get<GO>("number of partitions"); GetOStream(Warnings0) << "Using user-provided \"number of partitions\", the performance is unknown" << std::endl; } else { if (Teuchos::as<GO>(A->getGlobalNumRows()) < minRowsPerProcessor) { // System is too small, migrate it to a single processor numPartitions = 1; } else { // Make sure that each processor has approximately minRowsPerProcessor numPartitions = A->getGlobalNumRows() / minRowsPerProcessor; } numPartitions = std::min(numPartitions, Teuchos::as<GO>(numProcs)); currentLevel.Set("number of partitions", numPartitions, NoFactory::get()); } GetOStream(Statistics0) << "Number of partitions to use = " << numPartitions << std::endl; // ====================================================================================================== // Construct decomposition vector // ====================================================================================================== RCP<GOVector> decomposition; if (numPartitions == 1) { // Trivial case: decomposition is the trivial one, all zeros. We skip the call to Zoltan_Interface // (this is mostly done to avoid extra output messages, as even if we didn't skip there is a shortcut // in Zoltan[12]Interface). // TODO: We can probably skip more work in this case (like building all extra data structures) GetOStream(Warnings0) << "Only one partition: Skip call to the repartitioner." << std::endl; decomposition = Xpetra::VectorFactory<GO, LO, GO, NO>::Build(A->getRowMap(), true); } else { decomposition = Get<RCP<GOVector> >(currentLevel, "Partition"); if (decomposition.is_null()) { GetOStream(Warnings0) << "No repartitioning necessary: partitions were left unchanged by the repartitioner" << std::endl; Set<RCP<const Import> >(currentLevel, "Importer", Teuchos::null); return; } } // ====================================================================================================== // Remap if necessary // ====================================================================================================== // From a user perspective, we want user to not care about remapping, thinking of it as only a performance feature. // There are two problems, however. // (1) Next level aggregation depends on the order of GIDs in the vector, if one uses "natural" or "random" orderings. // This also means that remapping affects next level aggregation, despite the fact that the _set_ of GIDs for // each partition is the same. // (2) Even with the fixed order of GIDs, the remapping may influence the aggregation for the next-next level. // Let us consider the following example. Lets assume that when we don't do remapping, processor 0 would have // GIDs {0,1,2}, and processor 1 GIDs {3,4,5}, and if we do remapping processor 0 would contain {3,4,5} and // processor 1 {0,1,2}. Now, when we run repartitioning algorithm on the next level (say Zoltan1 RCB), it may // be dependent on whether whether it is [{0,1,2}, {3,4,5}] or [{3,4,5}, {0,1,2}]. Specifically, the tie-breaking // algorithm can resolve these differently. For instance, running // mpirun -np 5 ./MueLu_ScalingTestParamList.exe --xml=easy_sa.xml --nx=12 --ny=12 --nz=12 // with // <ParameterList name="MueLu"> // <Parameter name="coarse: max size" type="int" value="1"/> // <Parameter name="repartition: enable" type="bool" value="true"/> // <Parameter name="repartition: min rows per proc" type="int" value="2"/> // <ParameterList name="level 1"> // <Parameter name="repartition: remap parts" type="bool" value="false/true"/> // </ParameterList> // </ParameterList> // produces different repartitioning for level 2. // This different repartitioning may then escalate into different aggregation for the next level. // // We fix (1) by fixing the order of GIDs in a vector by sorting the resulting vector. // Fixing (2) is more complicated. // FIXME: Fixing (2) in Zoltan may not be enough, as we may use some arbitration in MueLu, // for instance with CoupledAggregation. What we really need to do is to use the same order of processors containing // the same order of GIDs. To achieve that, the newly created subcommunicator must be conforming with the order. For // instance, if we have [{0,1,2}, {3,4,5}], we create a subcommunicator where processor 0 gets rank 0, and processor 1 // gets rank 1. If, on the other hand, we have [{3,4,5}, {0,1,2}], we assign rank 1 to processor 0, and rank 0 to processor 1. // This rank permutation requires help from Epetra/Tpetra, both of which have no such API in place. // One should also be concerned that if we had such API in place, rank 0 in subcommunicator may no longer be rank 0 in // MPI_COMM_WORLD, which may lead to issues for logging. if (remapPartitions) { SubFactoryMonitor m1(*this, "DeterminePartitionPlacement", currentLevel); DeterminePartitionPlacement(*A, *decomposition, numPartitions); } // ====================================================================================================== // Construct importer // ====================================================================================================== // At this point, the following is true: // * Each processors owns 0 or 1 partitions // * If a processor owns a partition, that partition number is equal to the processor rank // * The decomposition vector contains the partitions ids that the corresponding GID belongs to ArrayRCP<const GO> decompEntries; if (decomposition->getLocalLength() > 0) decompEntries = decomposition->getData(0); #ifdef HAVE_MUELU_DEBUG // Test range of partition ids int incorrectRank = -1; for (int i = 0; i < decompEntries.size(); i++) if (decompEntries[i] >= numProcs || decompEntries[i] < 0) { incorrectRank = myRank; break; } int incorrectGlobalRank = -1; MueLu_maxAll(comm, incorrectRank, incorrectGlobalRank); TEUCHOS_TEST_FOR_EXCEPTION(incorrectGlobalRank >- 1, Exceptions::RuntimeError, "pid " + Teuchos::toString(incorrectGlobalRank) + " encountered a partition number is that out-of-range"); #endif Array<GO> myGIDs; myGIDs.reserve(decomposition->getLocalLength()); // Step 0: Construct mapping // part number -> GIDs I own which belong to this part // NOTE: my own part GIDs are not part of the map typedef std::map<GO, Array<GO> > map_type; map_type sendMap; for (LO i = 0; i < decompEntries.size(); i++) { GO id = decompEntries[i]; GO GID = rowMap->getGlobalElement(i); if (id == myRank) myGIDs .push_back(GID); else sendMap[id].push_back(GID); } decompEntries = Teuchos::null; if (IsPrint(Statistics2)) { GO numLocalKept = myGIDs.size(), numGlobalKept, numGlobalRows = A->getGlobalNumRows(); MueLu_sumAll(comm,numLocalKept, numGlobalKept); GetOStream(Statistics2) << "Unmoved rows: " << numGlobalKept << " / " << numGlobalRows << " (" << 100*Teuchos::as<double>(numGlobalKept)/numGlobalRows << "%)" << std::endl; } int numSend = sendMap.size(), numRecv; // Arrayify map keys Array<GO> myParts(numSend), myPart(1); int cnt = 0; myPart[0] = myRank; for (typename map_type::const_iterator it = sendMap.begin(); it != sendMap.end(); it++) myParts[cnt++] = it->first; // Step 1: Find out how many processors send me data // partsIndexBase starts from zero, as the processors ids start from zero GO partsIndexBase = 0; RCP<Map> partsIHave = MapFactory ::Build(lib, Teuchos::OrdinalTraits<Xpetra::global_size_t>::invalid(), myParts(), partsIndexBase, comm); RCP<Map> partsIOwn = MapFactory ::Build(lib, numProcs, myPart(), partsIndexBase, comm); RCP<Export> partsExport = ExportFactory::Build(partsIHave, partsIOwn); RCP<GOVector> partsISend = Xpetra::VectorFactory<GO, LO, GO, NO>::Build(partsIHave); RCP<GOVector> numPartsIRecv = Xpetra::VectorFactory<GO, LO, GO, NO>::Build(partsIOwn); if (numSend) { ArrayRCP<GO> partsISendData = partsISend->getDataNonConst(0); for (int i = 0; i < numSend; i++) partsISendData[i] = 1; } (numPartsIRecv->getDataNonConst(0))[0] = 0; numPartsIRecv->doExport(*partsISend, *partsExport, Xpetra::ADD); numRecv = (numPartsIRecv->getData(0))[0]; // Step 2: Get my GIDs from everybody else MPI_Datatype MpiType = MpiTypeTraits<GO>::getType(); int msgTag = 12345; // TODO: use Comm::dup for all internal messaging // Post sends Array<MPI_Request> sendReqs(numSend); cnt = 0; for (typename map_type::iterator it = sendMap.begin(); it != sendMap.end(); it++) MPI_Isend(static_cast<void*>(it->second.getRawPtr()), it->second.size(), MpiType, Teuchos::as<GO>(it->first), msgTag, *rawMpiComm, &sendReqs[cnt++]); map_type recvMap; size_t totalGIDs = myGIDs.size(); for (int i = 0; i < numRecv; i++) { MPI_Status status; MPI_Probe(MPI_ANY_SOURCE, msgTag, *rawMpiComm, &status); // Get rank and number of elements from status int fromRank = status.MPI_SOURCE, count; MPI_Get_count(&status, MpiType, &count); recvMap[fromRank].resize(count); MPI_Recv(static_cast<void*>(recvMap[fromRank].getRawPtr()), count, MpiType, fromRank, msgTag, *rawMpiComm, &status); totalGIDs += count; } // Do waits on send requests if (numSend) { Array<MPI_Status> sendStatuses(numSend); MPI_Waitall(numSend, sendReqs.getRawPtr(), sendStatuses.getRawPtr()); } // Merge GIDs myGIDs.reserve(totalGIDs); for (typename map_type::const_iterator it = recvMap.begin(); it != recvMap.end(); it++) { int offset = myGIDs.size(), len = it->second.size(); if (len) { myGIDs.resize(offset + len); memcpy(myGIDs.getRawPtr() + offset, it->second.getRawPtr(), len*sizeof(GO)); } } // NOTE 2: The general sorting algorithm could be sped up by using the knowledge that original myGIDs and all received chunks // (i.e. it->second) are sorted. Therefore, a merge sort would work well in this situation. std::sort(myGIDs.begin(), myGIDs.end()); // Step 3: Construct importer RCP<Map> newRowMap = MapFactory ::Build(lib, rowMap->getGlobalNumElements(), myGIDs(), indexBase, origComm); RCP<const Import> rowMapImporter; { SubFactoryMonitor m1(*this, "Import construction", currentLevel); rowMapImporter = ImportFactory::Build(rowMap, newRowMap); } Set(currentLevel, "Importer", rowMapImporter); // ====================================================================================================== // Print some data // ====================================================================================================== if (pL.get<bool>("repartition: print partition distribution") && IsPrint(Statistics2)) { // Print the grid of processors GetOStream(Statistics2) << "Partition distribution over cores (ownership is indicated by '+')" << std::endl; char amActive = (myGIDs.size() ? 1 : 0); std::vector<char> areActive(numProcs, 0); MPI_Gather(&amActive, 1, MPI_CHAR, &areActive[0], 1, MPI_CHAR, 0, *rawMpiComm); int rowWidth = std::min(Teuchos::as<int>(ceil(sqrt(numProcs))), 100); for (int proc = 0; proc < numProcs; proc += rowWidth) { for (int j = 0; j < rowWidth; j++) if (proc + j < numProcs) GetOStream(Statistics2) << (areActive[proc + j] ? "+" : "."); else GetOStream(Statistics2) << " "; GetOStream(Statistics2) << " " << proc << ":" << std::min(proc + rowWidth, numProcs) - 1 << std::endl; } } } // Build
/** * This method contains most of the MPI code that coordinates the efforts * among the crawlers. This method doesn't return until the root MPI process * recieves a SIGTERM signal. * * One of the crawlers is designated the root crawler based on its MPI rank. * The root crawler instantiates a KeyspaceMapping object to coordinate the * keyspace mapping among the crawlers. In reality the root crawler forks its * tripcode generating thread and continues to listen for KeyspacePool * requests in the main thread. * * When the root crawler recieves a SIGTERM signal, it signals all of the * crawlers to finish their current pools and optionally serialize the * KeyspaceMapping object to disk to allow for the search to be resumed in the * future. * * \fixme Catching the SIGTERM signal in a thread that makes MPI calls might * not be safe. See section 2.9.2 of the MPI specification. */ void TripcodeCrawler::run() { int worldRank, worldSize; MPI_Comm_rank(MPI_COMM_WORLD, &worldRank); MPI_Comm_size(MPI_COMM_WORLD, &worldSize); if(worldRank == ROOT_RANK) { /// \todo Spawn a thread so the root process can compute tripcodes and /// coordinate the threads at the same time. while(true) { cout << "doing things" << endl; MPI_Status status; // blocking receive for requests for keyspace pools MPI_Recv(NULL, 0, MPI_INT, MPI_ANY_SOURCE, KEYSPACE_REQUEST, MPI_COMM_WORLD, &status); /// \todo Need to document the ownership of a lot of these buffers. // construct a KeyspacePool object suitable for serialization and // transmission assert(m_keyspaceMapping != NULL); KeyspacePool *keyspacePool = m_keyspaceMapping->getNextPool(); size_t poolDataSize; uint8_t *poolData = keyspacePool->serialize(&poolDataSize); // blocking response to keyspace pool request with serialized // KeyspacePool object MPI_Send(poolData, static_cast<int>(poolDataSize), MPI_BYTE, status.MPI_SOURCE, KEYSPACE_RESPONSE, MPI_COMM_WORLD); delete keyspacePool; delete poolData; } } else { while(true) { MPI_Status status; // request a new keyspace pool MPI_Send(NULL, 0, MPI_INT, ROOT_RANK, KEYSPACE_REQUEST, MPI_COMM_WORLD); // recieve the serialized KeyspacePool object MPI_Probe(ROOT_RANK, KEYSPACE_RESPONSE, MPI_COMM_WORLD, &status); size_t poolDataSize; MPI_Get_count(&status, MPI_BYTE, reinterpret_cast<int*>(poolDataSize)); uint8_t *poolData = new uint8_t[poolDataSize]; MPI_Recv(poolData, static_cast<int>(poolDataSize), MPI_BYTE, ROOT_RANK, KEYSPACE_RESPONSE, MPI_COMM_WORLD, &status); KeyspacePool *keyspacePool = KeyspacePoolFactory::singleton()->createKeyspacePool(poolData, poolDataSize); delete poolData; /// \todo We might want to explore the speed benefit /// of a custom memory allocater here and a few other places. TripcodeContainer tripcodes, matches; KeyBlock *currentBlock; while((currentBlock = keyspacePool->getNextBlock()) != NULL) { m_tripcodeAlgorithm->computeTripcodes(currentBlock, &tripcodes); m_matchingAlgorithm->matchTripcodes(&tripcodes, &matches); } // TODO: send TripcodeSearchResult to ROOT_RANK delete keyspacePool; // TODO: check for termination signal } } }
int main(int argc, char** argv) { srand(time(NULL)); int numtasks, worldrank, rank, i, tag=1; int coords[2]; int dims[2], periods[2]={0, 0}, reorder=0; int cart_coords[2]; int target_rank; dims[0] = X; dims[1] = Y; // Two arrays of fish, one for the whole (only used in initialization // and the other to hold each process's groups) fish_group my_groups[POPULATION]; int num_fish_in_cell = 0; net nets[NETS]; MPI_Comm cartcomm; // createDimensions(dims, SIZE); MPI_Init(&argc, &argv); create_mpi_datatypes(); // Get rank and size of the world MPI_Comm_size(MPI_COMM_WORLD, &numtasks); MPI_Comm_rank(MPI_COMM_WORLD, &worldrank); MPI_Cart_create(MPI_COMM_WORLD, 2, dims, periods, reorder, &cartcomm); MPI_Comm_rank(cartcomm, &rank); MPI_Cart_coords(cartcomm, rank, 2, coords); if(rank == 0){ // Populate the world printf("World: %dx%d (size:%dx%d)\n", dims[0], dims[1], WORLD_HEIGHT, WORLD_WIDTH); populate(my_groups, POPULATION, WORLD_WIDTH, WORLD_HEIGHT); num_fish_in_cell = POPULATION; for(i = 0; i < POPULATION; i++){ printf("fish: %d, x=%d, y=%d\n", my_groups[i].num, my_groups[i].x, my_groups[i].y); } for(i = 0; i < NETS; i++){ nets[i] = get_net(WORLD_WIDTH, WORLD_HEIGHT); printf("net: %d, x=%d, y=%d\n", i, nets[i].x, nets[i].y); } } MPI_Bcast(nets, NETS, mpi_net, 0, cartcomm); int j = 0; fish_group received[POPULATION]; MPI_Request recv_request; MPI_Irecv(received, POPULATION, mpi_fish_group, MPI_ANY_SOURCE, tag, cartcomm, &recv_request); while(j++ < ITERATIONS){ int sends = 0; int testdone, probedone; MPI_Request requests[numtasks]; // MPI_Status status[numtasks]; MPI_Status recv_status, probe_status; int send_counts[numtasks]; for(i = 0; i < numtasks; i++){ send_counts[i] = 0; } fish_group send_objects[numtasks][num_fish_in_cell]; for(i = 0; i < num_fish_in_cell; i++){ if(my_groups[i].num == 0){ remove_element(my_groups, i, num_fish_in_cell--); i--; continue; } get_cart_coords(cart_coords, &my_groups[i], WORLD_WIDTH, WORLD_HEIGHT, dims[0], dims[1]); MPI_Cart_rank(cartcomm, cart_coords, &target_rank); if(rank != target_rank){ int sc = send_counts[target_rank]; send_counts[target_rank] += 1; send_objects[target_rank][sc] = my_groups[i]; remove_element(my_groups, i, num_fish_in_cell--); i--; } } for(i = 0; i < numtasks; i++){ if(send_counts[i] > 0){ // printf("Rank %d: Sending %d groups to %d in loop %d\n", rank, send_counts[i], i, j); MPI_Isend(send_objects[i], send_counts[i], mpi_fish_group, i, tag, cartcomm, &requests[sends++]); // MPI_Send(send_objects[i], send_counts[i], mpi_fish_group, // i, tag, cartcomm); } } MPI_Barrier(cartcomm); struct timespec tim; tim.tv_sec = 0; // tim.tv_nsec = 500000000L; tim.tv_nsec = 50000000L; nanosleep(&tim, NULL); MPI_Iprobe(MPI_ANY_SOURCE, tag, cartcomm, &probedone, &probe_status); MPI_Test(&recv_request, &testdone, &recv_status); int count = 0; while(testdone){ MPI_Get_count(&recv_status, mpi_fish_group, &count); for(i = 0; i < count; i++){ my_groups[num_fish_in_cell++] =received[i]; } MPI_Irecv(received, POPULATION, mpi_fish_group, MPI_ANY_SOURCE, tag, cartcomm, &recv_request); MPI_Iprobe(MPI_ANY_SOURCE, tag, cartcomm, &probedone, &probe_status); MPI_Test(&recv_request, &testdone, &recv_status); } if(OUTPUT){ for(i=0; i < num_fish_in_cell; i++){ printf("--fish-%d-%d-%d-%d-%d\n", j, my_groups[i].num, my_groups[i].x, my_groups[i].y, rank); } for(i=0; i < NETS; i++){ printf("--net-%d-%d-%d-%d-%d-%d\n", j, i, nets[i].fish, NET_SIZE, nets[i].x, nets[i].y); } } int last_catch[NETS]; if(rank == 0){ for(i=0; i < NETS; i++){ last_catch[i] = nets[i].fish; } } // Update the x,y position of every group according to it's movement speed. update(my_groups, num_fish_in_cell, nets, NETS); int recvnets = NETS*numtasks; net temp_nets[recvnets]; MPI_Gather(nets, NETS, mpi_net, temp_nets, NETS, mpi_net, 0, cartcomm); if(rank == 0){ int new_catch[NETS]; for(i=0; i < NETS; i++){ new_catch[i] = 0; } for(i=0; i < recvnets; i++){ int ind = i % NETS; int diff = temp_nets[i].fish - last_catch[ind]; new_catch[ind] += diff; } for(i=0; i < NETS; i++){ nets[i].fish += new_catch[i]; } } MPI_Bcast(nets, NETS, mpi_net, 0, cartcomm); } MPI_Finalize(); return 0; }
static int scr_swap_files_move( int have_outgoing, const char* file_send, scr_meta* meta_send, int rank_send, uLong* crc32_send, int have_incoming, const char* file_recv, scr_meta* meta_recv, int rank_recv, uLong* crc32_recv, MPI_Comm comm) { int rc = SCR_SUCCESS; MPI_Request request[2]; MPI_Status status[2]; /* allocate MPI send buffer */ char *buf_send = NULL; if (have_outgoing) { buf_send = (char*) scr_align_malloc(scr_mpi_buf_size, scr_page_size); if (buf_send == NULL) { scr_abort(-1, "Allocating memory: malloc(%ld) errno=%d %s @ %s:%d", scr_mpi_buf_size, errno, strerror(errno), __FILE__, __LINE__ ); return SCR_FAILURE; } } /* allocate MPI recv buffer */ char *buf_recv = NULL; if (have_incoming) { buf_recv = (char*) scr_align_malloc(scr_mpi_buf_size, scr_page_size); if (buf_recv == NULL) { scr_abort(-1, "Allocating memory: malloc(%ld) errno=%d %s @ %s:%d", scr_mpi_buf_size, errno, strerror(errno), __FILE__, __LINE__ ); return SCR_FAILURE; } } /* since we'll overwrite our send file in place with the recv file, * which may be larger, we need to keep track of how many bytes we've * sent and whether we've sent them all */ unsigned long filesize_send = 0; /* open our file */ int fd = -1; if (have_outgoing) { /* we'll overwrite our send file (or just read it if there is no incoming) */ filesize_send = scr_file_size(file_send); fd = scr_open(file_send, O_RDWR); if (fd < 0) { /* TODO: skip writes and return error? */ scr_abort(-1, "Opening file for send/recv: scr_open(%s, O_RDWR) errno=%d %s @ %s:%d", file_send, errno, strerror(errno), __FILE__, __LINE__ ); } } else if (have_incoming) { /* if we're in this branch, then we only have an incoming file, * so we'll write our recv file from scratch */ mode_t mode_file = scr_getmode(1, 1, 0); fd = scr_open(file_recv, O_WRONLY | O_CREAT | O_TRUNC, mode_file); if (fd < 0) { /* TODO: skip writes and return error? */ scr_abort(-1, "Opening file for recv: scr_open(%s, O_WRONLY | O_CREAT | O_TRUNC, ...) errno=%d %s @ %s:%d", file_recv, errno, strerror(errno), __FILE__, __LINE__ ); } } /* exchange file chunks */ int sending = 0; if (have_outgoing) { sending = 1; } int receiving = 0; if (have_incoming) { receiving = 1; } int nread, nwrite; off_t read_pos = 0, write_pos = 0; while (sending || receiving) { if (receiving) { /* prepare a buffer to receive up to scr_mpi_buf_size bytes */ MPI_Irecv(buf_recv, scr_mpi_buf_size, MPI_BYTE, rank_recv, 0, comm, &request[0]); } if (sending) { /* compute number of bytes to read */ unsigned long count = filesize_send - read_pos; if (count > scr_mpi_buf_size) { count = scr_mpi_buf_size; } /* read a chunk of up to scr_mpi_buf_size bytes into buf_send */ lseek(fd, read_pos, SEEK_SET); /* seek to read position */ nread = scr_read(file_send, fd, buf_send, count); if (scr_crc_on_copy && nread > 0) { *crc32_send = crc32(*crc32_send, (const Bytef*) buf_send, (uInt) nread); } if (nread < 0) { nread = 0; } read_pos += (off_t) nread; /* update read pointer */ /* send chunk (if nread is smaller than scr_mpi_buf_size, * then we've read the whole file) */ MPI_Isend(buf_send, nread, MPI_BYTE, rank_send, 0, comm, &request[1]); MPI_Wait(&request[1], &status[1]); /* check whether we've read the whole file */ if (filesize_send == read_pos && count < scr_mpi_buf_size) { sending = 0; } } if (receiving) { /* count the number of bytes received */ MPI_Wait(&request[0], &status[0]); MPI_Get_count(&status[0], MPI_BYTE, &nwrite); if (scr_crc_on_copy && nwrite > 0) { *crc32_recv = crc32(*crc32_recv, (const Bytef*) buf_recv, (uInt) nwrite); } /* write those bytes to file (if nwrite is smaller than scr_mpi_buf_size, * then we've received the whole file) */ lseek(fd, write_pos, SEEK_SET); /* seek to write position */ scr_write(file_recv, fd, buf_recv, nwrite); write_pos += (off_t) nwrite; /* update write pointer */ /* if nwrite is smaller than scr_mpi_buf_size, * then assume we've received the whole file */ if (nwrite < scr_mpi_buf_size) { receiving = 0; } } } /* close file and cleanup */ if (have_outgoing && have_incoming) { /* sent and received a file; close it, truncate it to corect size, rename it */ scr_close(file_send, fd); truncate(file_send, write_pos); rename(file_send, file_recv); } else if (have_outgoing) { /* only sent a file; close it, delete it, and remove its completion marker */ scr_close(file_send, fd); scr_file_unlink(file_send); } else if (have_incoming) { /* only received a file; just need to close it */ scr_close(file_recv, fd); } if (scr_crc_on_copy && have_outgoing) { uLong meta_send_crc; if (scr_meta_get_crc32(meta_send, &meta_send_crc) != SCR_SUCCESS) { /* we transfer this meta data across below, * so may as well update these fields so we can use them */ scr_meta_set_crc32(meta_send, *crc32_send); /* do not complete file send, we just deleted it above */ } else { /* TODO: we could check that the crc on the sent file matches and take some action if not */ } } /* free the MPI buffers */ scr_align_free(&buf_recv); scr_align_free(&buf_send); return rc; }
static cvmrecord_t *sliceCVM(const char *cvm_flatfile) { cvmrecord_t *cvmrecord; int32_t bufferedbytes, bytecount, recordcount; if (myID == theGroupSize - 1) { /* the last processor reads data and distribute to other processors*/ struct timeval starttime, endtime; float iotime = 0, memmovetime = 0; MPI_Request *isendreqs; MPI_Status *isendstats; FILE *fp; int fd, procid; struct stat statbuf; void *maxbuf; const point_t *intervaltable; off_t bytesent; int32_t offset; const int maxcount = (1 << 29) / sizeof(cvmrecord_t); const int maxbufsize = maxcount * sizeof(cvmrecord_t); fp = fopen(cvm_flatfile, "r"); if (fp == NULL) { fprintf(stderr, "Thread %d: Cannot open flat CVM file\n", myID); MPI_Abort(MPI_COMM_WORLD, ERROR); exit(1); } fd = fileno(fp); if (fstat(fd, &statbuf) != 0) { fprintf(stderr, "Thread %d: Cannot get the status of CVM file\n", myID); MPI_Abort(MPI_COMM_WORLD, ERROR); exit(1); } intervaltable = octor_getintervaltable(myOctree); /* for (procid = 0; procid <= myID; procid++) { fprintf(stderr, "interval[%d] = {%d, %d, %d}\n", procid, intervaltable[procid].x << 1, intervaltable[procid].y << 1, intervaltable[procid].z << 1); } */ bytesent = 0; maxbuf = malloc(maxbufsize) ; if (maxbuf == NULL) { fprintf(stderr, "Thread %d: Cannot allocate send buffer\n", myID); MPI_Abort(MPI_COMM_WORLD, ERROR); exit(1); } isendreqs = (MPI_Request *)malloc(sizeof(MPI_Request) * theGroupSize); isendstats = (MPI_Status *)malloc(sizeof(MPI_Status) * theGroupSize); if ((isendreqs == NULL) || (isendstats == NULL)) { fprintf(stderr, "Thread %d: Cannot allocate isend controls\n", myID); MPI_Abort(MPI_COMM_WORLD, ERROR); exit(1); } /* Try to read max number of CVM records as allowed */ gettimeofday(&starttime, NULL); recordcount = fread(maxbuf, sizeof(cvmrecord_t), maxbufsize / sizeof(cvmrecord_t), fp); gettimeofday(&endtime, NULL); iotime += (endtime.tv_sec - starttime.tv_sec) * 1000.0 + (endtime.tv_usec - starttime.tv_usec) / 1000.0; if (recordcount != maxbufsize / sizeof(cvmrecord_t)) { fprintf(stderr, "Thread %d: Cannot read-init buffer\n", myID); MPI_Abort(MPI_COMM_WORLD, ERROR); exit(1); } /* start with proc 0 */ procid = 0; while (procid < myID) { /* repeatedly fill the buffer */ point_t searchpoint, *point; int newreads; int isendcount = 0; /* we have recordcount to work with */ cvmrecord = (cvmrecord_t *)maxbuf; while (procid < myID) { /* repeatedly send out data */ searchpoint.x = intervaltable[procid + 1].x << 1; searchpoint.y = intervaltable[procid + 1].y << 1; searchpoint.z = intervaltable[procid + 1].z << 1; offset = zsearch(cvmrecord, recordcount, theCVMRecordSize, &searchpoint); point = (point_t *)(cvmrecord + offset); if ((point->x != searchpoint.x) || (point->y != searchpoint.y) || (point->z != searchpoint.z)) { break; } else { bytecount = offset * sizeof(cvmrecord_t); MPI_Isend(cvmrecord, bytecount, MPI_CHAR, procid, CVMRECORD_MSG, MPI_COMM_WORLD, &isendreqs[isendcount]); isendcount++; /* fprintf(stderr, "Procid = %d offset = %qd bytecount = %d\n", procid, (int64_t)bytesent, bytecount); */ bytesent += bytecount; /* prepare for the next processor */ recordcount -= offset; cvmrecord = (cvmrecord_t *)point; procid++; } } /* Wait till the data in the buffer has been sent */ MPI_Waitall(isendcount, isendreqs, isendstats); /* Move residual data to the beginning of the buffer and try to fill the newly free space */ bufferedbytes = sizeof(cvmrecord_t) * recordcount; gettimeofday(&starttime, NULL); memmove(maxbuf, cvmrecord, bufferedbytes); gettimeofday(&endtime, NULL); memmovetime += (endtime.tv_sec - starttime.tv_sec) * 1000.0 + (endtime.tv_usec - starttime.tv_usec) / 1000.0; gettimeofday(&starttime, NULL); newreads = fread((char *)maxbuf + bufferedbytes, sizeof(cvmrecord_t), maxcount - recordcount, fp); gettimeofday(&endtime, NULL); iotime += (endtime.tv_sec - starttime.tv_sec) * 1000.0 + (endtime.tv_usec - starttime.tv_usec) / 1000.0; recordcount += newreads; if (newreads == 0) break; } free(maxbuf); free(isendreqs); free(isendstats); /* I am supposed to accomodate the remaining octants */ bytecount = statbuf.st_size - bytesent; cvmrecord = (cvmrecord_t *)malloc(bytecount); if (cvmrecord == NULL) { fprintf(stderr, "Thread %d: out of memory\n", myID); MPI_Abort(MPI_COMM_WORLD, ERROR); exit(1); } /* fseek exiting the for loop has file cursor propertly */ #ifdef BIGBEN if (fseek(fp, bytesent, SEEK_SET) != 0) { fprintf(stderr, "Thread %d: fseeko failed\n", myID); MPI_Abort(MPI_COMM_WORLD, ERROR); exit(1); } #else if (fseeko(fp, bytesent, SEEK_SET) != 0) { fprintf(stderr, "Thread %d: fseeko failed\n", myID); MPI_Abort(MPI_COMM_WORLD, ERROR); exit(1); } #endif gettimeofday(&starttime, NULL); if (fread(cvmrecord, 1, bytecount, fp) != (size_t)bytecount) { fprintf(stderr, "Thread %d: fail to read the last chunk\n", myID); MPI_Abort(MPI_COMM_WORLD, ERROR); exit(1); } gettimeofday(&endtime, NULL); iotime += (endtime.tv_sec - starttime.tv_sec) * 1000.0 + (endtime.tv_usec - starttime.tv_usec) / 1000.0; /* fprintf(stderr, "Procid = %d offset = %qd bytecount = %d\n", myID, (int64_t)bytesent, bytecount); */ fclose(fp); fprintf(stdout, "Read %s (%.2fMB) in %.2f seconds (%.2fMB/sec)\n", cvm_flatfile, (float)statbuf.st_size / (1 << 20), iotime / 1000, (float)statbuf.st_size / (1 << 20) / (iotime / 1000)); fprintf(stdout, "Memmove takes %.2f seconds\n", (float)memmovetime / 1000); } else { /* wait for my turn till PE(n - 1) tells me to go ahead */ MPI_Status status; MPI_Probe(theGroupSize - 1, CVMRECORD_MSG, MPI_COMM_WORLD, &status); MPI_Get_count(&status, MPI_CHAR, &bytecount); cvmrecord = (cvmrecord_t *)malloc(bytecount); if (cvmrecord == NULL) { fprintf(stderr, "Thread %d: out of memory\n", myID); MPI_Abort(MPI_COMM_WORLD, ERROR); exit(1); } MPI_Recv(cvmrecord, bytecount, MPI_CHAR, theGroupSize - 1, CVMRECORD_MSG, MPI_COMM_WORLD, &status); } /* Every processor should set these parameters correctly */ theCVMRecordCount = bytecount / sizeof(cvmrecord_t); if (theCVMRecordCount * sizeof(cvmrecord_t) != (size_t)bytecount) { fprintf(stderr, "Thread %d: received corrupted CVM data\n", myID); MPI_Abort(MPI_COMM_WORLD, ERROR); exit(1); } return cvmrecord; }
uint32_t MP_recv(uint32_t maxlength, StgWord8 *destination, OpCode *retcode, PEId *sender) { /* MPI: Use MPI_Probe to get size, sender and code; check maxlength; * receive required data size (must be known when receiving in * MPI) * No special buffer is needed here, can receive into *destination. */ int source, size, code; int haveSysMsg = rtsFalse; code = MIN_PEOPS-1; IF_PAR_DEBUG(mpcomm, debugBelch("MP_recv for MPI.\n")); // priority for system messages, probed before accepting anything // non-blocking probe, MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, sysComm, &haveSysMsg, &status); // blocking probe for other message, returns source and tag in status if (!haveSysMsg) { // there is no system message: get metadata of first msg on MPI_COMM_WORLD MPI_Probe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status); source = status.MPI_SOURCE; code = status.MPI_TAG; } else { // there is a system message: // still need to probe on MPI_COMM_WORLD for the system message to // get status of the message and the messages size source = status.MPI_SOURCE; code = status.MPI_TAG; MPI_Probe(source, code, MPI_COMM_WORLD, &status); } if (status.MPI_ERROR != MPI_SUCCESS) { debugBelch("MPI: Error receiving message.\n"); barf("PE %d aborting execution.\n", thisPE); } // get and check msg. size // size = status.st_length; MPI_Get_count(&status, MPI_BYTE, &size); if (maxlength < (uint32_t)size) barf("wrong MPI message length (%d, too big)!!!", size); MPI_Recv(destination, size, MPI_BYTE, source, code, MPI_COMM_WORLD, &status); *retcode = status.MPI_TAG; *sender = 1+status.MPI_SOURCE; // If we received a sys-message on COMM_WORLD, we need to receive it // also on sysComm: // if MPI_Iprobe on sysComm has failed, a sysMessage might have // arived later - // don't use haveSysMsg for the decision, use ISSYSCODE(code) instead. if (ISSYSCODE(code)) { MPI_Recv(&pingMessage2, 1, MPI_INT, source, code, sysComm, &status); if (code == PP_FINISH) { finishRecvd++; } } IF_PAR_DEBUG(mpcomm, debugBelch("MPI Message from PE %d with code %d.\n", *sender, *retcode)); ASSERT(*sender == (PEId)source+1 && *retcode == (OpCode) code); return (uint32_t) size; }
int main(int argc, char **argv) { mesh_t *mesh; double double_message[5]; double x, y, z, factor = 0, vscut = 0; double elapsedtime; #ifndef NO_OUTPUT int32_t eindex; int32_t remains, batch, idx; mrecord_t *partTable; #endif MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &myID); MPI_Comm_size(MPI_COMM_WORLD, &theGroupSize); /* Read commandline arguments */ if (argc != 5) { if (myID == 0) { fprintf(stderr, "usage: qmesh cvmdb physics.in numerical.in "); fprintf(stderr, "meshdb\n"); fprintf(stderr, "cvmdb: path to an etree database or a flat file.\n"); fprintf(stderr, "physics.in: path to physics.in.\n"); fprintf(stderr, "numerical.in: path to numerical.in.\n"); fprintf(stderr, "meshetree: path to the output mesh etree.\n"); fprintf(stderr, "\n"); } MPI_Finalize(); return -1; } /* Change the working directory to $LOCAL */ /* localpath = getenv("LOCAL"); if (localpath == NULL) { fprintf(stderr, "Thread %d: Cannot get $LOCAL value\n", myID); MPI_Abort(MPI_COMM_WORLD, ERROR); exit(1); } if (chdir(localpath) != 0) { fprintf(stderr, "Thread %d: Cannot chdir to %s\n", myID, localpath); MPI_Abort(MPI_COMM_WORLD, ERROR); exit(1); } */ /* Replicate the material database among the processors */ /* if ((theGroupSize - 1) / PROCPERNODE >= 1) { MPI_Comm replica_comm; if (myID % PROCPERNODE != 0) { MPI_Comm_split(MPI_COMM_WORLD, MPI_UNDEFINED, myID, &replica_comm); } else { int replica_id; off_t filesize, remains, batchsize; void *filebuf; int fd; MPI_Comm_split(MPI_COMM_WORLD, 0, myID, &replica_comm); MPI_Comm_rank(replica_comm, &replica_id); if (replica_id == 0) { struct stat statbuf; if (stat(argv[1], &statbuf) != 0) { fprintf(stderr, "Thread 0: Cannot get stat of %s\n", argv[1]); MPI_Abort(MPI_COMM_WORLD, ERROR); exit(1); } filesize = statbuf.st_size; } MPI_Bcast(&filesize, sizeof(off_t), MPI_CHAR, 0, replica_comm); if ((filebuf = malloc(FILEBUFSIZE)) == NULL) { fprintf(stderr, "Thread %d: run out of memory while ", myID); fprintf(stderr, "preparing to receive material database\n"); MPI_Abort(MPI_COMM_WORLD, ERROR); exit(1); } fd = (replica_id == 0) ? open(argv[1], O_RDONLY) : open(argv[1], O_CREAT|O_TRUNC|O_WRONLY, S_IRUSR|S_IWUSR); if (fd == -1) { fprintf(stderr, "Thread %d: Cannot create replica database\n", myID); perror("open"); MPI_Abort(MPI_COMM_WORLD, ERROR); exit(1); } remains = filesize; while (remains > 0) { batchsize = (remains > FILEBUFSIZE) ? FILEBUFSIZE : remains; if (replica_id == 0) { if (read(fd, filebuf, batchsize) != batchsize) { fprintf(stderr, "Thread 0: Cannot read database\n"); perror("read"); MPI_Abort(MPI_COMM_WORLD, ERROR); exit(1); } } MPI_Bcast(filebuf, batchsize, MPI_CHAR, 0, replica_comm); if (replica_id != 0) { if (write(fd, filebuf, batchsize) != batchsize) { fprintf(stderr, "Thread %d: Cannot write replica ", myID); fprintf(stderr, "database\n"); MPI_Abort(MPI_COMM_WORLD, ERROR); exit(1); } } remains -= batchsize; } if (close(fd) != 0) { fprintf(stderr, "Thread %d: cannot close replica database\n", myID); perror("close"); MPI_Abort(MPI_COMM_WORLD, ERROR); exit(1); } } MPI_Barrier(MPI_COMM_WORLD); } */ /* Initialize static global varialbes */ if (myID == 0) { /* Processor 0 reads the parameters */ if (initparameters(argv[2], argv[3], &x, &y, &z) != 0) { fprintf(stderr, "Thread %d: Cannot init parameters\n", myID); MPI_Abort(MPI_COMM_WORLD, ERROR); exit(1); } factor = theFactor; vscut = theVsCut; double_message[0] = x; double_message[1] = y; double_message[2] = z; double_message[3] = factor; double_message[4] = vscut; /* fprintf(stderr, "&double_message[0] = %p\n", &double_message[0]); fprintf(stderr, "&double_message[4] = %p\n", &double_message[4]); fprintf(stderr, "Thread 0: %f %f %f %f %f\n", x, y, z, factor, vscut); */ } MPI_Bcast(double_message, 5, MPI_DOUBLE, 0, MPI_COMM_WORLD); x = double_message[0]; y = double_message[1]; z = double_message[2]; factor = double_message[3]; vscut = double_message[4]; theNorth_m = x; theEast_m = y; theDepth_m = z; /* printf("Thread %d: %f %f %f %f %f\n", myID, x, y, z, factor, vscut); */ theFactor = factor; theVsCut = vscut; MPI_Barrier(MPI_COMM_WORLD); elapsedtime = -MPI_Wtime(); if (myID == 0) { fprintf(stdout, "PE = %d, Freq = %.2f\n", theGroupSize, theFreq); fprintf(stdout, "-----------------------------------------------\n"); } /*---- Generate and partition an unstructured octree mesh ----*/ if (myID == 0) { fprintf(stdout, "octor_newtree ... "); } /* * RICARDO: Carful with new_octree parameters (cutoff_depth) */ myOctree = octor_newtree(x, y, z, sizeof(edata_t), myID, theGroupSize, MPI_COMM_WORLD, 0); if (myOctree == NULL) { fprintf(stderr, "Thread %d: fail to create octree\n", myID); MPI_Abort(MPI_COMM_WORLD, ERROR); exit(1); } MPI_Barrier(MPI_COMM_WORLD); elapsedtime += MPI_Wtime(); if (myID == 0) { fprintf(stdout, "done.... %.2f seconds\n", elapsedtime); } #ifdef USECVMDB /* Open my copy of the material database */ theCVMEp = etree_open(argv[1], O_RDONLY, CVMBUFSIZE, 0, 0); if (!theCVMEp) { fprintf(stderr, "Thread %d: Cannot open CVM etree database %s\n", myID, argv[1]); MPI_Abort(MPI_COMM_WORLD, ERROR); exit(1); } #else /* Use flat data record file and distibute the data in memories */ elapsedtime = -MPI_Wtime(); if (myID == 0) { fprintf(stdout, "slicing CVM database ..."); } theCVMRecord = sliceCVM(argv[1]); MPI_Barrier(MPI_COMM_WORLD); elapsedtime += MPI_Wtime(); if (theCVMRecord == NULL) { fprintf(stderr, "Thread %d: Error obtaining the CVM records from %s\n", myID, argv[1]); MPI_Abort(MPI_COMM_WORLD, ERROR); exit(1); }; if (myID == 0) { fprintf(stdout, "done.... %.2f seconds\n", elapsedtime); } #endif elapsedtime = -MPI_Wtime(); if (myID == 0) { fprintf(stdout, "octor_refinetree ..."); } if (octor_refinetree(myOctree, toexpand, setrec) != 0) { fprintf(stderr, "Thread %d: fail to refine octree\n", myID); MPI_Abort(MPI_COMM_WORLD, ERROR); exit(1); } MPI_Barrier(MPI_COMM_WORLD); elapsedtime += MPI_Wtime(); if (myID == 0) { fprintf(stdout, "done.... %.2f seconds\n", elapsedtime); } elapsedtime = -MPI_Wtime(); if (myID == 0) { fprintf(stdout, "octor_balancetree ... "); } if (octor_balancetree(myOctree, setrec, 0) != 0) { /* no progressive meshing (ricardo) */ fprintf(stderr, "Thread %d: fail to balance octree\n", myID); MPI_Abort(MPI_COMM_WORLD, ERROR); exit(1); } MPI_Barrier(MPI_COMM_WORLD); elapsedtime += MPI_Wtime(); if (myID == 0) { fprintf(stdout, "done.... %.2f seconds\n", elapsedtime); } #ifdef USECVMDB /* Close the material database */ etree_close(theCVMEp); #else free(theCVMRecord); #endif /* USECVMDB */ elapsedtime = -MPI_Wtime(); if (myID == 0) { fprintf(stdout, "octor_partitiontree ..."); } if (octor_partitiontree(myOctree, NULL) != 0) { fprintf(stderr, "Thread %d: fail to balance load\n", myID); MPI_Abort(MPI_COMM_WORLD, ERROR); exit(1); } MPI_Barrier(MPI_COMM_WORLD); elapsedtime += MPI_Wtime(); if (myID == 0) { fprintf(stdout, "done.... %.2f seconds\n", elapsedtime); } elapsedtime = - MPI_Wtime(); if (myID == 0) { fprintf(stdout, "octor_extractmesh ... "); } mesh = octor_extractmesh(myOctree, NULL); if (mesh == NULL) { fprintf(stderr, "Thread %d: fail to extract mesh\n", myID); MPI_Abort(MPI_COMM_WORLD, ERROR); exit(1); } MPI_Barrier(MPI_COMM_WORLD); elapsedtime += MPI_Wtime(); if (myID == 0) { fprintf(stdout, "done.... %.2f seconds\n", elapsedtime); } /* We can do without the octree now */ octor_deletetree(myOctree); /*---- Obtain and print the statistics of the mesh ----*/ if (myID == 0) { int64_t etotal, ntotal, dntotal; int32_t received, procid; int32_t *enumTable, *nnumTable, *dnnumTable; int32_t rcvtrio[3]; /* Allocate the arrays to hold the statistics */ enumTable = (int32_t *)malloc(sizeof(int32_t) * theGroupSize); nnumTable = (int32_t *)malloc(sizeof(int32_t) * theGroupSize); dnnumTable = (int32_t *)malloc(sizeof(int32_t) * theGroupSize); if ((enumTable == NULL) || (nnumTable == NULL) || (dnnumTable == NULL)) { fprintf(stderr, "Thread 0: out of memory\n"); MPI_Abort(MPI_COMM_WORLD, ERROR); exit(1); } /* Fill in my counts */ enumTable[0] = mesh->lenum; nnumTable[0] = mesh->lnnum; dnnumTable[0] = mesh->ldnnum; /* Initialize sums */ etotal = mesh->lenum; ntotal = mesh->lnnum; dntotal = mesh->ldnnum; /* Fill in the rest of the tables */ received = 0; while (received < theGroupSize - 1) { int32_t fromwhom; MPI_Status status; MPI_Probe(MPI_ANY_SOURCE, STAT_MSG, MPI_COMM_WORLD, &status); fromwhom = status.MPI_SOURCE; MPI_Recv(rcvtrio, 3, MPI_INT, fromwhom, STAT_MSG, MPI_COMM_WORLD, &status); enumTable[fromwhom] = rcvtrio[0]; nnumTable[fromwhom] = rcvtrio[1]; dnnumTable[fromwhom] = rcvtrio[2]; etotal += rcvtrio[0]; ntotal += rcvtrio[1]; dntotal += rcvtrio[2]; received++; } fprintf(stdout, "Mesh statistics:\n"); fprintf(stdout, " Elements Nodes Danglings\n"); #ifdef ALPHA_TRU64UNIX_CC fprintf(stdout, "Total : %10ld%10ld %10ld\n\n", etotal, ntotal, dntotal); for (procid = 0; procid < theGroupSize; procid++) { fprintf(stdout, "Proc %5d: %10d%10d %10d\n", procid, enumTable[procid], nnumTable[procid], dnnumTable[procid]); } #else fprintf(stdout, "Total : %10qd%10qd %10qd\n\n", etotal, ntotal, dntotal); for (procid = 0; procid < theGroupSize; procid++) { fprintf(stdout, "Proc %5d: %10d%10d %10d\n", procid, enumTable[procid], nnumTable[procid], dnnumTable[procid]); } #endif free(enumTable); free(nnumTable); free(dnnumTable); } else { int32_t sndtrio[3]; sndtrio[0] = mesh->lenum; sndtrio[1] = mesh->lnnum; sndtrio[2] = mesh->ldnnum; MPI_Send(sndtrio, 3, MPI_INT, 0, STAT_MSG, MPI_COMM_WORLD); } #ifndef NO_OUTPUT /*---- Join elements and nodes, and send to Thread 0 for output */ /* Allocate a fixed size buffer space to store the join results */ partTable = (mrecord_t *)calloc(BATCH, sizeof(mrecord_t)); if (partTable == NULL) { fprintf(stderr, "Thread %d: out of memory\n", myID); MPI_Abort(MPI_COMM_WORLD, ERROR); exit(1); } if (myID == 0) { char *mEtree; etree_t *mep; int32_t procid; mEtree = argv[4]; mep = etree_open(mEtree, O_CREAT|O_RDWR|O_TRUNC, 0, sizeof(mdata_t),3); if (mep == NULL) { fprintf(stderr, "Thread 0: cannot create mesh etree\n"); MPI_Abort(MPI_COMM_WORLD, ERROR); exit(1); } /* Begin an appending operation */ if (etree_beginappend(mep, 1) != 0) { fprintf(stderr, "Thread 0: cannot begin an append operation\n"); MPI_Abort(MPI_COMM_WORLD, ERROR); exit(1); } eindex = 0; while (eindex < mesh->lenum) { remains = mesh->lenum - eindex; batch = (remains < BATCH) ? remains : BATCH; for (idx = 0; idx < batch; idx++) { mrecord_t *mrecord; int32_t whichnode; int32_t localnid0; mrecord = &partTable[idx]; /* Fill the address field */ localnid0 = mesh->elemTable[eindex].lnid[0]; mrecord->addr.x = mesh->nodeTable[localnid0].x; mrecord->addr.y = mesh->nodeTable[localnid0].y; mrecord->addr.z = mesh->nodeTable[localnid0].z; mrecord->addr.level = mesh->elemTable[eindex].level; mrecord->addr.type = ETREE_LEAF; /* Find the global node ids for the vertices */ for (whichnode = 0; whichnode < 8; whichnode++) { int32_t localnid; int64_t globalnid; localnid = mesh->elemTable[eindex].lnid[whichnode]; globalnid = mesh->nodeTable[localnid].gnid; mrecord->mdata.nid[whichnode] = globalnid; } memcpy(&mrecord->mdata.edgesize, mesh->elemTable[eindex].data, sizeof(edata_t)); eindex++; } /* for a batch */ if (bulkload(mep, partTable, batch) != 0) { fprintf(stderr, "Thread 0: Error bulk-loading data\n"); MPI_Abort(MPI_COMM_WORLD, ERROR); exit(1); } } /* for all the elements Thread 0 has */ /* Receive data from other processors */ for (procid = 1; procid < theGroupSize; procid++) { MPI_Status status; int32_t rcvbytecount; /* Signal the next processor to go ahead */ MPI_Send(NULL, 0, MPI_CHAR, procid, GOAHEAD_MSG, MPI_COMM_WORLD); while (1) { MPI_Probe(procid, MESH_MSG, MPI_COMM_WORLD, &status); MPI_Get_count(&status, MPI_CHAR, &rcvbytecount); batch = rcvbytecount / sizeof(mrecord_t); if (batch == 0) { /* Done */ break; } MPI_Recv(partTable, rcvbytecount, MPI_CHAR, procid, MESH_MSG, MPI_COMM_WORLD, &status); if (bulkload(mep, partTable, batch) != 0) { fprintf(stderr, "Thread 0: Cannot bulk-load data from "); fprintf(stderr, "Thread %d\n", procid); MPI_Abort(MPI_COMM_WORLD, ERROR); exit(1); } } /* while there is more data to be received from procid */ } /* for all the processors */ /* End the appending operation */ etree_endappend(mep); /* Close the mep to ensure the data is on disk */ if (etree_close(mep) != 0) { fprintf(stderr, "Thread 0: Cannot close the etree database\n"); MPI_Abort(MPI_COMM_WORLD, ERROR); exit(1); } } else { /* Processors other than 0 needs to send data to 0 */ int32_t sndbytecount; MPI_Status status; /* Wait for my turn */ MPI_Recv(NULL, 0, MPI_CHAR, 0, GOAHEAD_MSG, MPI_COMM_WORLD, &status); eindex = 0; while (eindex < mesh->lenum) { remains = mesh->lenum - eindex; batch = (remains < BATCH) ? remains : BATCH; for (idx = 0; idx < batch; idx++) { mrecord_t *mrecord; int32_t whichnode; int32_t localnid0; mrecord = &partTable[idx]; /* Fill the address field */ localnid0 = mesh->elemTable[eindex].lnid[0]; mrecord->addr.x = mesh->nodeTable[localnid0].x; mrecord->addr.y = mesh->nodeTable[localnid0].y; mrecord->addr.z = mesh->nodeTable[localnid0].z; mrecord->addr.level = mesh->elemTable[eindex].level; mrecord->addr.type = ETREE_LEAF; /* Find the global node ids for the vertices */ for (whichnode = 0; whichnode < 8; whichnode++) { int32_t localnid; int64_t globalnid; localnid = mesh->elemTable[eindex].lnid[whichnode]; globalnid = mesh->nodeTable[localnid].gnid; mrecord->mdata.nid[whichnode] = globalnid; } memcpy(&mrecord->mdata.edgesize, mesh->elemTable[eindex].data, sizeof(edata_t)); eindex++; } /* for a batch */ /* Send data to proc 0 */ sndbytecount = batch * sizeof(mrecord_t); MPI_Send(partTable, sndbytecount, MPI_CHAR, 0, MESH_MSG, MPI_COMM_WORLD); } /* While there is data left to be sent */ /* Send an empty message to indicate the end of my transfer */ MPI_Send(NULL, 0, MPI_CHAR, 0, MESH_MSG, MPI_COMM_WORLD); } /* Free the memory for the partial join results */ free(partTable); #endif octor_deletemesh(mesh); MPI_Finalize(); return 0; }
static PetscErrorCode MatIncreaseOverlap_MPISBAIJ_Once(Mat C,PetscInt is_max,IS is[]) { Mat_MPISBAIJ *c = (Mat_MPISBAIJ*)C->data; PetscErrorCode ierr; PetscMPIInt size,rank,tag1,tag2,*len_s,nrqr,nrqs,*id_r1,*len_r1,flag,len; const PetscInt *idx_i; PetscInt idx,isz,col,*n,*data1,**data1_start,*data2,*data2_i,*data,*data_i, Mbs,i,j,k,*odata1,*odata2, proc_id,**odata2_ptr,*ctable=0,*btable,len_max,len_est; PetscInt proc_end=0,*iwork,len_unused,nodata2; PetscInt ois_max; /* max no of is[] in each of processor */ char *t_p; MPI_Comm comm; MPI_Request *s_waits1,*s_waits2,r_req; MPI_Status *s_status,r_status; PetscBT *table; /* mark indices of this processor's is[] */ PetscBT table_i; PetscBT otable; /* mark indices of other processors' is[] */ PetscInt bs=C->rmap->bs,Bn = c->B->cmap->n,Bnbs = Bn/bs,*Bowners; IS garray_local,garray_gl; PetscFunctionBegin; comm = ((PetscObject)C)->comm; size = c->size; rank = c->rank; Mbs = c->Mbs; ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr); ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); /* create tables used in step 1: table[i] - mark c->garray of proc [i] step 3: table[i] - mark indices of is[i] when whose=MINE table[0] - mark incideces of is[] when whose=OTHER */ len = PetscMax(is_max, size);CHKERRQ(ierr); ierr = PetscMalloc2(len,PetscBT,&table,(Mbs/PETSC_BITS_PER_BYTE+1)*len,char,&t_p);CHKERRQ(ierr); for (i=0; i<len; i++) { table[i] = t_p + (Mbs/PETSC_BITS_PER_BYTE+1)*i; } ierr = MPI_Allreduce(&is_max,&ois_max,1,MPIU_INT,MPI_MAX,comm);CHKERRQ(ierr); /* 1. Send this processor's is[] to other processors */ /*---------------------------------------------------*/ /* allocate spaces */ ierr = PetscMalloc(is_max*sizeof(PetscInt),&n);CHKERRQ(ierr); len = 0; for (i=0; i<is_max; i++) { ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr); len += n[i]; } if (!len) { is_max = 0; } else { len += 1 + is_max; /* max length of data1 for one processor */ } ierr = PetscMalloc((size*len+1)*sizeof(PetscInt),&data1);CHKERRQ(ierr); ierr = PetscMalloc(size*sizeof(PetscInt*),&data1_start);CHKERRQ(ierr); for (i=0; i<size; i++) data1_start[i] = data1 + i*len; ierr = PetscMalloc4(size,PetscInt,&len_s,size,PetscInt,&btable,size,PetscInt,&iwork,size+1,PetscInt,&Bowners);CHKERRQ(ierr); /* gather c->garray from all processors */ ierr = ISCreateGeneral(comm,Bnbs,c->garray,&garray_local);CHKERRQ(ierr); ierr = ISAllGather(garray_local, &garray_gl);CHKERRQ(ierr); ierr = ISDestroy(garray_local);CHKERRQ(ierr); ierr = MPI_Allgather(&Bnbs,1,MPIU_INT,Bowners+1,1,MPIU_INT,comm);CHKERRQ(ierr); Bowners[0] = 0; for (i=0; i<size; i++) Bowners[i+1] += Bowners[i]; if (is_max){ /* hash table ctable which maps c->row to proc_id) */ ierr = PetscMalloc(Mbs*sizeof(PetscInt),&ctable);CHKERRQ(ierr); for (proc_id=0,j=0; proc_id<size; proc_id++) { for (; j<C->rmap->range[proc_id+1]/bs; j++) { ctable[j] = proc_id; } } /* hash tables marking c->garray */ ierr = ISGetIndices(garray_gl,&idx_i); for (i=0; i<size; i++){ table_i = table[i]; ierr = PetscBTMemzero(Mbs,table_i);CHKERRQ(ierr); for (j = Bowners[i]; j<Bowners[i+1]; j++){ /* go through B cols of proc[i]*/ ierr = PetscBTSet(table_i,idx_i[j]);CHKERRQ(ierr); } } ierr = ISRestoreIndices(garray_gl,&idx_i);CHKERRQ(ierr); } /* if (is_max) */ ierr = ISDestroy(garray_gl);CHKERRQ(ierr); /* evaluate communication - mesg to who, length, and buffer space */ for (i=0; i<size; i++) len_s[i] = 0; /* header of data1 */ for (proc_id=0; proc_id<size; proc_id++){ iwork[proc_id] = 0; *data1_start[proc_id] = is_max; data1_start[proc_id]++; for (j=0; j<is_max; j++) { if (proc_id == rank){ *data1_start[proc_id] = n[j]; } else { *data1_start[proc_id] = 0; } data1_start[proc_id]++; } } for (i=0; i<is_max; i++) { ierr = ISGetIndices(is[i],&idx_i);CHKERRQ(ierr); for (j=0; j<n[i]; j++){ idx = idx_i[j]; *data1_start[rank] = idx; data1_start[rank]++; /* for local proccessing */ proc_end = ctable[idx]; for (proc_id=0; proc_id<=proc_end; proc_id++){ /* for others to process */ if (proc_id == rank ) continue; /* done before this loop */ if (proc_id < proc_end && !PetscBTLookup(table[proc_id],idx)) continue; /* no need for sending idx to [proc_id] */ *data1_start[proc_id] = idx; data1_start[proc_id]++; len_s[proc_id]++; } } /* update header data */ for (proc_id=0; proc_id<size; proc_id++){ if (proc_id== rank) continue; *(data1 + proc_id*len + 1 + i) = len_s[proc_id] - iwork[proc_id]; iwork[proc_id] = len_s[proc_id] ; } ierr = ISRestoreIndices(is[i],&idx_i);CHKERRQ(ierr); } nrqs = 0; nrqr = 0; for (i=0; i<size; i++){ data1_start[i] = data1 + i*len; if (len_s[i]){ nrqs++; len_s[i] += 1 + is_max; /* add no. of header msg */ } } for (i=0; i<is_max; i++) { ierr = ISDestroy(is[i]);CHKERRQ(ierr); } ierr = PetscFree(n);CHKERRQ(ierr); ierr = PetscFree(ctable);CHKERRQ(ierr); /* Determine the number of messages to expect, their lengths, from from-ids */ ierr = PetscGatherNumberOfMessages(comm,PETSC_NULL,len_s,&nrqr);CHKERRQ(ierr); ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,len_s,&id_r1,&len_r1);CHKERRQ(ierr); /* Now post the sends */ ierr = PetscMalloc2(size,MPI_Request,&s_waits1,size,MPI_Request,&s_waits2);CHKERRQ(ierr); k = 0; for (proc_id=0; proc_id<size; proc_id++){ /* send data1 to processor [proc_id] */ if (len_s[proc_id]){ ierr = MPI_Isend(data1_start[proc_id],len_s[proc_id],MPIU_INT,proc_id,tag1,comm,s_waits1+k);CHKERRQ(ierr); k++; } } /* 2. Receive other's is[] and process. Then send back */ /*-----------------------------------------------------*/ len = 0; for (i=0; i<nrqr; i++){ if (len_r1[i] > len)len = len_r1[i]; } ierr = PetscFree(len_r1);CHKERRQ(ierr); ierr = PetscFree(id_r1);CHKERRQ(ierr); for (proc_id=0; proc_id<size; proc_id++) len_s[proc_id] = iwork[proc_id] = 0; ierr = PetscMalloc((len+1)*sizeof(PetscInt),&odata1);CHKERRQ(ierr); ierr = PetscMalloc(size*sizeof(PetscInt**),&odata2_ptr);CHKERRQ(ierr); ierr = PetscBTCreate(Mbs,otable);CHKERRQ(ierr); len_max = ois_max*(Mbs+1); /* max space storing all is[] for each receive */ len_est = 2*len_max; /* estimated space of storing is[] for all receiving messages */ ierr = PetscMalloc((len_est+1)*sizeof(PetscInt),&odata2);CHKERRQ(ierr); nodata2 = 0; /* nodata2+1: num of PetscMalloc(,&odata2_ptr[]) called */ odata2_ptr[nodata2] = odata2; len_unused = len_est; /* unused space in the array odata2_ptr[nodata2]-- needs to be >= len_max */ k = 0; while (k < nrqr){ /* Receive messages */ ierr = MPI_Iprobe(MPI_ANY_SOURCE,tag1,comm,&flag,&r_status);CHKERRQ(ierr); if (flag){ ierr = MPI_Get_count(&r_status,MPIU_INT,&len);CHKERRQ(ierr); proc_id = r_status.MPI_SOURCE; ierr = MPI_Irecv(odata1,len,MPIU_INT,proc_id,r_status.MPI_TAG,comm,&r_req);CHKERRQ(ierr); ierr = MPI_Wait(&r_req,&r_status);CHKERRQ(ierr); /* Process messages */ /* make sure there is enough unused space in odata2 array */ if (len_unused < len_max){ /* allocate more space for odata2 */ ierr = PetscMalloc((len_est+1)*sizeof(PetscInt),&odata2);CHKERRQ(ierr); odata2_ptr[++nodata2] = odata2; len_unused = len_est; } ierr = MatIncreaseOverlap_MPISBAIJ_Local(C,odata1,OTHER,odata2,&otable);CHKERRQ(ierr); len = 1 + odata2[0]; for (i=0; i<odata2[0]; i++){ len += odata2[1 + i]; } /* Send messages back */ ierr = MPI_Isend(odata2,len,MPIU_INT,proc_id,tag2,comm,s_waits2+k);CHKERRQ(ierr); k++; odata2 += len; len_unused -= len; len_s[proc_id] = len; /* num of messages sending back to [proc_id] by this proc */ } } ierr = PetscFree(odata1);CHKERRQ(ierr); ierr = PetscBTDestroy(otable);CHKERRQ(ierr); /* 3. Do local work on this processor's is[] */ /*-------------------------------------------*/ /* make sure there is enough unused space in odata2(=data) array */ len_max = is_max*(Mbs+1); /* max space storing all is[] for this processor */ if (len_unused < len_max){ /* allocate more space for odata2 */ ierr = PetscMalloc((len_est+1)*sizeof(PetscInt),&odata2);CHKERRQ(ierr); odata2_ptr[++nodata2] = odata2; len_unused = len_est; } data = odata2; ierr = MatIncreaseOverlap_MPISBAIJ_Local(C,data1_start[rank],MINE,data,table);CHKERRQ(ierr); ierr = PetscFree(data1_start);CHKERRQ(ierr); /* 4. Receive work done on other processors, then merge */ /*------------------------------------------------------*/ /* get max number of messages that this processor expects to recv */ ierr = MPI_Allreduce(len_s,iwork,size,MPIU_INT,MPI_MAX,comm);CHKERRQ(ierr); ierr = PetscMalloc((iwork[rank]+1)*sizeof(PetscInt),&data2);CHKERRQ(ierr); ierr = PetscFree4(len_s,btable,iwork,Bowners);CHKERRQ(ierr); k = 0; while (k < nrqs){ /* Receive messages */ ierr = MPI_Iprobe(MPI_ANY_SOURCE,tag2,comm,&flag,&r_status); if (flag){ ierr = MPI_Get_count(&r_status,MPIU_INT,&len);CHKERRQ(ierr); proc_id = r_status.MPI_SOURCE; ierr = MPI_Irecv(data2,len,MPIU_INT,proc_id,r_status.MPI_TAG,comm,&r_req);CHKERRQ(ierr); ierr = MPI_Wait(&r_req,&r_status);CHKERRQ(ierr); if (len > 1+is_max){ /* Add data2 into data */ data2_i = data2 + 1 + is_max; for (i=0; i<is_max; i++){ table_i = table[i]; data_i = data + 1 + is_max + Mbs*i; isz = data[1+i]; for (j=0; j<data2[1+i]; j++){ col = data2_i[j]; if (!PetscBTLookupSet(table_i,col)) {data_i[isz++] = col;} } data[1+i] = isz; if (i < is_max - 1) data2_i += data2[1+i]; } } k++; } } ierr = PetscFree(data2);CHKERRQ(ierr); ierr = PetscFree2(table,t_p);CHKERRQ(ierr); /* phase 1 sends are complete */ ierr = PetscMalloc(size*sizeof(MPI_Status),&s_status);CHKERRQ(ierr); if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);} ierr = PetscFree(data1);CHKERRQ(ierr); /* phase 2 sends are complete */ if (nrqr){ierr = MPI_Waitall(nrqr,s_waits2,s_status);CHKERRQ(ierr);} ierr = PetscFree2(s_waits1,s_waits2);CHKERRQ(ierr); ierr = PetscFree(s_status);CHKERRQ(ierr); /* 5. Create new is[] */ /*--------------------*/ for (i=0; i<is_max; i++) { data_i = data + 1 + is_max + Mbs*i; ierr = ISCreateGeneral(PETSC_COMM_SELF,data[1+i],data_i,is+i);CHKERRQ(ierr); } for (k=0; k<=nodata2; k++){ ierr = PetscFree(odata2_ptr[k]);CHKERRQ(ierr); } ierr = PetscFree(odata2_ptr);CHKERRQ(ierr); PetscFunctionReturn(0); }
Foam::UIPstream::UIPstream(const int fromProcNo, PstreamBuffers& buffers) : UPstream(buffers.commsType_), Istream(buffers.format_, buffers.version_), fromProcNo_(fromProcNo), externalBuf_(buffers.recvBuf_[fromProcNo]), externalBufPosition_(buffers.recvBufPos_[fromProcNo]), tag_(buffers.tag_), clearAtEnd_(true), messageSize_(0) { if (commsType() != UPstream::scheduled && !buffers.finishedSendsCalled_) { FatalErrorIn("UIPstream::UIPstream(const int, PstreamBuffers&)") << "PstreamBuffers::finishedSends() never called." << endl << "Please call PstreamBuffers::finishedSends() after doing" << " all your sends (using UOPstream) and before doing any" << " receives (using UIPstream)" << Foam::exit(FatalError); } setOpened(); setGood(); if (commsType() == UPstream::nonBlocking) { // Message is already received into externalBuf messageSize_ = buffers.recvBuf_[fromProcNo].size(); } else { MPI_Status status; label wantedSize = externalBuf_.capacity(); if (debug) { Pout<< "UIPstream::UIPstream PstreamBuffers :" << " read from:" << fromProcNo << " tag:" << tag_ << " wanted size:" << wantedSize << Foam::endl; } // If the buffer size is not specified, probe the incomming message // and set it if (!wantedSize) { MPI_Probe(procID(fromProcNo_), tag_, MPI_COMM_WORLD, &status); MPI_Get_count(&status, MPI_BYTE, &messageSize_); externalBuf_.setCapacity(messageSize_); wantedSize = messageSize_; if (debug) { Pout<< "UIPstream::UIPstream PstreamBuffers : probed size:" << wantedSize << Foam::endl; } } messageSize_ = UIPstream::read ( commsType(), fromProcNo_, externalBuf_.begin(), wantedSize, tag_ ); // Set addressed size. Leave actual allocated memory intact. externalBuf_.setSize(messageSize_); if (!messageSize_) { FatalErrorIn ( "UIPstream::UIPstream(const int, PstreamBuffers&)" ) << "read failed" << Foam::abort(FatalError); } } }
int_t pdgstrf /************************************************************************/ ( superlu_options_t *options, int m, int n, double anorm, LUstruct_t *LUstruct, gridinfo_t *grid, SuperLUStat_t *stat, int *info ) /* * Purpose * ======= * * PDGSTRF performs the LU factorization in parallel. * * Arguments * ========= * * options (input) superlu_options_t* * The structure defines the input parameters to control * how the LU decomposition will be performed. * The following field should be defined: * o ReplaceTinyPivot (yes_no_t) * Specifies whether to replace the tiny diagonals by * sqrt(epsilon)*norm(A) during LU factorization. * * m (input) int * Number of rows in the matrix. * * n (input) int * Number of columns in the matrix. * * anorm (input) double * The norm of the original matrix A, or the scaled A if * equilibration was done. * * LUstruct (input/output) LUstruct_t* * The data structures to store the distributed L and U factors. * The following fields should be defined: * * o Glu_persist (input) Glu_persist_t* * Global data structure (xsup, supno) replicated on all processes, * describing the supernode partition in the factored matrices * L and U: * xsup[s] is the leading column of the s-th supernode, * supno[i] is the supernode number to which column i belongs. * * o Llu (input/output) LocalLU_t* * The distributed data structures to store L and U factors. * See superlu_ddefs.h for the definition of 'LocalLU_t'. * * grid (input) gridinfo_t* * The 2D process mesh. It contains the MPI communicator, the number * of process rows (NPROW), the number of process columns (NPCOL), * and my process rank. It is an input argument to all the * parallel routines. * Grid can be initialized by subroutine SUPERLU_GRIDINIT. * See superlu_ddefs.h for the definition of 'gridinfo_t'. * * stat (output) SuperLUStat_t* * Record the statistics on runtime and floating-point operation count. * See util.h for the definition of 'SuperLUStat_t'. * * info (output) int* * = 0: successful exit * < 0: if info = -i, the i-th argument had an illegal value * > 0: if info = i, U(i,i) is exactly zero. The factorization has * been completed, but the factor U is exactly singular, * and division by zero will occur if it is used to solve a * system of equations. * */ { #ifdef _CRAY _fcd ftcs = _cptofcd("N", strlen("N")); _fcd ftcs1 = _cptofcd("L", strlen("L")); _fcd ftcs2 = _cptofcd("N", strlen("N")); _fcd ftcs3 = _cptofcd("U", strlen("U")); #endif double alpha = 1.0, beta = 0.0; int_t *xsup; int_t *lsub, *lsub1, *usub, *Usub_buf, *Lsub_buf_2[2]; /* Need 2 buffers to implement Irecv. */ double *lusup, *lusup1, *uval, *Uval_buf, *Lval_buf_2[2]; /* Need 2 buffers to implement Irecv. */ int_t fnz, i, ib, ijb, ilst, it, iukp, jb, jj, klst, knsupc, lb, lib, ldv, ljb, lptr, lptr0, lptrj, luptr, luptr0, luptrj, nlb, nub, nsupc, rel, rukp; int_t Pc, Pr; int iam, kcol, krow, mycol, myrow, pi, pj; int j, k, lk, nsupers; int nsupr, nbrow, segsize; int msgcnt[4]; /* Count the size of the message xfer'd in each buffer: * 0 : transferred in Lsub_buf[] * 1 : transferred in Lval_buf[] * 2 : transferred in Usub_buf[] * 3 : transferred in Uval_buf[] */ int_t msg0, msg2; int_t **Ufstnz_br_ptr, **Lrowind_bc_ptr; double **Unzval_br_ptr, **Lnzval_bc_ptr; int_t *index; double *nzval; int_t *iuip, *ruip;/* Pointers to U index/nzval; size ceil(NSUPERS/Pr). */ double *ucol; int_t *indirect; double *tempv, *tempv2d; int_t iinfo; int_t *ToRecv, *ToSendD, **ToSendR; Glu_persist_t *Glu_persist = LUstruct->Glu_persist; LocalLU_t *Llu = LUstruct->Llu; superlu_scope_t *scp; float s_eps; double thresh; double *tempU2d, *tempu; int full, ldt, ldu, lead_zero, ncols; MPI_Request recv_req[4], *send_req; MPI_Status status; #if ( DEBUGlevel>=2 ) int_t num_copy=0, num_update=0; #endif #if ( PRNTlevel==3 ) int_t zero_msg = 0, total_msg = 0; #endif #if ( PROFlevel>=1 ) double t1, t2; float msg_vol = 0, msg_cnt = 0; int_t iword = sizeof(int_t), dword = sizeof(double); #endif /* Test the input parameters. */ *info = 0; if ( m < 0 ) *info = -2; else if ( n < 0 ) *info = -3; if ( *info ) { pxerbla("pdgstrf", grid, -*info); return (-1); } /* Quick return if possible. */ if ( m == 0 || n == 0 ) return 0; /* * Initialization. */ iam = grid->iam; Pc = grid->npcol; Pr = grid->nprow; myrow = MYROW( iam, grid ); mycol = MYCOL( iam, grid ); nsupers = Glu_persist->supno[n-1] + 1; xsup = Glu_persist->xsup; s_eps = slamch_("Epsilon"); thresh = s_eps * anorm; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter pdgstrf()"); #endif stat->ops[FACT] = 0.0; if ( Pr*Pc > 1 ) { i = Llu->bufmax[0]; if ( !(Llu->Lsub_buf_2[0] = intMalloc_dist(2 * ((size_t)i))) ) ABORT("Malloc fails for Lsub_buf."); Llu->Lsub_buf_2[1] = Llu->Lsub_buf_2[0] + i; i = Llu->bufmax[1]; if ( !(Llu->Lval_buf_2[0] = doubleMalloc_dist(2 * ((size_t)i))) ) ABORT("Malloc fails for Lval_buf[]."); Llu->Lval_buf_2[1] = Llu->Lval_buf_2[0] + i; if ( Llu->bufmax[2] != 0 ) if ( !(Llu->Usub_buf = intMalloc_dist(Llu->bufmax[2])) ) ABORT("Malloc fails for Usub_buf[]."); if ( Llu->bufmax[3] != 0 ) if ( !(Llu->Uval_buf = doubleMalloc_dist(Llu->bufmax[3])) ) ABORT("Malloc fails for Uval_buf[]."); if ( !(send_req = (MPI_Request *) SUPERLU_MALLOC(2*Pc*sizeof(MPI_Request)))) ABORT("Malloc fails for send_req[]."); } if ( !(Llu->ujrow = doubleMalloc_dist(sp_ienv_dist(3))) ) ABORT("Malloc fails for ujrow[]."); #if ( PRNTlevel>=1 ) if ( !iam ) { printf(".. thresh = s_eps %e * anorm %e = %e\n", s_eps, anorm, thresh); printf(".. Buffer size: Lsub %d\tLval %d\tUsub %d\tUval %d\tLDA %d\n", Llu->bufmax[0], Llu->bufmax[1], Llu->bufmax[2], Llu->bufmax[3], Llu->bufmax[4]); } #endif Lsub_buf_2[0] = Llu->Lsub_buf_2[0]; Lsub_buf_2[1] = Llu->Lsub_buf_2[1]; Lval_buf_2[0] = Llu->Lval_buf_2[0]; Lval_buf_2[1] = Llu->Lval_buf_2[1]; Usub_buf = Llu->Usub_buf; Uval_buf = Llu->Uval_buf; Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; Unzval_br_ptr = Llu->Unzval_br_ptr; ToRecv = Llu->ToRecv; ToSendD = Llu->ToSendD; ToSendR = Llu->ToSendR; ldt = sp_ienv_dist(3); /* Size of maximum supernode */ if ( !(tempv2d = doubleCalloc_dist(2*((size_t)ldt)*ldt)) ) ABORT("Calloc fails for tempv2d[]."); tempU2d = tempv2d + ldt*ldt; if ( !(indirect = intMalloc_dist(ldt)) ) ABORT("Malloc fails for indirect[]."); k = CEILING( nsupers, Pr ); /* Number of local block rows */ if ( !(iuip = intMalloc_dist(k)) ) ABORT("Malloc fails for iuip[]."); if ( !(ruip = intMalloc_dist(k)) ) ABORT("Malloc fails for ruip[]."); #if ( VAMPIR>=1 ) VT_symdef(1, "Send-L", "Comm"); VT_symdef(2, "Recv-L", "Comm"); VT_symdef(3, "Send-U", "Comm"); VT_symdef(4, "Recv-U", "Comm"); VT_symdef(5, "TRF2", "Factor"); VT_symdef(100, "Factor", "Factor"); VT_begin(100); VT_traceon(); #endif /* --------------------------------------------------------------- Handle the first block column separately to start the pipeline. --------------------------------------------------------------- */ if ( mycol == 0 ) { #if ( VAMPIR>=1 ) VT_begin(5); #endif pdgstrf2(options, 0, thresh, Glu_persist, grid, Llu, stat, info); #if ( VAMPIR>=1 ) VT_end(5); #endif scp = &grid->rscp; /* The scope of process row. */ /* Process column *kcol* multicasts numeric values of L(:,k) to process rows. */ lsub = Lrowind_bc_ptr[0]; lusup = Lnzval_bc_ptr[0]; if ( lsub ) { msgcnt[0] = lsub[1] + BC_HEADER + lsub[0]*LB_DESCRIPTOR; msgcnt[1] = lsub[1] * SuperSize( 0 ); } else { msgcnt[0] = msgcnt[1] = 0; } for (pj = 0; pj < Pc; ++pj) { if ( ToSendR[0][pj] != EMPTY ) { #if ( PROFlevel>=1 ) TIC(t1); #endif #if ( VAMPIR>=1 ) VT_begin(1); #endif MPI_Isend( lsub, msgcnt[0], mpi_int_t, pj, 0, scp->comm, &send_req[pj] ); MPI_Isend( lusup, msgcnt[1], MPI_DOUBLE, pj, 1, scp->comm, &send_req[pj+Pc] ); #if ( DEBUGlevel>=2 ) printf("(%d) Send L(:,%4d): lsub %4d, lusup %4d to Pc %2d\n", iam, 0, msgcnt[0], msgcnt[1], pj); #endif #if ( VAMPIR>=1 ) VT_end(1); #endif #if ( PROFlevel>=1 ) TOC(t2, t1); stat->utime[COMM] += t2; msg_cnt += 2; msg_vol += msgcnt[0]*iword + msgcnt[1]*dword; #endif } } /* for pj ... */ } else { /* Post immediate receives. */ if ( ToRecv[0] >= 1 ) { /* Recv block column L(:,0). */ scp = &grid->rscp; /* The scope of process row. */ MPI_Irecv( Lsub_buf_2[0], Llu->bufmax[0], mpi_int_t, 0, 0, scp->comm, &recv_req[0] ); MPI_Irecv( Lval_buf_2[0], Llu->bufmax[1], MPI_DOUBLE, 0, 1, scp->comm, &recv_req[1] ); #if ( DEBUGlevel>=2 ) printf("(%d) Post Irecv L(:,%4d)\n", iam, 0); #endif } } /* if mycol == 0 */ /* ------------------------------------------ MAIN LOOP: Loop through all block columns. ------------------------------------------ */ for (k = 0; k < nsupers; ++k) { knsupc = SuperSize( k ); krow = PROW( k, grid ); kcol = PCOL( k, grid ); if ( mycol == kcol ) { lk = LBj( k, grid ); /* Local block number. */ for (pj = 0; pj < Pc; ++pj) { /* Wait for Isend to complete before using lsub/lusup. */ if ( ToSendR[lk][pj] != EMPTY ) { MPI_Wait( &send_req[pj], &status ); MPI_Wait( &send_req[pj+Pc], &status ); } } lsub = Lrowind_bc_ptr[lk]; lusup = Lnzval_bc_ptr[lk]; } else { if ( ToRecv[k] >= 1 ) { /* Recv block column L(:,k). */ scp = &grid->rscp; /* The scope of process row. */ #if ( PROFlevel>=1 ) TIC(t1); #endif #if ( VAMPIR>=1 ) VT_begin(2); #endif /*probe_recv(iam, kcol, (4*k)%NTAGS, mpi_int_t, scp->comm, Llu->bufmax[0]);*/ /*MPI_Recv( Lsub_buf, Llu->bufmax[0], mpi_int_t, kcol, (4*k)%NTAGS, scp->comm, &status );*/ MPI_Wait( &recv_req[0], &status ); MPI_Get_count( &status, mpi_int_t, &msgcnt[0] ); /*probe_recv(iam, kcol, (4*k+1)%NTAGS, MPI_DOUBLE, scp->comm, Llu->bufmax[1]);*/ /*MPI_Recv( Lval_buf, Llu->bufmax[1], MPI_DOUBLE, kcol, (4*k+1)%NTAGS, scp->comm, &status );*/ MPI_Wait( &recv_req[1], &status ); MPI_Get_count( &status, MPI_DOUBLE, &msgcnt[1] ); #if ( VAMPIR>=1 ) VT_end(2); #endif #if ( PROFlevel>=1 ) TOC(t2, t1); stat->utime[COMM] += t2; #endif #if ( DEBUGlevel>=2 ) printf("(%d) Recv L(:,%4d): lsub %4d, lusup %4d from Pc %2d\n", iam, k, msgcnt[0], msgcnt[1], kcol); fflush(stdout); #endif lsub = Lsub_buf_2[k%2]; lusup = Lval_buf_2[k%2]; #if ( PRNTlevel==3 ) ++total_msg; if ( !msgcnt[0] ) ++zero_msg; #endif } else msgcnt[0] = 0; } /* if mycol = Pc(k) */ scp = &grid->cscp; /* The scope of process column. */ if ( myrow == krow ) { /* Parallel triangular solve across process row *krow* -- U(k,j) = L(k,k) \ A(k,j). */ #ifdef _CRAY pdgstrs2(n, k, Glu_persist, grid, Llu, stat, ftcs1, ftcs2, ftcs3); #else pdgstrs2(n, k, Glu_persist, grid, Llu, stat); #endif /* Multicasts U(k,:) to process columns. */ lk = LBi( k, grid ); usub = Ufstnz_br_ptr[lk]; uval = Unzval_br_ptr[lk]; if ( usub ) { msgcnt[2] = usub[2]; msgcnt[3] = usub[1]; } else { msgcnt[2] = msgcnt[3] = 0; } if ( ToSendD[lk] == YES ) { for (pi = 0; pi < Pr; ++pi) { if ( pi != myrow ) { #if ( PROFlevel>=1 ) TIC(t1); #endif #if ( VAMPIR>=1 ) VT_begin(3); #endif MPI_Send( usub, msgcnt[2], mpi_int_t, pi, (4*k+2)%NTAGS, scp->comm); MPI_Send( uval, msgcnt[3], MPI_DOUBLE, pi, (4*k+3)%NTAGS, scp->comm); #if ( VAMPIR>=1 ) VT_end(3); #endif #if ( PROFlevel>=1 ) TOC(t2, t1); stat->utime[COMM] += t2; msg_cnt += 2; msg_vol += msgcnt[2]*iword + msgcnt[3]*dword; #endif #if ( DEBUGlevel>=2 ) printf("(%d) Send U(%4d,:) to Pr %2d\n", iam, k, pi); #endif } /* if pi ... */ } /* for pi ... */ } /* if ToSendD ... */ } else { /* myrow != krow */ if ( ToRecv[k] == 2 ) { /* Recv block row U(k,:). */ #if ( PROFlevel>=1 ) TIC(t1); #endif #if ( VAMPIR>=1 ) VT_begin(4); #endif /*probe_recv(iam, krow, (4*k+2)%NTAGS, mpi_int_t, scp->comm, Llu->bufmax[2]);*/ MPI_Recv( Usub_buf, Llu->bufmax[2], mpi_int_t, krow, (4*k+2)%NTAGS, scp->comm, &status ); MPI_Get_count( &status, mpi_int_t, &msgcnt[2] ); /*probe_recv(iam, krow, (4*k+3)%NTAGS, MPI_DOUBLE, scp->comm, Llu->bufmax[3]);*/ MPI_Recv( Uval_buf, Llu->bufmax[3], MPI_DOUBLE, krow, (4*k+3)%NTAGS, scp->comm, &status ); MPI_Get_count( &status, MPI_DOUBLE, &msgcnt[3] ); #if ( VAMPIR>=1 ) VT_end(4); #endif #if ( PROFlevel>=1 ) TOC(t2, t1); stat->utime[COMM] += t2; #endif usub = Usub_buf; uval = Uval_buf; #if ( DEBUGlevel>=2 ) printf("(%d) Recv U(%4d,:) from Pr %2d\n", iam, k, krow); #endif #if ( PRNTlevel==3 ) ++total_msg; if ( !msgcnt[2] ) ++zero_msg; #endif } else msgcnt[2] = 0; } /* if myrow == Pr(k) */ /* * Parallel rank-k update; pair up blocks L(i,k) and U(k,j). * for (j = k+1; k < N; ++k) { * for (i = k+1; i < N; ++i) * if ( myrow == PROW( i, grid ) && mycol == PCOL( j, grid ) * && L(i,k) != 0 && U(k,j) != 0 ) * A(i,j) = A(i,j) - L(i,k) * U(k,j); */ msg0 = msgcnt[0]; msg2 = msgcnt[2]; if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */ nsupr = lsub[1]; /* LDA of lusup. */ if ( myrow == krow ) { /* Skip diagonal block L(k,k). */ lptr0 = BC_HEADER + LB_DESCRIPTOR + lsub[BC_HEADER+1]; luptr0 = knsupc; nlb = lsub[0] - 1; } else { lptr0 = BC_HEADER; luptr0 = 0; nlb = lsub[0]; } lptr = lptr0; for (lb = 0; lb < nlb; ++lb) { /* Initialize block row pointers. */ ib = lsub[lptr]; lib = LBi( ib, grid ); iuip[lib] = BR_HEADER; ruip[lib] = 0; lptr += LB_DESCRIPTOR + lsub[lptr+1]; } nub = usub[0]; /* Number of blocks in the block row U(k,:) */ iukp = BR_HEADER; /* Skip header; Pointer to index[] of U(k,:) */ rukp = 0; /* Pointer to nzval[] of U(k,:) */ klst = FstBlockC( k+1 ); /* --------------------------------------------------- Update the first block column A(:,k+1). --------------------------------------------------- */ jb = usub[iukp]; /* Global block number of block U(k,j). */ if ( jb == k+1 ) { /* First update (k+1)-th block. */ --nub; lptr = lptr0; luptr = luptr0; ljb = LBj( jb, grid ); /* Local block number of U(k,j). */ nsupc = SuperSize( jb ); iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */ /* Prepare to call DGEMM. */ jj = iukp; while ( usub[jj] == klst ) ++jj; ldu = klst - usub[jj++]; ncols = 1; full = 1; for (; jj < iukp+nsupc; ++jj) { segsize = klst - usub[jj]; if ( segsize ) { ++ncols; if ( segsize != ldu ) full = 0; if ( segsize > ldu ) ldu = segsize; } } #if ( DEBUGlevel>=3 ) ++num_update; #endif if ( full ) { tempu = &uval[rukp]; } else { /* Copy block U(k,j) into tempU2d. */ #if ( DEBUGlevel>=3 ) printf("(%d) full=%d,k=%d,jb=%d,ldu=%d,ncols=%d,nsupc=%d\n", iam, full, k, jb, ldu, ncols, nsupc); ++num_copy; #endif tempu = tempU2d; for (jj = iukp; jj < iukp+nsupc; ++jj) { segsize = klst - usub[jj]; if ( segsize ) { lead_zero = ldu - segsize; for (i = 0; i < lead_zero; ++i) tempu[i] = 0.0; tempu += lead_zero; for (i = 0; i < segsize; ++i) tempu[i] = uval[rukp+i]; rukp += segsize; tempu += segsize; } } tempu = tempU2d; rukp -= usub[iukp - 1]; /* Return to start of U(k,j). */ } /* if full ... */ for (lb = 0; lb < nlb; ++lb) { ib = lsub[lptr]; /* Row block L(i,k). */ nbrow = lsub[lptr+1]; /* Number of full rows. */ lptr += LB_DESCRIPTOR; /* Skip descriptor. */ tempv = tempv2d; #ifdef _CRAY SGEMM(ftcs, ftcs, &nbrow, &ncols, &ldu, &alpha, &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, tempu, &ldu, &beta, tempv, &ldt); #elif defined (USE_VENDOR_BLAS) dgemm_("N", "N", &nbrow, &ncols, &ldu, &alpha, &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, tempu, &ldu, &beta, tempv, &ldt); #else hypre_F90_NAME_BLAS(dgemm,DGEMM)("N", "N", &nbrow, &ncols, &ldu, &alpha, &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, tempu, &ldu, &beta, tempv, &ldt, 1, 1); #endif stat->ops[FACT] += 2 * nbrow * ldu * ncols; /* Now gather the result into the destination block. */ if ( ib < jb ) { /* A(i,j) is in U. */ ilst = FstBlockC( ib+1 ); lib = LBi( ib, grid ); index = Ufstnz_br_ptr[lib]; ijb = index[iuip[lib]]; while ( ijb < jb ) { /* Search for dest block. */ ruip[lib] += index[iuip[lib]+1]; iuip[lib] += UB_DESCRIPTOR + SuperSize( ijb ); ijb = index[iuip[lib]]; } iuip[lib] += UB_DESCRIPTOR; /* Skip descriptor. */ tempv = tempv2d; for (jj = 0; jj < nsupc; ++jj) { segsize = klst - usub[iukp + jj]; fnz = index[iuip[lib]++]; if ( segsize ) { /* Nonzero segment in U(k.j). */ ucol = &Unzval_br_ptr[lib][ruip[lib]]; for (i = 0, it = 0; i < nbrow; ++i) { rel = lsub[lptr + i] - fnz; ucol[rel] -= tempv[it++]; } tempv += ldt; } ruip[lib] += ilst - fnz; } } else { /* A(i,j) is in L. */ index = Lrowind_bc_ptr[ljb]; ldv = index[1]; /* LDA of the dest lusup. */ lptrj = BC_HEADER; luptrj = 0; ijb = index[lptrj]; while ( ijb != ib ) { /* Search for dest block -- blocks are not ordered! */ luptrj += index[lptrj+1]; lptrj += LB_DESCRIPTOR + index[lptrj+1]; ijb = index[lptrj]; } /* * Build indirect table. This is needed because the * indices are not sorted. */ fnz = FstBlockC( ib ); lptrj += LB_DESCRIPTOR; for (i = 0; i < index[lptrj-1]; ++i) { rel = index[lptrj + i] - fnz; indirect[rel] = i; } nzval = Lnzval_bc_ptr[ljb] + luptrj; tempv = tempv2d; for (jj = 0; jj < nsupc; ++jj) { segsize = klst - usub[iukp + jj]; if ( segsize ) { /*#pragma _CRI cache_bypass nzval,tempv*/ for (it = 0, i = 0; i < nbrow; ++i) { rel = lsub[lptr + i] - fnz; nzval[indirect[rel]] -= tempv[it++]; } tempv += ldt; } nzval += ldv; } } /* if ib < jb ... */ lptr += nbrow; luptr += nbrow; } /* for lb ... */ rukp += usub[iukp - 1]; /* Move to block U(k,j+1) */ iukp += nsupc; } /* if jb == k+1 */ } /* if L(:,k) and U(k,:) not empty */ if ( k+1 < nsupers ) { kcol = PCOL( k+1, grid ); if ( mycol == kcol ) { #if ( VAMPIR>=1 ) VT_begin(5); #endif /* Factor diagonal and subdiagonal blocks and test for exact singularity. */ pdgstrf2(options, k+1, thresh, Glu_persist, grid, Llu, stat, info); #if ( VAMPIR>=1 ) VT_end(5); #endif /* Process column *kcol+1* multicasts numeric values of L(:,k+1) to process rows. */ lk = LBj( k+1, grid ); /* Local block number. */ lsub1 = Lrowind_bc_ptr[lk]; if ( lsub1 ) { msgcnt[0] = lsub1[1] + BC_HEADER + lsub1[0]*LB_DESCRIPTOR; msgcnt[1] = lsub1[1] * SuperSize( k+1 ); } else { msgcnt[0] = 0; msgcnt[1] = 0; } scp = &grid->rscp; /* The scope of process row. */ for (pj = 0; pj < Pc; ++pj) { if ( ToSendR[lk][pj] != EMPTY ) { lusup1 = Lnzval_bc_ptr[lk]; #if ( PROFlevel>=1 ) TIC(t1); #endif #if ( VAMPIR>=1 ) VT_begin(1); #endif MPI_Isend( lsub1, msgcnt[0], mpi_int_t, pj, (4*(k+1))%NTAGS, scp->comm, &send_req[pj] ); MPI_Isend( lusup1, msgcnt[1], MPI_DOUBLE, pj, (4*(k+1)+1)%NTAGS, scp->comm, &send_req[pj+Pc] ); #if ( VAMPIR>=1 ) VT_end(1); #endif #if ( PROFlevel>=1 ) TOC(t2, t1); stat->utime[COMM] += t2; msg_cnt += 2; msg_vol += msgcnt[0]*iword + msgcnt[1]*dword; #endif #if ( DEBUGlevel>=2 ) printf("(%d) Send L(:,%4d): lsub %4d, lusup %4d to Pc %2d\n", iam, k+1, msgcnt[0], msgcnt[1], pj); #endif } } /* for pj ... */ } else { /* Post Recv of block column L(:,k+1). */ if ( ToRecv[k+1] >= 1 ) { scp = &grid->rscp; /* The scope of process row. */ MPI_Irecv(Lsub_buf_2[(k+1)%2], Llu->bufmax[0], mpi_int_t, kcol, (4*(k+1))%NTAGS, scp->comm, &recv_req[0]); MPI_Irecv(Lval_buf_2[(k+1)%2], Llu->bufmax[1], MPI_DOUBLE, kcol, (4*(k+1)+1)%NTAGS, scp->comm, &recv_req[1]); #if ( DEBUGlevel>=2 ) printf("(%d) Post Irecv L(:,%4d)\n", iam, k+1); #endif } } /* if mycol == Pc(k+1) */ } /* if k+1 < nsupers */ if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */ /* --------------------------------------------------- Update all other blocks using block row U(k,:) --------------------------------------------------- */ for (j = 0; j < nub; ++j) { lptr = lptr0; luptr = luptr0; jb = usub[iukp]; /* Global block number of block U(k,j). */ ljb = LBj( jb, grid ); /* Local block number of U(k,j). */ nsupc = SuperSize( jb ); iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */ /* Prepare to call DGEMM. */ jj = iukp; while ( usub[jj] == klst ) ++jj; ldu = klst - usub[jj++]; ncols = 1; full = 1; for (; jj < iukp+nsupc; ++jj) { segsize = klst - usub[jj]; if ( segsize ) { ++ncols; if ( segsize != ldu ) full = 0; if ( segsize > ldu ) ldu = segsize; } } #if ( DEBUGlevel>=3 ) printf("(%d) full=%d,k=%d,jb=%d,ldu=%d,ncols=%d,nsupc=%d\n", iam, full, k, jb, ldu, ncols, nsupc); ++num_update; #endif if ( full ) { tempu = &uval[rukp]; } else { /* Copy block U(k,j) into tempU2d. */ #if ( DEBUGlevel>=3 ) ++num_copy; #endif tempu = tempU2d; for (jj = iukp; jj < iukp+nsupc; ++jj) { segsize = klst - usub[jj]; if ( segsize ) { lead_zero = ldu - segsize; for (i = 0; i < lead_zero; ++i) tempu[i] = 0.0; tempu += lead_zero; for (i = 0; i < segsize; ++i) tempu[i] = uval[rukp+i]; rukp += segsize; tempu += segsize; } } tempu = tempU2d; rukp -= usub[iukp - 1]; /* Return to start of U(k,j). */ } /* if full ... */ for (lb = 0; lb < nlb; ++lb) { ib = lsub[lptr]; /* Row block L(i,k). */ nbrow = lsub[lptr+1]; /* Number of full rows. */ lptr += LB_DESCRIPTOR; /* Skip descriptor. */ tempv = tempv2d; #ifdef _CRAY SGEMM(ftcs, ftcs, &nbrow, &ncols, &ldu, &alpha, &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, tempu, &ldu, &beta, tempv, &ldt); #elif defined (USE_VENDOR_BLAS) dgemm_("N", "N", &nbrow, &ncols, &ldu, &alpha, &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, tempu, &ldu, &beta, tempv, &ldt); #else hypre_F90_NAME_BLAS(dgemm,DGEMM)("N", "N", &nbrow, &ncols, &ldu, &alpha, &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, tempu, &ldu, &beta, tempv, &ldt, 1, 1); #endif stat->ops[FACT] += 2 * nbrow * ldu * ncols; /* Now gather the result into the destination block. */ if ( ib < jb ) { /* A(i,j) is in U. */ ilst = FstBlockC( ib+1 ); lib = LBi( ib, grid ); index = Ufstnz_br_ptr[lib]; ijb = index[iuip[lib]]; while ( ijb < jb ) { /* Search for dest block. */ ruip[lib] += index[iuip[lib]+1]; iuip[lib] += UB_DESCRIPTOR + SuperSize( ijb ); ijb = index[iuip[lib]]; } /* Skip descriptor. Now point to fstnz index of block U(i,j). */ iuip[lib] += UB_DESCRIPTOR; tempv = tempv2d; for (jj = 0; jj < nsupc; ++jj) { segsize = klst - usub[iukp + jj]; fnz = index[iuip[lib]++]; if ( segsize ) { /* Nonzero segment in U(k.j). */ ucol = &Unzval_br_ptr[lib][ruip[lib]]; for (i = 0 ; i < nbrow; ++i) { rel = lsub[lptr + i] - fnz; ucol[rel] -= tempv[i]; } tempv += ldt; } ruip[lib] += ilst - fnz; } } else { /* A(i,j) is in L. */ index = Lrowind_bc_ptr[ljb]; ldv = index[1]; /* LDA of the dest lusup. */ lptrj = BC_HEADER; luptrj = 0; ijb = index[lptrj]; while ( ijb != ib ) { /* Search for dest block -- blocks are not ordered! */ luptrj += index[lptrj+1]; lptrj += LB_DESCRIPTOR + index[lptrj+1]; ijb = index[lptrj]; } /* * Build indirect table. This is needed because the * indices are not sorted for the L blocks. */ fnz = FstBlockC( ib ); lptrj += LB_DESCRIPTOR; for (i = 0; i < index[lptrj-1]; ++i) { rel = index[lptrj + i] - fnz; indirect[rel] = i; } nzval = Lnzval_bc_ptr[ljb] + luptrj; tempv = tempv2d; for (jj = 0; jj < nsupc; ++jj) { segsize = klst - usub[iukp + jj]; if ( segsize ) { /*#pragma _CRI cache_bypass nzval,tempv*/ for (i = 0; i < nbrow; ++i) { rel = lsub[lptr + i] - fnz; nzval[indirect[rel]] -= tempv[i]; } tempv += ldt; } nzval += ldv; } } /* if ib < jb ... */ lptr += nbrow; luptr += nbrow; } /* for lb ... */ rukp += usub[iukp - 1]; /* Move to block U(k,j+1) */ iukp += nsupc; } /* for j ... */ } /* if k L(:,k) and U(k,:) are not empty */ } /* ------------------------------------------ END MAIN LOOP: for k = ... ------------------------------------------ */ #if ( VAMPIR>=1 ) VT_end(100); VT_traceoff(); #endif if ( Pr*Pc > 1 ) { SUPERLU_FREE(Lsub_buf_2[0]); /* also free Lsub_buf_2[1] */ SUPERLU_FREE(Lval_buf_2[0]); /* also free Lval_buf_2[1] */ if ( Llu->bufmax[2] != 0 ) SUPERLU_FREE(Usub_buf); if ( Llu->bufmax[3] != 0 ) SUPERLU_FREE(Uval_buf); SUPERLU_FREE(send_req); } SUPERLU_FREE(Llu->ujrow); SUPERLU_FREE(tempv2d); SUPERLU_FREE(indirect); SUPERLU_FREE(iuip); SUPERLU_FREE(ruip); /* Prepare error message. */ if ( *info == 0 ) *info = n + 1; #if ( PROFlevel>=1 ) TIC(t1); #endif MPI_Allreduce( info, &iinfo, 1, mpi_int_t, MPI_MIN, grid->comm ); #if ( PROFlevel>=1 ) TOC(t2, t1); stat->utime[COMM] += t2; { float msg_vol_max, msg_vol_sum, msg_cnt_max, msg_cnt_sum; MPI_Reduce( &msg_cnt, &msg_cnt_sum, 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); MPI_Reduce( &msg_cnt, &msg_cnt_max, 1, MPI_FLOAT, MPI_MAX, 0, grid->comm ); MPI_Reduce( &msg_vol, &msg_vol_sum, 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); MPI_Reduce( &msg_vol, &msg_vol_max, 1, MPI_FLOAT, MPI_MAX, 0, grid->comm ); if ( !iam ) { printf("\tPDGSTRF comm stat:" "\tAvg\tMax\t\tAvg\tMax\n" "\t\t\tCount:\t%.0f\t%.0f\tVol(MB)\t%.2f\t%.2f\n", msg_cnt_sum/Pr/Pc, msg_cnt_max, msg_vol_sum/Pr/Pc*1e-6, msg_vol_max*1e-6); } } #endif if ( iinfo == n + 1 ) *info = 0; else *info = iinfo; #if ( PRNTlevel==3 ) MPI_Allreduce( &zero_msg, &iinfo, 1, mpi_int_t, MPI_SUM, grid->comm ); if ( !iam ) printf(".. # msg of zero size\t%d\n", iinfo); MPI_Allreduce( &total_msg, &iinfo, 1, mpi_int_t, MPI_SUM, grid->comm ); if ( !iam ) printf(".. # total msg\t%d\n", iinfo); #endif #if ( DEBUGlevel>=2 ) for (i = 0; i < Pr * Pc; ++i) { if ( iam == i ) { dPrintLblocks(iam, nsupers, grid, Glu_persist, Llu); dPrintUblocks(iam, nsupers, grid, Glu_persist, Llu); printf("(%d)\n", iam); PrintInt10("Recv", nsupers, Llu->ToRecv); } MPI_Barrier( grid->comm ); } #endif #if ( DEBUGlevel>=3 ) printf("(%d) num_copy=%d, num_update=%d\n", iam, num_copy, num_update); #endif #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Exit pdgstrf()"); #endif } /* PDGSTRF */
Foam::label Foam::UIPstream::read ( const commsTypes commsType, const int fromProcNo, char* buf, const std::streamsize bufSize, const int tag ) { if (debug) { Pout<< "UIPstream::read : starting read from:" << fromProcNo << " tag:" << tag << " wanted size:" << label(bufSize) << " commsType:" << UPstream::commsTypeNames[commsType] << Foam::endl; } if (commsType == blocking || commsType == scheduled) { MPI_Status status; if ( MPI_Recv ( buf, bufSize, MPI_PACKED, procID(fromProcNo), tag, MPI_COMM_WORLD, &status ) ) { FatalErrorIn ( "UIPstream::read" "(const int fromProcNo, char* buf, std::streamsize bufSize)" ) << "MPI_Recv cannot receive incomming message" << Foam::abort(FatalError); return 0; } // Check size of message read label messageSize; MPI_Get_count(&status, MPI_BYTE, &messageSize); if (debug) { Pout<< "UIPstream::read : finished read from:" << fromProcNo << " tag:" << tag << " read size:" << label(bufSize) << " commsType:" << UPstream::commsTypeNames[commsType] << Foam::endl; } if (messageSize > bufSize) { FatalErrorIn ( "UIPstream::read" "(const int fromProcNo, char* buf, std::streamsize bufSize)" ) << "buffer (" << label(bufSize) << ") not large enough for incomming message (" << messageSize << ')' << Foam::abort(FatalError); } return messageSize; } else if (commsType == nonBlocking) { MPI_Request request; if ( MPI_Irecv ( buf, bufSize, MPI_PACKED, procID(fromProcNo), tag, MPI_COMM_WORLD, &request ) ) { FatalErrorIn ( "UIPstream::read" "(const int fromProcNo, char* buf, std::streamsize bufSize)" ) << "MPI_Recv cannot start non-blocking receive" << Foam::abort(FatalError); return 0; } if (debug) { Pout<< "UIPstream::read : started read from:" << fromProcNo << " tag:" << tag << " read size:" << label(bufSize) << " commsType:" << UPstream::commsTypeNames[commsType] << " request:" << PstreamGlobals::outstandingRequests_.size() << Foam::endl; } PstreamGlobals::outstandingRequests_.append(request); // Assume the message is completely received. return bufSize; } else { FatalErrorIn ( "UIPstream::read" "(const int fromProcNo, char* buf, std::streamsize bufSize)" ) << "Unsupported communications type " << commsType << Foam::abort(FatalError); return 0; } }
void peano::integration::dataqueries::CartesianGridWriterProxy::receiveQueryData( int dataTag, int source, tarch::plotter::griddata::regular::CartesianGridWriter::VertexDataWriter& vertexDataWriter, int recordsPerEntry ) { int flag = 0; MPI_Status status; while(!flag){ int result = MPI_Iprobe( source, dataTag, tarch::parallel::Node::getInstance().getCommunicator(), &flag, &status ); if (result!=MPI_SUCCESS) { logError("receiveQueryData()", "probing for messages on node "); } } int messages = 0; int datasets = 0; MPI_Get_count(&status, MPI_DOUBLE, &messages); assertionEquals( messages % 3, 0 ); datasets = messages/3; std::vector<double> positionList(messages); double* data = new double [datasets*recordsPerEntry]; MPI_Recv(&positionList[0], messages, MPI_DOUBLE, source, dataTag, tarch::parallel::Node::getInstance().getCommunicator(), &status); MPI_Recv( data, datasets*recordsPerEntry, MPI_DOUBLE, source, dataTag, tarch::parallel::Node::getInstance().getCommunicator(), &status); for (int i=0; i<datasets; i++) { if(recordsPerEntry==2){ tarch::la::Vector<2,double> data_local(0.0); for (int k=0; k<recordsPerEntry; k++) { data_local(k) = data[i*recordsPerEntry+k]; } tarch::la::Vector<3 ,double> v(0.0); v[0]=positionList[i*3]; v[1]=positionList[i*3+1]; v[2]=positionList[i*3+2]; vertexDataWriter.plotVertex( vertexDataWriter.getVertexIndex(v), data_local ); }else if(recordsPerEntry==3){ tarch::la::Vector<3,double> data_local(0.0); for (int k=0; k<recordsPerEntry; k++) { data_local(k) = data[i*recordsPerEntry+k]; } tarch::la::Vector<3 ,double> v(0.0); v[0]=positionList[i*3]; v[1]=positionList[i*3+1]; v[2]=positionList[i*3+2]; vertexDataWriter.plotVertex( vertexDataWriter.getVertexIndex(v), data_local ); }else{ double dataLocal=(0.0); for (int k=0; k<recordsPerEntry; k++) { dataLocal = data[i*recordsPerEntry+k]; } tarch::la::Vector<3 ,double> v(0.0); v[0]=positionList[i*3]; v[1]=positionList[i*3+1]; v[2]=positionList[i*3+2]; vertexDataWriter.plotVertex( vertexDataWriter.getVertexIndex(v), dataLocal ); } } delete[] data; }
Foam::UIPstream::UIPstream ( const commsTypes commsType, const int fromProcNo, DynamicList<char>& externalBuf, label& externalBufPosition, const int tag, const bool clearAtEnd, streamFormat format, versionNumber version ) : UPstream(commsType), Istream(format, version), fromProcNo_(fromProcNo), externalBuf_(externalBuf), externalBufPosition_(externalBufPosition), tag_(tag), clearAtEnd_(clearAtEnd), messageSize_(0) { setOpened(); setGood(); if (commsType == UPstream::nonBlocking) { // Message is already received into externalBuf } else { MPI_Status status; label wantedSize = externalBuf_.capacity(); if (debug) { Pout<< "UIPstream::UIPstream : read from:" << fromProcNo << " tag:" << tag << " wanted size:" << wantedSize << Foam::endl; } // If the buffer size is not specified, probe the incomming message // and set it if (!wantedSize) { MPI_Probe(procID(fromProcNo_), tag_, MPI_COMM_WORLD, &status); MPI_Get_count(&status, MPI_BYTE, &messageSize_); externalBuf_.setCapacity(messageSize_); wantedSize = messageSize_; if (debug) { Pout<< "UIPstream::UIPstream : probed size:" << wantedSize << Foam::endl; } } messageSize_ = UIPstream::read ( commsType, fromProcNo_, externalBuf_.begin(), wantedSize, tag_ ); // Set addressed size. Leave actual allocated memory intact. externalBuf_.setSize(messageSize_); if (!messageSize_) { FatalErrorIn ( "UIPstream::UIPstream(const commsTypes, const int, " "DynamicList<char>&, streamFormat, versionNumber)" ) << "read failed" << Foam::abort(FatalError); } } }
void KWayNodeRefine(CtrlType *ctrl, GraphType *graph, WorkSpaceType *wspace, int npasses, float ubfraction) { int i, ii, iii, j, k, pass, nvtxs, firstvtx, lastvtx, otherlastvtx, c, nmoves, nlupd, nsupd, nnbrs, nchanged, nsep; int npes = ctrl->npes, mype = ctrl->mype, nparts = ctrl->nparts; idxtype *xadj, *adjncy, *adjwgt, *vtxdist, *vwgt; idxtype *where, *lpwgts, *gpwgts, *sepind; idxtype *peind, *recvptr, *sendptr; idxtype *update, *supdate, *rupdate, *pe_updates, *htable, *changed; idxtype *badminpwgt, *badmaxpwgt; KeyValueType *swchanges, *rwchanges; int *nupds_pe; NRInfoType *rinfo, *myrinfo; int from, to, me, other, oldcut; IFSET(ctrl->dbglvl, DBG_TIME, starttimer(ctrl->KWayTmr)); nvtxs = graph->nvtxs; vtxdist = graph->vtxdist; xadj = graph->xadj; adjncy = graph->adjncy; adjwgt = graph->adjwgt; vwgt = graph->vwgt; firstvtx = vtxdist[mype]; lastvtx = vtxdist[mype+1]; where = graph->where; rinfo = graph->nrinfo; lpwgts = graph->lpwgts; gpwgts = graph->gpwgts; nsep = graph->nsep; sepind = graph->sepind; nnbrs = graph->nnbrs; peind = graph->peind; recvptr = graph->recvptr; sendptr = graph->sendptr; changed = idxmalloc(nvtxs, "KWayRefine: changed"); rwchanges = wspace->pairs; swchanges = rwchanges + recvptr[nnbrs]; update = idxmalloc(nvtxs, "KWayRefine: update"); supdate = wspace->indices; rupdate = supdate + recvptr[nnbrs]; nupds_pe = imalloc(npes, "KWayRefine: nupds_pe"); htable = idxsmalloc(nvtxs+graph->nrecv, 0, "KWayRefine: lhtable"); badminpwgt = wspace->pv1; badmaxpwgt = wspace->pv2; for (i=0; i<nparts; i+=2) { badminpwgt[i] = badminpwgt[i+1] = (1.0/ubfraction)*(gpwgts[i]+gpwgts[i+1])/2; badmaxpwgt[i] = badmaxpwgt[i+1] = ubfraction*(gpwgts[i]+gpwgts[i+1])/2; } //myprintf(ctrl, "%6d %6d %6d %6d %6d %6d %6d\n", lpwgts[0], lpwgts[1], lpwgts[2], gpwgts[0], gpwgts[1], gpwgts[2], badmaxpwgt[0]); IFSET(ctrl->dbglvl, DBG_REFINEINFO, PrintNodeBalanceInfo(ctrl, nparts, gpwgts, badminpwgt, badmaxpwgt, 1)); for (pass=0; pass<npasses; pass++) { oldcut = graph->mincut; for (c=0; c<2; c++) { for (i=0; i<nparts; i+=2) { badminpwgt[i] = badminpwgt[i+1] = (1.0/ubfraction)*(gpwgts[i]+gpwgts[i+1])/2; badmaxpwgt[i] = badmaxpwgt[i+1] = ubfraction*(gpwgts[i]+gpwgts[i+1])/2; } nlupd = nsupd = nmoves = nchanged = 0; for (ii=0; ii<nsep; ii++) { i = sepind[ii]; from = where[i]; ASSERT(ctrl, from >= nparts); /* Go through the loop to see if gain is possible for the separator vertex */ if (rinfo[i].edegrees[(c+1)%2] <= vwgt[i]) { /* It is a one-sded move so it will go to the other partition. Look at the comments in InitMultisection to understand the meaning of from%nparts */ to = from%nparts+c; if (gpwgts[to]+vwgt[i] > badmaxpwgt[to]) { /* printf("Skip because of weight! %d\n", vwgt[i]-rinfo[i].edegrees[(c+1)%2]); */ continue; /* We cannot move it there because it gets too heavy */ } /* Update the where information of the vertex you moved */ where[i] = to; /* Remove this vertex from the sepind. Note the trick for looking at the sepind[ii] again */ sepind[ii--] = sepind[--nsep]; /* myprintf(ctrl, "Vertex %d [%d %d] is moving to %d from %d [%d]\n", i+firstvtx, vwgt[i], rinfo[i].edegrees[(c+1)%2], to, from, where[i]); */ lpwgts[from] -= vwgt[i]; lpwgts[2*nparts-1] -= vwgt[i]; lpwgts[to] += vwgt[i]; gpwgts[to] += vwgt[i]; /* Put the vertices adjacent to i that belong to either the separator or the (c+1)%2 partition into the update array */ for (j=xadj[i]; j<xadj[i+1]; j++) { k = adjncy[j]; if (htable[k] == 0 && where[k] != to) { htable[k] = 1; if (k<nvtxs) update[nlupd++] = k; else supdate[nsupd++] = k; } } nmoves++; if (graph->pexadj[i+1]-graph->pexadj[i] > 0) changed[nchanged++] = i; } } /* myprintf(ctrl, "nmoves: %d, nlupd: %d, nsupd: %d\n", nmoves, nlupd, nsupd); */ /* Tell everybody interested what the new where[] info is for the interface vertices */ CommChangedInterfaceData(ctrl, graph, nchanged, changed, where, swchanges, rwchanges, wspace->pv4); IFSET(ctrl->dbglvl, DBG_RMOVEINFO, rprintf(ctrl, "\t[%d %d], [%d %d %d]\n", pass, c, GlobalSESum(ctrl, nmoves), GlobalSESum(ctrl, nsupd), GlobalSESum(ctrl, nlupd))); /*----------------------------------------------------------------------- / Time to communicate with processors to send the vertices whose degrees / need to be updated. /-----------------------------------------------------------------------*/ /* Issue the receives first */ for (i=0; i<nnbrs; i++) { MPI_Irecv((void *)(rupdate+sendptr[i]), sendptr[i+1]-sendptr[i], IDX_DATATYPE, peind[i], 1, ctrl->comm, ctrl->rreq+i); } /* Issue the sends next. This needs some preporcessing */ for (i=0; i<nsupd; i++) { htable[supdate[i]] = 0; supdate[i] = graph->imap[supdate[i]]; } iidxsort(nsupd, supdate); for (j=i=0; i<nnbrs; i++) { otherlastvtx = vtxdist[peind[i]+1]; for (k=j; k<nsupd && supdate[k] < otherlastvtx; k++); MPI_Isend((void *)(supdate+j), k-j, IDX_DATATYPE, peind[i], 1, ctrl->comm, ctrl->sreq+i); j = k; } /* OK, now get into the loop waiting for the send/recv operations to finish */ MPI_Waitall(nnbrs, ctrl->rreq, ctrl->statuses); for (i=0; i<nnbrs; i++) MPI_Get_count(ctrl->statuses+i, IDX_DATATYPE, nupds_pe+i); MPI_Waitall(nnbrs, ctrl->sreq, ctrl->statuses); /*------------------------------------------------------------- / Place the received to-be updated vertices into update[] /-------------------------------------------------------------*/ for (i=0; i<nnbrs; i++) { pe_updates = rupdate+sendptr[i]; for (j=0; j<nupds_pe[i]; j++) { k = pe_updates[j]; if (htable[k-firstvtx] == 0) { htable[k-firstvtx] = 1; update[nlupd++] = k-firstvtx; } } } /*------------------------------------------------------------- / Update the where information of the vertices that are pulled / into the separator. /-------------------------------------------------------------*/ nchanged = 0; for (ii=0; ii<nlupd; ii++) { i = update[ii]; me = where[i]; if (me < nparts && me%2 == (c+1)%2) { /* This vertex is pulled into the separator */ lpwgts[me] -= vwgt[i]; where[i] = nparts+me-(me%2); sepind[nsep++] = i; /* Put the vertex into the sepind array */ if (graph->pexadj[i+1]-graph->pexadj[i] > 0) changed[nchanged++] = i; lpwgts[where[i]] += vwgt[i]; lpwgts[2*nparts-1] += vwgt[i]; /* myprintf(ctrl, "Vertex %d moves into the separator from %d to %d\n", i+firstvtx, me, where[i]); */ } } /* Tell everybody interested what the new where[] info is for the interface vertices */ CommChangedInterfaceData(ctrl, graph, nchanged, changed, where, swchanges, rwchanges, wspace->pv4); /*------------------------------------------------------------- / Update the rinfo of the vertices in the update[] array /-------------------------------------------------------------*/ for (ii=0; ii<nlupd; ii++) { i = update[ii]; ASSERT(ctrl, htable[i] == 1); htable[i] = 0; me = where[i]; if (me >= nparts) { /* If it is a separator vertex */ /* myprintf(ctrl, "Updating %d %d\n", i+firstvtx, me); */ myrinfo = rinfo+i; myrinfo->edegrees[0] = myrinfo->edegrees[1] = 0; for (j=xadj[i]; j<xadj[i+1]; j++) { other = where[adjncy[j]]; if (me != other) myrinfo->edegrees[other%2] += vwgt[adjncy[j]]; } } } /* Finally, sum-up the partition weights */ MPI_Allreduce((void *)lpwgts, (void *)gpwgts, 2*nparts, IDX_DATATYPE, MPI_SUM, ctrl->comm); graph->mincut = gpwgts[2*nparts-1]; IFSET(ctrl->dbglvl, DBG_REFINEINFO, PrintNodeBalanceInfo(ctrl, nparts, gpwgts, badminpwgt, badmaxpwgt, 0)); } if (graph->mincut == oldcut) break; } GKfree((void **)&update, &nupds_pe, &htable, &changed, LTERM); IFSET(ctrl->dbglvl, DBG_TIME, stoptimer(ctrl->KWayTmr)); }
void _HandlePrintRequests(int quiet_mode) { int outbufflen; char* outbuffer; char* fmtptr; int inbufflen; char* inbuffer; char* inbuffer2; FILE* outfile; int stop; MPI_Status status; int msgtag; int source; int size; int response = 1; int linenum; outbufflen = _PRINTFLEN*sizeof(char); outbuffer = (char *)_zmalloc(outbufflen,"mpi print buffer"); inbufflen = outbufflen; inbuffer = (char *)_zmalloc(inbufflen,"mpi in buffer"); inbuffer2 = (char *)_zmalloc(inbufflen,"mpi in buffer2"); stop = 0; while (!stop) { MPI_Probe(MPI_ANY_SOURCE,MPI_ANY_TAG,MPI_COMM_WORLD,&status); msgtag = status.MPI_TAG; source = status.MPI_SOURCE; size = MPI_Get_count(&status,MPI_BYTE,&size); switch (msgtag) { case _PRINTTAG: case _ERRPRINTTAG: MPI_Recv(outbuffer,outbufflen,MPI_CHAR,source,msgtag,MPI_COMM_WORLD, &status); if (msgtag == _PRINTTAG) { outfile = stdout; } else { outfile = stderr; } fprintf(outfile,"%s",outbuffer); fflush(outfile); MPI_Send(&response,1,MPI_INT,source,msgtag,MPI_COMM_WORLD); break; case _WRITETAG: case _ERRWRITETAG: if (msgtag == _WRITETAG) { outfile = stdout; } else { outfile = stderr; } if (size > outbufflen) { outbufflen = size; outbuffer = (char *)_zrealloc(outbuffer,outbufflen, ""); } MPI_Recv(outbuffer,outbufflen,MPI_BYTE,source,msgtag,MPI_COMM_WORLD, &status); fwrite(outbuffer,size,1,outfile); fflush(outfile); MPI_Send(&response,1,MPI_INT,source,msgtag,MPI_COMM_WORLD); break; case _SCANTAG: { int elemsize; MPI_Recv(outbuffer,outbufflen,MPI_BYTE,source,msgtag,MPI_COMM_WORLD, &status); elemsize = *(int*)outbuffer; fmtptr = (char*)(((int*)outbuffer)+1); outbuffer[size] = '\0'; /* printf("Console is scanning %d bytes using %s\n",elemsize,outbuffer);*/ scanf(fmtptr,inbuffer); /* printf("Got %d\n",*(int *)inbuffer);*/ MPI_Send(inbuffer,elemsize,MPI_BYTE,source,msgtag,MPI_COMM_WORLD); } break; case _SCAN2TAG: { int elemsize; int elemsize2; elemsize = *(int*)outbuffer; elemsize2 = *(((int*)outbuffer)+1); fmtptr = (char*)(((int*)outbuffer)+2); outbuffer[size] = '\0'; /* printf("Console is scanning %d bytes using %s\n",elemsize,outbuffer);*/ scanf(fmtptr,inbuffer,inbuffer2); /* printf("Got %d\n",*(int *)inbuffer);*/ memcpy(inbuffer+elemsize,inbuffer2,elemsize2); MPI_Send(inbuffer,elemsize+elemsize2,MPI_BYTE,source,msgtag, MPI_COMM_WORLD); } break; case _READTAG: MPI_Recv(&size,1,MPI_INT,source,msgtag,MPI_COMM_WORLD,&status); if (size > inbufflen) { inbufflen = size; inbuffer = (char *)_zrealloc(inbuffer,inbufflen, ""); } fread(inbuffer,size,1,stdin); MPI_Send(inbuffer,size,MPI_BYTE,source,msgtag,MPI_COMM_WORLD); break; case _DEADTAG: MPI_Recv(&source,1,MPI_INT,source,msgtag,MPI_COMM_WORLD,&status); fprintf(stderr,"Task %d died -- killing all others\n",source); _ZPL_halt(-1); stop=1; break; case _HALTTAG: MPI_Recv(&linenum,1,MPI_INT,source,msgtag,MPI_COMM_WORLD,&status); if ((quiet_mode <= 0) && (linenum != 0)) { fprintf(stderr,"halt at line %d reached\n",linenum); } _ZPL_halt(-1); stop=1; break; case _DONETAG: MPI_Recv(&response,1,MPI_INT,source,msgtag,MPI_COMM_WORLD,&status); MPI_Send(&response,1,MPI_INT,source,msgtag,MPI_COMM_WORLD); _PROCESSORS--; if (_PROCESSORS == 0) { stop = 1; } break; default: fprintf(stderr,"Unknown message type received by console: %d\n",msgtag); break; } } MPI_Barrier(MPI_COMM_WORLD); if (quiet_mode <= 0) { printf("All tasks terminated -- exiting\n"); } MPI_Finalize(); exit(0); }