void DiamondTop::freeRequests() { if(this->bufferMode == 1) MPI_Request_free(&reqs[0]); else if(this->bufferMode == 2) { MPI_Request_free(&reqs[0]); MPI_Request_free(&reqs[1]); } else for(int i=0;i<levels;i++) MPI_Request_free(&reqs[i]); //printf("All MPI_Requests Were Cleared!!\n"); }
int md_wrap_request_free(MPI_Request *request) /******************************************************************************* Machine dependent wrapped request object deletion routine. Author: Michael A. Heroux, SNL, 9214 ======= Return code: int ============ Parameter list: =============== request: Pointer to an existing request object that will be freed. *******************************************************************************/ { int err = 0; if (request != NULL) err = MPI_Request_free(request); return err; } /* md_wrap_request_free */
JNIEXPORT jlong JNICALL Java_mpi_Request_free( JNIEnv *env, jobject jthis, jlong handle) { MPI_Request req = (MPI_Request)handle; int rc = MPI_Request_free(&req); ompi_java_exceptionCheck(env, rc); return (jlong)req; }
~Request() { #ifdef HAVE_MPI // explicitly free this request if not // done so already, otherwise this would // be a memory leak! if (_request != MPI_REQUEST_NULL) MPI_Request_free(&_request); #endif }
int main( int argc, char *argv[] ) { MPI_Status status; MPI_Request request; int a[10], b[10]; int buf[BUFSIZE], *bptr, bl, i, j, rank, size; int errs = 0; MTest_Init( 0, 0 ); MPI_Comm_rank( MPI_COMM_WORLD, &rank ); MPI_Comm_size( MPI_COMM_WORLD, &size ); MPI_Buffer_attach( buf, BUFSIZE ); for (j=0; j<10; j++) { MPI_Bsend_init( a, 10, MPI_INT, 0, 27+j, MPI_COMM_WORLD, &request ); for (i=0; i<10; i++) { a[i] = (rank + 10 * j) * size + i; } MPI_Start( &request ); MPI_Wait( &request, &status ); MPI_Request_free( &request ); } if (rank == 0) { for (i=0; i<size; i++) { for (j=0; j<10; j++) { int k; status.MPI_TAG = -10; status.MPI_SOURCE = -20; MPI_Recv( b, 10, MPI_INT, i, 27+j, MPI_COMM_WORLD, &status ); if (status.MPI_TAG != 27+j) { errs++; printf( "Wrong tag = %d\n", status.MPI_TAG ); } if (status.MPI_SOURCE != i) { errs++; printf( "Wrong source = %d\n", status.MPI_SOURCE ); } for (k=0; k<10; k++) { if (b[k] != (i + 10 * j) * size + k) { errs++; printf( "received b[%d] = %d from %d tag %d\n", k, b[k], i, 27+j ); } } } } } MPI_Buffer_detach( &bptr, &bl ); MTest_Finalize( errs ); MPI_Finalize(); return 0; }
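The MTest program above drives MPI_Bsend_init against a preallocated attach buffer (BUFSIZE is defined elsewhere in that test). As a hedged aside, a minimal sketch of the usual attach-buffer sizing for one buffered 10-int message, using the standard MPI_BSEND_OVERHEAD constant (names here are illustrative, not taken from the test):

/* Sketch: size the attach buffer for a single buffered 10-int message,
 * adding the per-message overhead required by the MPI standard. */
#include <mpi.h>
#include <stdlib.h>
int main(int argc, char **argv) {
    MPI_Init(&argc, &argv);
    int bsend_size = 10 * (int) sizeof(int) + MPI_BSEND_OVERHEAD;
    char *bsend_buf = (char *) malloc(bsend_size);
    MPI_Buffer_attach(bsend_buf, bsend_size);
    /* ... MPI_Bsend / MPI_Bsend_init traffic would go here ... */
    MPI_Buffer_detach(&bsend_buf, &bsend_size); /* returns once buffered sends have drained */
    free(bsend_buf);
    MPI_Finalize();
    return 0;
}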
void amps_FreePackage(amps_Package package) { int i; if (package) { if (package->commited) { for (i = 0; i < package->num_recv; i++) { if (package->recv_invoices[i]->mpi_type != MPI_DATATYPE_NULL) { MPI_Type_free(&(package->recv_invoices[i]->mpi_type)); } MPI_Request_free(&package->recv_requests[i]); } for (i = 0; i < package->num_send; i++) { if (package->send_invoices[i]->mpi_type != MPI_DATATYPE_NULL) { MPI_Type_free(&package->send_invoices[i]->mpi_type); } MPI_Request_free(&package->send_requests[i]); } if (package->num_send + package->num_recv) { free(package->recv_requests); free(package->status); } package->commited = FALSE; } free(package); } }
/* * This example causes the IBM SP2 MPI version to generate the message * ERROR: 0032-158 Persistent request already active (2) in MPI_Startall, task 0 * in the SECOND set of MPI_Startall (after the MPI_Request_free). */ int main( int argc, char **argv ) { MPI_Request r[4]; MPI_Status statuses[4]; double sbuf1[10], sbuf2[10]; double rbuf1[10], rbuf2[10]; int size, rank, up_nbr, down_nbr, i; MPI_Init( &argc, &argv ); MPI_Comm_size( MPI_COMM_WORLD, &size ); MPI_Comm_rank( MPI_COMM_WORLD, &rank ); up_nbr = (rank + 1) % size; down_nbr = (size + rank - 1) % size; MPI_Recv_init( rbuf1, 10, MPI_DOUBLE, down_nbr, 0, MPI_COMM_WORLD, &r[0] ); MPI_Recv_init( rbuf2, 10, MPI_DOUBLE, up_nbr, 1, MPI_COMM_WORLD, &r[1] ); MPI_Send_init( sbuf1, 10, MPI_DOUBLE, up_nbr, 0, MPI_COMM_WORLD, &r[2] ); MPI_Send_init( sbuf2, 10, MPI_DOUBLE, down_nbr, 1, MPI_COMM_WORLD, &r[3] ); MPI_Startall( 4, r ); MPI_Waitall( 4, r, statuses ); for (i=0; i<4; i++) { MPI_Request_free( &r[i] ); } MPI_Recv_init( rbuf1, 10, MPI_DOUBLE, down_nbr, 0, MPI_COMM_WORLD, &r[0] ); MPI_Recv_init( rbuf2, 10, MPI_DOUBLE, up_nbr, 1, MPI_COMM_WORLD, &r[1] ); MPI_Send_init( sbuf1, 10, MPI_DOUBLE, up_nbr, 0, MPI_COMM_WORLD, &r[2] ); MPI_Send_init( sbuf2, 10, MPI_DOUBLE, down_nbr, 1, MPI_COMM_WORLD, &r[3] ); MPI_Startall( 4, r ); MPI_Waitall( 4, r, statuses ); for (i=0; i<4; i++) { MPI_Request_free( &r[i] ); } if (rank == 0) printf( "No errors\n" ); MPI_Finalize(); return 0; }
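The test above re-creates and restarts its persistent requests after freeing them, which is what tripped the IBM SP2 error quoted in the comment. For reference, a minimal sketch of the persistent-request lifecycle it relies on: create once, start and complete repeatedly, and free only while the request is inactive (the ring pattern and names below are illustrative, not taken from the test):

#include <mpi.h>
int main(int argc, char **argv) {
    double sbuf[10] = {0}, rbuf[10];
    int rank, size;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Request sreq, rreq;
    /* Create the persistent operations once. */
    MPI_Send_init(sbuf, 10, MPI_DOUBLE, (rank + 1) % size, 0, MPI_COMM_WORLD, &sreq);
    MPI_Recv_init(rbuf, 10, MPI_DOUBLE, (rank + size - 1) % size, 0, MPI_COMM_WORLD, &rreq);
    for (int iter = 0; iter < 2; iter++) {
        MPI_Start(&rreq);                    /* activate them... */
        MPI_Start(&sreq);
        MPI_Wait(&sreq, MPI_STATUS_IGNORE);  /* ...they become inactive again... */
        MPI_Wait(&rreq, MPI_STATUS_IGNORE);
    }
    MPI_Request_free(&sreq);                 /* ...and free them only while inactive. */
    MPI_Request_free(&rreq);
    MPI_Finalize();
    return 0;
}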
void mpi_request_free_f(MPI_Fint *request, MPI_Fint *ierr) { int err; MPI_Request c_req = MPI_Request_f2c( *request ); err = MPI_Request_free(&c_req); *ierr = OMPI_INT_2_FINT(err); if (MPI_SUCCESS == err) { *request = OMPI_INT_2_FINT(MPI_REQUEST_NULL->req_f_to_c_index); } }
// Register a wakeup callback for the communication thread void flow_t::post_wakeup(line_details_t& line, const wakeup_block_t b) { #if PENTAGO_MPI_FUNNEL requests.add_immediate(curry(&flow_t::wakeup,this,&line,b)); #else static_assert(sizeof(line_details_t*)==sizeof(long long int),""); // Send a pointer to ourselves to the communication thread MPI_Request request; CHECK(MPI_Isend((void*)&line.self,1,MPI_LONG_LONG_INT,0,wakeup_tag(b),comms.wakeup_comm,&request)); // Since requests_t::free is not thread safe, we're forced to use MPI_Request_free here. // This is bad, because http://blogs.cisco.com/performance/mpi_request_free-is-evil. CHECK(MPI_Request_free(&request)); #endif }
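The comment above points at the usual objection to freeing an active isend: once the handle is gone there is no way to learn when the send completed, so the buffer's lifetime becomes a guess. A minimal sketch of the conventional alternative, retaining the request and completing it explicitly with plain MPI calls (not pentago's wrappers; the payload and tag are illustrative):

#include <mpi.h>
int main(int argc, char **argv) {
    MPI_Init(&argc, &argv);
    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    if (rank == 0) {
        long long payload = 42;              /* stand-in for the pointer sent above */
        long long incoming;
        MPI_Request sreq, rreq;
        MPI_Irecv(&incoming, 1, MPI_LONG_LONG_INT, 0, 17, MPI_COMM_WORLD, &rreq);
        MPI_Isend(&payload, 1, MPI_LONG_LONG_INT, 0, 17, MPI_COMM_WORLD, &sreq);
        MPI_Wait(&sreq, MPI_STATUS_IGNORE);  /* completion is now observable...      */
        MPI_Wait(&rreq, MPI_STATUS_IGNORE);  /* ...so the buffer may be safely reused */
    }
    MPI_Finalize();
    return 0;
}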
void _amps_wait_exchange(amps_Handle handle) { int notdone; int i; MPI_Status *status; if(handle -> package -> num_recv + handle -> package -> num_send) { status = (MPI_Status*)calloc((handle -> package -> num_recv + handle -> package -> num_send), sizeof(MPI_Status)); MPI_Waitall(handle -> package -> num_recv + handle -> package -> num_send, handle -> package -> requests, status); free(status); for(i = 0; i < handle -> package -> num_recv; i++) { if( handle -> package -> recv_invoices[i] -> mpi_type != MPI_DATATYPE_NULL ) { MPI_Type_free(&(handle -> package -> recv_invoices[i] -> mpi_type)); } MPI_Request_free(&handle -> package -> requests[i]); } for(i = 0; i < handle -> package -> num_send; i++) { if( handle -> package -> send_invoices[i] -> mpi_type != MPI_DATATYPE_NULL ) { MPI_Type_free(&handle -> package -> send_invoices[i] -> mpi_type); } MPI_Request_free(&handle -> package -> requests[handle -> package -> num_recv + i]); } } }
void thorium_mpi1_request_destroy(struct thorium_mpi1_request *self) { if (self->request != MPI_REQUEST_NULL) { MPI_Request_free(&self->request); self->request = MPI_REQUEST_NULL; } self->source = -1; self->tag = -1; self->buffer = NULL; self->worker = -1; self->count = -1; }
/*! Cancels the receive associated to the specified rank. \param rank is the rank associated to the receive to cancel */ void DataCommunicator::cancelRecv(int rank) { if (m_recvIds.count(rank) == 0) { return; } int id = m_recvIds[rank]; if (m_recvRequests[id] == MPI_REQUEST_NULL) { return; } MPI_Cancel(&m_recvRequests[id]); MPI_Request_free(&m_recvRequests[id]); }
void memheap_oob_destruct(void) { int i; oob_comm_request_t *r; opal_progress_unregister(oshmem_mkey_recv_cb); for (i = 0; i < MEMHEAP_RECV_REQS_MAX; i++) { r = &memheap_oob.req_pool[i]; MPI_Cancel(&r->recv_req); MPI_Request_free(&r->recv_req); } OBJ_DESTRUCT(&memheap_oob.req_list); OBJ_DESTRUCT(&memheap_oob.lck); OBJ_DESTRUCT(&memheap_oob.cond); }
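The two snippets above cancel a posted receive and then hand the request straight to MPI_Request_free. As a hedged reference, the other completion path the MPI standard allows after a cancel is to wait on the request and query whether the cancellation actually took effect; a minimal sketch (illustrative names and tag):

#include <mpi.h>
#include <stdio.h>
int main(int argc, char **argv) {
    MPI_Init(&argc, &argv);
    int payload;
    MPI_Request req;
    /* Post a receive that will never be matched, then cancel it. */
    MPI_Irecv(&payload, 1, MPI_INT, MPI_ANY_SOURCE, 99, MPI_COMM_WORLD, &req);
    MPI_Cancel(&req);
    MPI_Status status;
    MPI_Wait(&req, &status);                 /* completes the canceled request */
    int cancelled;
    MPI_Test_cancelled(&status, &cancelled);
    printf("receive cancelled: %d\n", cancelled);
    MPI_Finalize();
    return 0;
}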
int SAMRAI_MPI::Request_free( Request* request) { #ifndef HAVE_MPI NULL_USE(request); #endif int rval = MPI_SUCCESS; if (!s_mpi_is_initialized) { TBOX_ERROR("SAMRAI_MPI::Request_free is a no-op without run-time MPI!"); } #ifdef HAVE_MPI else { rval = MPI_Request_free(request); } #endif return rval; }
EXPORT_MPI_API void FORTRAN_API mpi_request_free_( MPI_Fint *request, MPI_Fint *__ierr ) { MPI_Request lrequest = MPI_Request_f2c(*request); *__ierr = MPI_Request_free( &lrequest ); #ifdef OLD_POINTER /* We actually need to remove the pointer from the mapping if the ref count is zero. We do that by checking to see if lrequest was set to NULL. */ if (!lrequest) { MPIR_RmPointer( *(int*)request ); *(int*)request = 0; } #endif *request = MPI_Request_c2f(lrequest); }
/** * @brief Responds with no_work to pending work requests. * * Answers any pending work requests in case a rank is blocking, * waiting for a response. */ static void CIRCLE_cleanup_mpi_messages(CIRCLE_state_st* sptr) { int i = 0; int j = 0; /* TODO: this is O(N^2)... need a better way at large scale */ /* Make sure that all pending work requests are answered. */ for(j = 0; j < sptr->size; j++) { for(i = 0; i < sptr->size; i++) { if(i != sptr->rank) { sptr->request_flag[i] = 0; if(MPI_Test(&sptr->mpi_state_st->request_request[i], \ &sptr->request_flag[i], \ &sptr->mpi_state_st->request_status[i]) \ != MPI_SUCCESS) { MPI_Abort(*sptr->mpi_state_st->work_comm, \ LIBCIRCLE_MPI_ERROR); } if(sptr->request_flag[i]) { MPI_Start(&sptr->mpi_state_st->request_request[i]); CIRCLE_send_no_work(i); } } } } /* free off persistent requests */ for(i = 0; i < sptr->size; i++) { if(i != sptr->rank) { MPI_Request_free(&sptr->mpi_state_st->request_request[i]); } } return; }
int main (int argc,char **argv) { MPI_Status status; int rank, size; struct { int value; int rank; } num, max, rcvd; MPI_Init(&argc,&argv); MPI_Comm_rank (MPI_COMM_WORLD,&rank); MPI_Comm_size (MPI_COMM_WORLD,&size); char *tracefile = getenv("TVTRACE"); if( tracefile != NULL ){ printf( "tv tracefile=%s\n", tracefile ); MPI_Pcontrol(TRACEFILES, NULL, tracefile, 0); } else{ MPI_Pcontrol(TRACEFILES, NULL, "trace", 0); } MPI_Pcontrol(TRACELEVEL, 1, 1, 1); MPI_Pcontrol(TRACENODE, 1000000, 1, 1); num.value = my_random(rank); num.rank = rank; printf("Node %d: value = %d\n", num.rank, num.value); double sTime, eTime; sTime = MPI_Wtime(); MPI_Pcontrol(TRACEEVENT, "entry", 2, 0, ""); MPI_Reduce(&num, &max, 1, MPI_2INT, MPI_MAXLOC, 0, MPI_COMM_WORLD); MPI_Pcontrol(TRACEEVENT, "exit", 2, 0, ""); eTime = MPI_Wtime(); MPI_Barrier( MPI_COMM_WORLD ); MPI_Pcontrol(TRACEEVENT, "entry", 1, 0, ""); if (rank == 0) { print_result("MPI_Reduce", max.rank, max.value, eTime - sTime); sTime = MPI_Wtime(); max.value = num.value; max.rank = num.rank; int i; for(i = 1; i < size; i++) { MPI_Recv(&rcvd, 1, MPI_2INT, i, TAG, MPI_COMM_WORLD, &status); if (rcvd.value > max.value) { max.value = rcvd.value; max.rank = rcvd.rank; } } eTime = MPI_Wtime(); print_result("Send-receive", max.rank, max.value, eTime - sTime); } else { MPI_Ssend(&num, 1, MPI_2INT, 0, TAG, MPI_COMM_WORLD); } MPI_Pcontrol(TRACEEVENT, "exit", 1, 0, ""); #if 0 if( !rank ){ double *a,*b,*c, *c0; int i,i1,j,k; int ann; MPI_Status *st; MPI_Request *rq,rq1; rq = (MPI_Request*) malloc( (size-1)*sizeof(MPI_Request) ); st = (MPI_Status*) malloc( (size-1)*sizeof(MPI_Status) ); ann=an/size+((an%size)?1:0); // printf("[%d]ann=%d\n", rank, ann ); a=(double*) malloc(am*an*sizeof(double)); b=(double*) malloc(am*bm*sizeof(double)); c=(double*) malloc(an*bm*sizeof(double)); for(i=0;i<am*an;i++) a[i]=rand()%301; for(i=0;i<am*bm;i++) b[i]=rand()%251; printf( "Data ready [%d]\n", rank ); c0 = (double*)malloc(an*bm*sizeof(double)); time = MPI_Wtime(); for (i=0; i<an; i++) for (j=0; j<bm; j++) { double s = 0.0; for (k=0; k<am; k++) s+= a[i*am+k]*b[k*bm+j]; c0[i*bm+j] = s; } time = MPI_Wtime() - time; printf("Time seq[%d] = %lf\n", rank, time ); time_seq = time; MPI_Barrier( MPI_COMM_WORLD ); time=MPI_Wtime(); MPI_Bcast( b, am*bm, MPI_DOUBLE, 0, MPI_COMM_WORLD); printf( "Data Bcast [%d]\n", rank ); for( i1=0, j=1; j<size; j++, i1+=ann*am ){ printf( "Data to Send [%d] %016x[%4d] =>> %d\n", rank, a+i1, i1, j ); MPI_Isend( a+i1, ann*am, MPI_DOUBLE, j, 101, MPI_COMM_WORLD, &rq1 ); MPI_Request_free( &rq1 ); printf( "Data Send [%d] =>> %d\n", rank, j ); } printf( "Data Send [%d]\n", rank ); MPI_Isend( a+i1, 1, MPI_DOUBLE, 0, 101, MPI_COMM_WORLD, &rq1 ); MPI_Request_free( &rq1 ); printf( "Data Send [%d] =>> %d\n", rank, j ); for(i=(i1/am);i<an;i++) for(j=0;j<bm;j++){ double s=0.0; for(k=0;k<am;k++) s+=a[i*am+k]*b[k*bm+j]; c[i*bm+j]=s; } printf( "Job done [%d]\n", rank ); for( i1=0, j=1; j<size; j++, i1+=(ann*bm) ){ printf( "Data to Recv [%d] %016x[%4d] =>> %d\n", rank, c+i1, i1/bm, j ); MPI_Irecv( c+i1, ann*am, MPI_DOUBLE, j, 102, MPI_COMM_WORLD, rq+(j-1) ); } MPI_Waitall( size-1, rq, st ); time=MPI_Wtime()-time; printf("time [%d]=%12.8lf\n",rank,time); time_par = time; printf( "Data collected [%d]\n", rank ); time=MPI_Wtime(); int ok = 1; for(i=0;i<an*bm;i++) if( c[i] != c0[i] ){ ok = 0; printf( "Fail [%d %d] %lf != %lf\n", i/bm, i%bm, c[i], c0[i] ); break; } time=MPI_Wtime()-time; if( ok ){ printf( "Data verifeid [%d] time = %lf\n", rank, time ); printf( "SpeedUp S(%d) = 
%14.10lf\n", size, time_seq/time_par ); printf( "Efitncy E(%d) = %14.10lf\n", size, time_seq/(time_par*size) ); } } else { int ann; double *a,*b,*c; MPI_Status st; int i,j,k; MPI_Pcontrol(TRACEEVENT, "entry", 0, 0, ""); ann= an/size + ((an%size)?1:0); // if(rank==1) // printf("[%d]ann=%d = %d / %d \n", rank, ann, an, size ); a=(double*)malloc(ann*am*sizeof(double)); b=(double*)malloc(bm*am*sizeof(double)); c=(double*)malloc(ann*bm*sizeof(double)); printf( "Mem allocated [%d]\n", rank ); MPI_Barrier( MPI_COMM_WORLD ); MPI_Pcontrol(TRACEEVENT, "exit", 0, 0, ""); time = MPI_Wtime(); MPI_Pcontrol(TRACEEVENT, "entry", 1, 0, ""); MPI_Bcast(b,am*bm,MPI_DOUBLE,0,MPI_COMM_WORLD); printf( "Data Bcast [%d]\n", rank ); MPI_Recv( a, ann*am, MPI_DOUBLE, 0, 101, MPI_COMM_WORLD, &st); printf( "Data Recv [%d]\n", rank ); MPI_Pcontrol(TRACEEVENT, "exit", 1, 0, ""); MPI_Pcontrol(TRACEEVENT, "entry", 2, 0, ""); for( i=0; i<ann; i++ ) for(j=0;j<bm;j++){ double s=0.0; for( k=0; k<am; k++ ){ s+=a[i*am+k]*b[k*bm+j]; } /* if(1==rank){ if(0==j){ printf( "c[%d<%d %d] = %lf\n", i,ann,j, s ); } } */ c[i*bm+j]=s; } printf( "Job done [%d]\n", rank ); MPI_Pcontrol(TRACEEVENT, "exit", 2, 0, ""); MPI_Pcontrol(TRACEEVENT, "entry", 3, 0, ""); MPI_Send( c, ann*bm, MPI_DOUBLE, 0, 102, MPI_COMM_WORLD); printf( "Data returned [%d]\n", rank ); MPI_Pcontrol(TRACEEVENT, "exit", 3, 0, ""); time=MPI_Wtime()-time; printf("time [%d]=%12.8lf\n",rank,time); } #endif MPI_Finalize(); return 0; }
int main( int argc, char **argv ) { int rank, size, loop, max_loop = DEFAULT_LOOP, max_req = DEFAULT_REQ; int buf_len = DEFAULT_LEN; int i, j, errs = 0, toterrs; MPI_Request r; MPI_Status status; int *(b[MAX_REQ]); MPI_Datatype dtype; int sendrank = 0, recvrank = 1; MPI_Init( &argc, &argv ); dtype = MPI_INT; MPI_Comm_rank( MPI_COMM_WORLD, &rank ); /* The following test allows this test to run on small-memory systems that support the sysconf call interface. This test keeps the test from becoming swap-bound. For example, on an old Linux system or a Sony Playstation 2 (really!) */ #if defined(HAVE_SYSCONF) && defined(_SC_PHYS_PAGES) && defined(_SC_PAGESIZE) if (rank == sendrank) { long n_pages, pagesize; int msglen_max = max_req * buf_len * sizeof(int); n_pages = sysconf( _SC_PHYS_PAGES ); pagesize = sysconf( _SC_PAGESIZE ); /* printf( "Total mem = %ld\n", n_pages * pagesize ); */ /* We want to avoid integer overflow in the size calculation. The best way is to avoid computing any products (such as total memory = n_pages * pagesize) and instead compute a msglen_max that fits within 1/4 of the available pages */ if (n_pages > 0 && pagesize > 0) { /* Recompute msglen_max */ int msgpages = 4 * ((msglen_max + pagesize - 1)/ pagesize); while (n_pages < msgpages) { msglen_max /= 2; msgpages /= 2; buf_len /= 2; } } } #else /* printf( "No sysconf\n" ); */ #endif /* Check command line args (allow usage even with one processor */ argv++; argc--; while (argc--) { if (strcmp( "-loop" , *argv ) == 0) { argv++; argc--; max_loop = atoi( *argv ); } else if (strcmp( "-req", *argv ) == 0) { argv++; argc--; max_req = atoi( *argv ); } else if (strcmp( "-len", *argv ) == 0) { argv++; argc--; buf_len = atoi( *argv ); } else { fprintf( stdout, "Usage: reqfree [ -loop n ] [ -req n ] [ -len n ]\n" ); fflush(stdout); fprintf( stderr, "[%i] Aborting\n",rank );fflush(stderr); MPI_Abort( MPI_COMM_WORLD, 1 ); } argv++; } MPI_Comm_size( MPI_COMM_WORLD, &size ); /* if (size != 2) { fprintf( stderr, "This program requires two processes\n" ); MPI_Abort( MPI_COMM_WORLD, 1 ); }*/ /* Assume only processor 0 has the command line */ MPI_Bcast( &max_loop, 1, MPI_INT, 0, MPI_COMM_WORLD ); MPI_Bcast( &max_req, 1, MPI_INT, 0, MPI_COMM_WORLD ); MPI_Bcast( &buf_len, 1, MPI_INT, 0, MPI_COMM_WORLD ); if (rank <= 1) { /* Allocate buffers */ for (i=0; i<max_req; i++) { b[i] = (int *) malloc(buf_len * sizeof(int) ); if (!b[i]) { fprintf( stderr, "Could not allocate %dth block of %d ints\n", i, buf_len ); fprintf( stderr, "[%i] Aborting\n",rank );fflush(stderr); MPI_Abort( MPI_COMM_WORLD, 2 ); } if (rank != sendrank) break; for (j=0; j<buf_len; j++) { b[i][j] = i * buf_len + j; } } /* Loop several times to capture resource leaks */ for (loop=0; loop<max_loop; loop++) { if (rank == sendrank) { for (i=0; i<max_req; i++) { MPI_Isend( b[i], buf_len, dtype, recvrank, 0, MPI_COMM_WORLD, &r ); MPI_Request_free( &r ); } MPI_Barrier( MPI_COMM_WORLD ); MPI_Barrier( MPI_COMM_WORLD ); } else { MPI_Barrier( MPI_COMM_WORLD ); for (i=0; i<max_req; i++) { MPI_Recv( b[0], buf_len, dtype, sendrank, 0, MPI_COMM_WORLD, &status ); for (j=0; j<buf_len; j++) { if (b[0][j] != i * buf_len + j) { errs++; fprintf( stdout, "at %d in %dth message, got %d expected %d\n", j, i, b[0][j], i * buf_len + j ); break; } } } MPI_Barrier( MPI_COMM_WORLD ); } }//loop } else { /* more than two processes just do common barrier*/ for (loop=0; loop<max_loop; loop++) { MPI_Barrier( MPI_COMM_WORLD ); MPI_Barrier( MPI_COMM_WORLD ); } } MPI_Allreduce( &errs, &toterrs, 1, MPI_INT, MPI_SUM, 
MPI_COMM_WORLD ); if (rank == 0) { if (toterrs == 0) printf( " No Errors\n" ); else printf( "Found %d errors\n", toterrs ); } MPI_Finalize( ); return 0; }
int main(int argc, char *argv[]) { unsigned char str[154]; unsigned int arr[] = {9,2,5,8,4,2,4,1,6,9,1,8,9,9,6,1,5,7,0,7,7,4,3,7,6,3,9,5,4,2,3,0,4,4,1,5,3,3,7,2,3,3,7,0,9,4,5,2,8,4,6,\ 2,1,3,4,1,4,2,6,0,8,5,1,7,3,1,4,4,7,0,5,3,4,4,8,9,1,1,9,8,3,5,1,8,3,4,4,8,3,2,8,1,2,8,7,4,1,8,1,8,0,4,\ 8,4,2,4,4,5,4,9,1,8,3,4,9,5,6,3,3,1,4,6,4,1,0,2,0,2,5,1,4,8,5,9,9,6,9,4,0,3,6,5,5,9,5,4,2,2,3,7,8,5,9,7}; long i; double t1, t2, Itime; int provided; /* Allocation */ v1 = (vector *) malloc (VLEN * sizeof (vector)); v2 = (vector *) malloc (VLEN * sizeof (vector)); v3 = (vector *) malloc (VLEN * sizeof (vector)); fin_sum = (mp_limb_t *) malloc ((2*LIMBS+1) * sizeof (mp_limb_t)); result = (mp_limb_t *) malloc ((2*LIMBS+1) * sizeof (mp_limb_t)); q = (mp_limb_t *) malloc (LIMBS * sizeof (mp_limb_t)); MPI_Init_thread (&argc, &argv, MPI_THREAD_MULTIPLE, &provided); MPI_Comm_rank (MPI_COMM_WORLD, &id); MPI_Comm_size (MPI_COMM_WORLD, &p); MPI_Type_contiguous (2*LIMBS+1, MPI_UNSIGNED_LONG_LONG, &mpntype0); MPI_Type_commit (&mpntype0); MPI_Type_contiguous (LIMBS, MPI_UNSIGNED_LONG_LONG, &mpntype1); MPI_Type_commit (&mpntype1); MPI_Op_create ((MPI_User_function *)addmpn, 1, &mpn_sum); for (i=0; i<154; ++i) str[i] = (unsigned char)arr[i]; mpn_set_str (q, str, 154, 10); //if (!id) gmp_printf ("Modulus: %Nd\n", q, LIMBS); MPI_Barrier (MPI_COMM_WORLD); /* Setting limits for 2 MPI nodes */ VOffset = BLOCK_LOW(id,p,VLEN); VChunk = BLOCK_SIZE(id,p,VLEN); /* Setting limits for NCORES-1 threads */ for (i=0; i<NCORES-1; ++i) { VStart[i] = VOffset + BLOCK_LOW(i,NCORES-1,VChunk); VEnd[i] = VOffset + BLOCK_HIGH(i,NCORES-1,VChunk); } for (i=0; i<VLEN; ++i) mpn_random (v1[i], LIMBS); for (i=0; i<VLEN; ++i) mpn_random (v2[i], LIMBS); for (i=BLOCK_LOW(id,p,VLEN); i<=BLOCK_HIGH(id,p,VLEN); ++i) mpn_random (v3[i], LIMBS); MPI_Barrier (MPI_COMM_WORLD); t1 = MPI_Wtime (); for (i=0; i<NCORES; ++i) pthread_create(&threads[i], &attr, VectMul, (void *) i); for (i=0; i<NCORES; ++i) pthread_join (threads[i], NULL); t2 = MPI_Wtime (); Itime = t2 - t1; if (!id) printf ("Total time taken: %lf\n",Itime); if (!id) gmp_printf ("Result: %Nd\n", cnum, LIMBS); MPI_Op_free(&mpn_sum); MPI_Request_free (&Rrqst); MPI_Request_free (&Srqst); MPI_Finalize (); return 0; }
int main (int argc, char **argv) { int nprocs = -1; int rank = -1; int comm = MPI_COMM_WORLD; char processor_name[128]; int namelen = 128; int buf[BUF_SIZE * 2]; int i, j, k, index, outcount, flag; int indices[2]; MPI_Request aReq[2]; MPI_Status aStatus[2]; /* init */ MPI_Init (&argc, &argv); MPI_Comm_size (comm, &nprocs); MPI_Comm_rank (comm, &rank); MPI_Get_processor_name (processor_name, &namelen); printf ("(%d) is alive on %s\n", rank, processor_name); fflush (stdout); if (rank == 0) { /* set up persistent sends... */ MPI_Send_init (&buf[0], BUF_SIZE, MPI_INT, 1, 0, comm, &aReq[0]); MPI_Send_init (&buf[BUF_SIZE], BUF_SIZE, MPI_INT, 1, 1, comm, &aReq[1]); /* initialize the send buffers */ for (i = 0; i < BUF_SIZE; i++) { buf[i] = i; buf[BUF_SIZE + i] = BUF_SIZE - 1 - i; } } for (k = 0; k < 4; k++) { if (rank == 1) { /* zero out the receive buffers */ bzero (buf, sizeof(int) * BUF_SIZE * 2); } MPI_Barrier(MPI_COMM_WORLD); if (rank == 0) { /* start the persistent sends... */ if (k % 2) { MPI_Startall (2, &aReq[0]); } else { for (j = 0; j < 2; j++) { MPI_Start (&aReq[j]); } } /* complete the sends */ if (k < 2) { /* use MPI_Waitany */ for (j = 0; j < 2; j++) MPI_Waitany (2, aReq, &index, aStatus); } else { /* use MPI_Waitsome */ j = 0; while (j < 2) { MPI_Waitsome (2, aReq, &outcount, indices, aStatus); j += outcount; } } } else if (rank == 1) { /* set up receives for all of the sends */ for (j = 0; j < 2; j++) { MPI_Irecv (&buf[j * BUF_SIZE], BUF_SIZE, MPI_INT, 0, j, comm, &aReq[j]); } /* complete all of the receives... */ MPI_Waitall (2, aReq, aStatus); } } MPI_Barrier(MPI_COMM_WORLD); if (rank == 0) { /* free the persistent requests */ for (i = 0 ; i < 2; i++) { MPI_Request_free (&aReq[i]); } } MPI_Finalize (); printf ("(%d) Finished normally\n", rank); }
int main (int argc, char *argv[]) { MPI_Init (&argc, &argv); int nProc, iProc; MPI_Comm_rank (MPI_COMM_WORLD, &iProc); MPI_Comm_size (MPI_COMM_WORLD, &nProc); // number of threads const int NTHREADS = 6; // number of buffers const int NWAY = 2; // left neighbour const int left = LEFT(iProc, nProc); // right neighbour const int right = RIGHT(iProc, nProc); // allocate array of for local vector, left halo and right halo double* array = malloc (NWAY * (NTHREADS+2) * 2 * VLEN * sizeof (double)); ASSERT (array != 0); // initial buffer id int buffer_id = 0; // initialize data data_init (NTHREADS, iProc, buffer_id, array); omp_set_num_threads (NTHREADS); MPI_Barrier (MPI_COMM_WORLD); double time = -now(); #pragma omp parallel default (shared) firstprivate (buffer_id) { const int tid = omp_get_thread_num(); for (int k = 0; k < NITER; ++k) { for ( int i = 0; i < nProc * NTHREADS; ++i ) { const int slice_id = tid + 1; const int left_halo = 0; const int right_halo = NTHREADS+1; if (tid == 0) { MPI_Request send_req[2]; MPI_Request recv_req[2]; // post recv MPI_Irecv ( &array_ELEM_right (buffer_id, left_halo, 0), VLEN, MPI_DOUBLE , left, i, MPI_COMM_WORLD, &recv_req[0]); // post recv MPI_Irecv ( &array_ELEM_left (buffer_id, right_halo, 0), VLEN, MPI_DOUBLE , right, i, MPI_COMM_WORLD, &recv_req[1]); // issue send MPI_Isend ( &array_ELEM_right (buffer_id, right_halo - 1, 0), VLEN, MPI_DOUBLE , right, i, MPI_COMM_WORLD, &send_req[0]); // issue send MPI_Isend ( &array_ELEM_left (buffer_id, left_halo + 1, 0), VLEN, MPI_DOUBLE , left, i, MPI_COMM_WORLD, &send_req[1]); // free send request MPI_Request_free(&send_req[0]); MPI_Request_free(&send_req[1]); // wait for Irecv, Isend MPI_Waitall (2, recv_req, MPI_STATUSES_IGNORE); } #pragma omp barrier // compute data, read from id "buffer_id", write to id "1 - buffer_id" data_compute (NTHREADS, array, 1 - buffer_id, buffer_id, slice_id); #pragma omp barrier // alternate the buffer buffer_id = 1 - buffer_id; } } } time += now(); data_verify (NTHREADS, iProc, ( NITER * nProc * NTHREADS ) % NWAY, array); printf ("# mpi %s nProc %d vlen %i niter %d nthreads %i nway %i time %g\n" , argv[0], nProc, VLEN, NITER, NTHREADS, NWAY, time ); MPI_Finalize(); free (array); return EXIT_SUCCESS; }
HYPRE_Int hypre_MPI_Request_free( hypre_MPI_Request *request ) { return (HYPRE_Int) MPI_Request_free(request); }
void _XMP_reflect_pcopy_sched_dim(_XMP_array_t *adesc, int target_dim, int lwidth, int uwidth, int is_periodic, int shadow_comm_type){ if (lwidth == 0 && uwidth == 0) return; _XMP_array_info_t *ai = &(adesc->info[target_dim]); _XMP_array_info_t *ainfo = adesc->info; _XMP_ASSERT(ai->align_manner == _XMP_N_ALIGN_BLOCK); _XMP_ASSERT(ai->is_shadow_comm_member); if (lwidth > ai->shadow_size_lo || uwidth > ai->shadow_size_hi){ _XMP_fatal("reflect width is larger than shadow width."); } _XMP_reflect_sched_t *reflect = ai->reflect_sched; int target_tdim = ai->align_template_index; _XMP_nodes_info_t *ni = adesc->align_template->chunk[target_tdim].onto_nodes_info; if (ni->size == 1 && !is_periodic) return; int ndims = adesc->dim; // 0-origin int my_pos = ni->rank; int lb_pos = _XMP_get_owner_pos(adesc, target_dim, ai->ser_lower); int ub_pos = _XMP_get_owner_pos(adesc, target_dim, ai->ser_upper); int lo_pos = (my_pos == lb_pos) ? ub_pos : my_pos - 1; int hi_pos = (my_pos == ub_pos) ? lb_pos : my_pos + 1; MPI_Comm *comm = adesc->align_template->onto_nodes->comm; int my_rank = adesc->align_template->onto_nodes->comm_rank; int lo_rank = my_rank + (lo_pos - my_pos) * ni->multiplier; int hi_rank = my_rank + (hi_pos - my_pos) * ni->multiplier; int count = 0, blocklength = 0; long long stride = 0; int type_size = adesc->type_size; void *array_addr = adesc->array_addr_p; void *lo_send_array = NULL, *lo_recv_array = NULL; void *hi_send_array = NULL, *hi_recv_array = NULL; void *lo_send_buf = NULL; void *lo_recv_buf = NULL; void *hi_send_buf = NULL; void *hi_recv_buf = NULL; int lo_buf_size = 0; int hi_buf_size = 0; if (reflect->prev_pcopy_sched_type && lwidth == reflect->lo_width && uwidth == reflect->hi_width && is_periodic == reflect->is_periodic){ if ((adesc->order == MPI_ORDER_FORTRAN && target_dim != ndims - 1) || (adesc->order == MPI_ORDER_C && target_dim != 0)){ goto init_comm; } else if (reflect->prev_pcopy_sched_type != shadow_comm_type){ count = reflect->count; blocklength = reflect->blocklength; stride = reflect->stride; goto alloc_buf; } } // // setup data_type // if (adesc->order == MPI_ORDER_FORTRAN){ /* for XMP/F */ count = 1; blocklength = type_size; stride = ainfo[0].alloc_size * type_size; for (int i = ndims - 2; i >= target_dim; i--){ count *= ainfo[i+1].alloc_size; } for (int i = 1; i <= target_dim; i++){ blocklength *= ainfo[i-1].alloc_size; stride *= ainfo[i].alloc_size; } } else if (adesc->order == MPI_ORDER_C){ /* for XMP/C */ count = 1; blocklength = type_size; stride = ainfo[ndims-1].alloc_size * type_size; for (int i = 1; i <= target_dim; i++){ count *= ainfo[i-1].alloc_size; } for (int i = ndims - 2; i >= target_dim; i--){ blocklength *= ainfo[i+1].alloc_size; stride *= ainfo[i].alloc_size; } } else { _XMP_fatal("cannot determin the base language."); } // // calculate base address // alloc_buf: // for lower reflect if (lwidth){ lo_send_array = array_addr; lo_recv_array = array_addr; for (int i = 0; i < ndims; i++) { int lb_send, lb_recv; unsigned long long dim_acc; if (i == target_dim) { lb_send = ainfo[i].local_upper - lwidth + 1; lb_recv = ainfo[i].shadow_size_lo - lwidth;; } else { // Note: including shadow area lb_send = 0; lb_recv = 0; } dim_acc = ainfo[i].dim_acc; lo_send_array = (void *)((char *)lo_send_array + lb_send * dim_acc * type_size); lo_recv_array = (void *)((char *)lo_recv_array + lb_recv * dim_acc * type_size); } } // for upper reflect if (uwidth){ hi_send_array = array_addr; hi_recv_array = array_addr; for (int i = 0; i < ndims; i++) { int lb_send, lb_recv; 
unsigned long long dim_acc; if (i == target_dim) { lb_send = ainfo[i].local_lower; lb_recv = ainfo[i].local_upper + 1; } else { // Note: including shadow area lb_send = 0; lb_recv = 0; } dim_acc = ainfo[i].dim_acc; hi_send_array = (void *)((char *)hi_send_array + lb_send * dim_acc * type_size); hi_recv_array = (void *)((char *)hi_recv_array + lb_recv * dim_acc * type_size); } } // // Allocate buffers // if (reflect->prev_pcopy_sched_type == _XMP_COMM_REFLECT && ((adesc->order == MPI_ORDER_FORTRAN && target_dim == ndims - 1) || (adesc->order == MPI_ORDER_C && target_dim == 0))){ ; } else { _XMP_free(reflect->lo_send_buf); _XMP_free(reflect->lo_recv_buf); _XMP_free(reflect->hi_send_buf); _XMP_free(reflect->hi_recv_buf); } // for lower reflect if (lwidth){ lo_buf_size = lwidth * blocklength * count; if (shadow_comm_type == _XMP_COMM_REFLECT && ((adesc->order == MPI_ORDER_FORTRAN && target_dim == ndims - 1) || (adesc->order == MPI_ORDER_C && target_dim == 0))){ lo_send_buf = lo_send_array; lo_recv_buf = lo_recv_array; } else { _XMP_TSTART(t0); lo_send_buf = _XMP_alloc(lo_buf_size); lo_recv_buf = _XMP_alloc(lo_buf_size); _XMP_TEND2(xmptiming_.t_mem, xmptiming_.tdim_mem[target_dim], t0); } } // for upper reflect if (uwidth){ hi_buf_size = uwidth * blocklength * count; if (shadow_comm_type == _XMP_COMM_REFLECT && ((adesc->order == MPI_ORDER_FORTRAN && target_dim == ndims - 1) || (adesc->order == MPI_ORDER_C && target_dim == 0))){ hi_send_buf = hi_send_array; hi_recv_buf = hi_recv_array; } else { _XMP_TSTART(t0); hi_send_buf = _XMP_alloc(hi_buf_size); hi_recv_buf = _XMP_alloc(hi_buf_size); _XMP_TEND2(xmptiming_.t_mem, xmptiming_.tdim_mem[target_dim], t0); } } // // cache schedule // reflect->count = count; reflect->blocklength = blocklength; reflect->stride = stride; reflect->lo_send_array = lo_send_array; reflect->lo_recv_array = lo_recv_array; reflect->hi_send_array = hi_send_array; reflect->hi_recv_array = hi_recv_array; reflect->lo_send_buf = lo_send_buf; reflect->lo_recv_buf = lo_recv_buf; reflect->hi_send_buf = hi_send_buf; reflect->hi_recv_buf = hi_recv_buf; // // initialize communication // int src, dst; init_comm: if (!is_periodic && my_pos == lb_pos){ // no periodic lo_rank = MPI_PROC_NULL; } if (!is_periodic && my_pos == ub_pos){ // no periodic hi_rank = MPI_PROC_NULL; } lo_buf_size = lwidth * reflect->blocklength * reflect->count; hi_buf_size = uwidth * reflect->blocklength * reflect->count; // for lower shadow if (lwidth){ src = lo_rank; dst = hi_rank; } else { src = MPI_PROC_NULL; dst = MPI_PROC_NULL; } if (shadow_comm_type == _XMP_COMM_REDUCE_SHADOW){ if (reflect->req_reduce[0] != MPI_REQUEST_NULL){ MPI_Request_free(&reflect->req_reduce[0]); } if (reflect->req_reduce[1] != MPI_REQUEST_NULL){ MPI_Request_free(&reflect->req_reduce[1]); } MPI_Send_init(reflect->lo_recv_buf, lo_buf_size, MPI_BYTE, src, _XMP_N_MPI_TAG_REFLECT_LO, *comm, &reflect->req_reduce[0]); MPI_Recv_init(reflect->lo_send_buf, lo_buf_size, MPI_BYTE, dst, _XMP_N_MPI_TAG_REFLECT_LO, *comm, &reflect->req_reduce[1]); } else { if (reflect->req[0] != MPI_REQUEST_NULL){ MPI_Request_free(&reflect->req[0]); } if (reflect->req[1] != MPI_REQUEST_NULL){ MPI_Request_free(&reflect->req[1]); } MPI_Recv_init(reflect->lo_recv_buf, lo_buf_size, MPI_BYTE, src, _XMP_N_MPI_TAG_REFLECT_LO, *comm, &reflect->req[0]); MPI_Send_init(reflect->lo_send_buf, lo_buf_size, MPI_BYTE, dst, _XMP_N_MPI_TAG_REFLECT_LO, *comm, &reflect->req[1]); } // for upper shadow if (uwidth){ src = hi_rank; dst = lo_rank; } else { src = MPI_PROC_NULL; dst = 
MPI_PROC_NULL; } if (shadow_comm_type == _XMP_COMM_REDUCE_SHADOW){ if (reflect->req_reduce[2] != MPI_REQUEST_NULL){ MPI_Request_free(&reflect->req_reduce[2]); } if (reflect->req_reduce[3] != MPI_REQUEST_NULL){ MPI_Request_free(&reflect->req_reduce[3]); } MPI_Send_init(reflect->hi_recv_buf, hi_buf_size, MPI_BYTE, src, _XMP_N_MPI_TAG_REFLECT_HI, *comm, &reflect->req_reduce[2]); MPI_Recv_init(reflect->hi_send_buf, hi_buf_size, MPI_BYTE, dst, _XMP_N_MPI_TAG_REFLECT_HI, *comm, &reflect->req_reduce[3]); } else { if (reflect->req[2] != MPI_REQUEST_NULL){ MPI_Request_free(&reflect->req[2]); } if (reflect->req[3] != MPI_REQUEST_NULL){ MPI_Request_free(&reflect->req[3]); } MPI_Recv_init(reflect->hi_recv_buf, hi_buf_size, MPI_BYTE, src, _XMP_N_MPI_TAG_REFLECT_HI, *comm, &reflect->req[2]); MPI_Send_init(reflect->hi_send_buf, hi_buf_size, MPI_BYTE, dst, _XMP_N_MPI_TAG_REFLECT_HI, *comm, &reflect->req[3]); } reflect->prev_pcopy_sched_type = shadow_comm_type; reflect->lo_rank = lo_rank; reflect->hi_rank = hi_rank; }
dart_ret_t dart_waitall( dart_handle_t * handle, size_t n) { int i, r_n; int num_handles = (int)n; DART_LOG_DEBUG("dart_waitall()"); if (n == 0) { DART_LOG_ERROR("dart_waitall > number of handles = 0"); return DART_OK; } if (n > INT_MAX) { DART_LOG_ERROR("dart_waitall ! number of handles > INT_MAX"); return DART_ERR_INVAL; } DART_LOG_DEBUG("dart_waitall: number of handles: %d", num_handles); if (*handle) { MPI_Status *mpi_sta; MPI_Request *mpi_req; mpi_req = (MPI_Request *) malloc(num_handles * sizeof(MPI_Request)); mpi_sta = (MPI_Status *) malloc(num_handles * sizeof(MPI_Status)); /* * copy requests from DART handles to MPI request array: */ DART_LOG_TRACE("dart_waitall: copying DART handles to MPI request array"); r_n = 0; for (i = 0; i < num_handles; i++) { if (handle[i] != NULL) { DART_LOG_DEBUG("dart_waitall: -- handle[%d](%p): " "dest:%d win:%"PRIu64" req:%"PRIu64"", i, (void*)handle[i], handle[i]->dest, (uint64_t)handle[i]->win, (uint64_t)handle[i]->request); mpi_req[r_n] = handle[i]->request; r_n++; } } /* * wait for communication of MPI requests: */ DART_LOG_DEBUG("dart_waitall: MPI_Waitall, %d requests from %d handles", r_n, num_handles); /* From the MPI 3.1 standard: * * The i-th entry in array_of_statuses is set to the return * status of the i-th operation. Active persistent requests * are marked inactive. * Requests of any other type are deallocated and the * corresponding handles in the array are set to * MPI_REQUEST_NULL. * The list may contain null or inactive handles. * The call sets to empty the status of each such entry. */ if (r_n > 0) { if (MPI_Waitall(r_n, mpi_req, mpi_sta) == MPI_SUCCESS) { DART_LOG_DEBUG("dart_waitall: MPI_Waitall completed"); } else { DART_LOG_ERROR("dart_waitall: MPI_Waitall failed"); DART_LOG_TRACE("dart_waitall: free MPI_Request temporaries"); free(mpi_req); DART_LOG_TRACE("dart_waitall: free MPI_Status temporaries"); free(mpi_sta); return DART_ERR_INVAL; } } else { DART_LOG_DEBUG("dart_waitall > number of requests = 0"); return DART_OK; } /* * copy MPI requests back to DART handles: */ DART_LOG_TRACE("dart_waitall: copying MPI requests back to DART handles"); r_n = 0; for (i = 0; i < num_handles; i++) { if (handle[i]) { if (mpi_req[r_n] == MPI_REQUEST_NULL) { DART_LOG_TRACE("dart_waitall: -- mpi_req[%d] = MPI_REQUEST_NULL", r_n); } else { DART_LOG_TRACE("dart_waitall: -- mpi_req[%d] = %d", r_n, mpi_req[r_n]); } DART_LOG_TRACE("dart_waitall: -- mpi_sta[%d].MPI_SOURCE: %d", r_n, mpi_sta[r_n].MPI_SOURCE); DART_LOG_TRACE("dart_waitall: -- mpi_sta[%d].MPI_ERROR: %d:%s", r_n, mpi_sta[r_n].MPI_ERROR, DART__MPI__ERROR_STR(mpi_sta[r_n].MPI_ERROR)); handle[i]->request = mpi_req[r_n]; r_n++; } } /* * wait for completion of MPI requests at origins and targets: */ DART_LOG_DEBUG("dart_waitall: waiting for remote completion"); for (i = 0; i < num_handles; i++) { if (handle[i]) { if (handle[i]->request == MPI_REQUEST_NULL) { DART_LOG_TRACE("dart_waitall: -- handle[%d] done (MPI_REQUEST_NULL)", i); } else { DART_LOG_DEBUG("dart_waitall: -- MPI_Win_flush(handle[%d]: %p))", i, (void*)handle[i]); DART_LOG_TRACE("dart_waitall: handle[%d]->dest: %d", i, handle[i]->dest); DART_LOG_TRACE("dart_waitall: handle[%d]->win: %"PRIu64"", i, (uint64_t)handle[i]->win); DART_LOG_TRACE("dart_waitall: handle[%d]->req: %"PRIu64"", i, (uint64_t)handle[i]->request); /* * MPI_Win_flush to wait for remote completion: */ if (MPI_Win_flush(handle[i]->dest, handle[i]->win) != MPI_SUCCESS) { DART_LOG_ERROR("dart_waitall: MPI_Win_flush failed"); DART_LOG_TRACE("dart_waitall: free 
MPI_Request temporaries"); free(mpi_req); DART_LOG_TRACE("dart_waitall: free MPI_Status temporaries"); free(mpi_sta); return DART_ERR_INVAL; } DART_LOG_TRACE("dart_waitall: -- MPI_Request_free"); if (MPI_Request_free(&handle[i]->request) != MPI_SUCCESS) { DART_LOG_ERROR("dart_waitall: MPI_Request_free failed"); DART_LOG_TRACE("dart_waitall: free MPI_Request temporaries"); free(mpi_req); DART_LOG_TRACE("dart_waitall: free MPI_Status temporaries"); free(mpi_sta); return DART_ERR_INVAL; } } } } /* * free memory: */ DART_LOG_DEBUG("dart_waitall: free handles"); for (i = 0; i < num_handles; i++) { if (handle[i]) { /* Free handle resource */ DART_LOG_TRACE("dart_waitall: -- free handle[%d]: %p", i, (void*)(handle[i])); free(handle[i]); handle[i] = NULL; } } DART_LOG_TRACE("dart_waitall: free MPI_Request temporaries"); free(mpi_req); DART_LOG_TRACE("dart_waitall: free MPI_Status temporaries"); free(mpi_sta); } DART_LOG_DEBUG("dart_waitall > finished"); return DART_OK; }
void pzgstrs(int_t n, LUstruct_t *LUstruct, ScalePermstruct_t *ScalePermstruct, gridinfo_t *grid, doublecomplex *B, int_t m_loc, int_t fst_row, int_t ldb, int nrhs, SOLVEstruct_t *SOLVEstruct, SuperLUStat_t *stat, int *info) { /* * Purpose * ======= * * PZGSTRS solves a system of distributed linear equations * A*X = B with a general N-by-N matrix A using the LU factorization * computed by PZGSTRF. * If the equilibration, and row and column permutations were performed, * the LU factorization was performed for A1 where * A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U * and the linear system solved is * A1 * Y = Pc*Pr*B1, where B was overwritten by B1 = diag(R)*B, and * the permutation to B1 by Pc*Pr is applied internally in this routine. * * Arguments * ========= * * n (input) int (global) * The order of the system of linear equations. * * LUstruct (input) LUstruct_t* * The distributed data structures storing L and U factors. * The L and U factors are obtained from PZGSTRF for * the possibly scaled and permuted matrix A. * See superlu_zdefs.h for the definition of 'LUstruct_t'. * A may be scaled and permuted into A1, so that * A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U * * grid (input) gridinfo_t* * The 2D process mesh. It contains the MPI communicator, the number * of process rows (NPROW), the number of process columns (NPCOL), * and my process rank. It is an input argument to all the * parallel routines. * Grid can be initialized by subroutine SUPERLU_GRIDINIT. * See superlu_defs.h for the definition of 'gridinfo_t'. * * B (input/output) doublecomplex* * On entry, the distributed right-hand side matrix of the possibly * equilibrated system. That is, B may be overwritten by diag(R)*B. * On exit, the distributed solution matrix Y of the possibly * equilibrated system if info = 0, where Y = Pc*diag(C)^(-1)*X, * and X is the solution of the original system. * * m_loc (input) int (local) * The local row dimension of matrix B. * * fst_row (input) int (global) * The row number of B's first row in the global matrix. * * ldb (input) int (local) * The leading dimension of matrix B. * * nrhs (input) int (global) * Number of right-hand sides. * * SOLVEstruct (output) SOLVEstruct_t* (global) * Contains the information for the communication during the * solution phase. * * stat (output) SuperLUStat_t* * Record the statistics about the triangular solves. * See util.h for the definition of 'SuperLUStat_t'. * * info (output) int* * = 0: successful exit * < 0: if info = -i, the i-th argument had an illegal value * */ Glu_persist_t *Glu_persist = LUstruct->Glu_persist; LocalLU_t *Llu = LUstruct->Llu; doublecomplex alpha = {1.0, 0.0}; doublecomplex zero = {0.0, 0.0}; doublecomplex *lsum; /* Local running sum of the updates to B-components */ doublecomplex *x; /* X component at step k. */ /* NOTE: x and lsum are of same size. */ doublecomplex *lusup, *dest; doublecomplex *recvbuf, *tempv; doublecomplex *rtemp; /* Result of full matrix-vector multiply. */ int_t **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; int_t *Urbs, *Urbs1; /* Number of row blocks in each block column of U. */ Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */ int_t **Ucb_valptr; /* Vertical linked list pointing to Unzval[] */ int_t iam, kcol, krow, mycol, myrow; int_t i, ii, il, j, jj, k, lb, ljb, lk, lptr, luptr; int_t nb, nlb, nub, nsupers; int_t *xsup, *supno, *lsub, *usub; int_t *ilsum; /* Starting position of each supernode in lsum (LOCAL)*/ int_t Pc, Pr; int knsupc, nsupr; int ldalsum; /* Number of lsum entries locally owned. 
*/ int maxrecvsz, p, pi; int_t **Lrowind_bc_ptr; doublecomplex **Lnzval_bc_ptr; MPI_Status status; #ifdef ISEND_IRECV MPI_Request *send_req, recv_req; #endif pxgstrs_comm_t *gstrs_comm = SOLVEstruct->gstrs_comm; /*-- Counts used for L-solve --*/ int_t *fmod; /* Modification count for L-solve -- Count the number of local block products to be summed into lsum[lk]. */ int_t **fsendx_plist = Llu->fsendx_plist; int_t nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */ int_t *frecv; /* Count of lsum[lk] contributions to be received from processes in this row. It is only valid on the diagonal processes. */ int_t nfrecvmod = 0; /* Count of total modifications to be recv'd. */ int_t nleaf = 0, nroot = 0; /*-- Counts used for U-solve --*/ int_t *bmod; /* Modification count for U-solve. */ int_t **bsendx_plist = Llu->bsendx_plist; int_t nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. */ int_t *brecv; /* Count of modifications to be recv'd from processes in this row. */ int_t nbrecvmod = 0; /* Count of total modifications to be recv'd. */ double t; #if ( DEBUGlevel>=2 ) int_t Ublocks = 0; #endif t = SuperLU_timer_(); /* Test input parameters. */ *info = 0; if ( n < 0 ) *info = -1; else if ( nrhs < 0 ) *info = -9; if ( *info ) { pxerbla("PZGSTRS", grid, -*info); return; } /* * Initialization. */ iam = grid->iam; Pc = grid->npcol; Pr = grid->nprow; myrow = MYROW( iam, grid ); mycol = MYCOL( iam, grid ); xsup = Glu_persist->xsup; supno = Glu_persist->supno; nsupers = supno[n-1] + 1; Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; nlb = CEILING( nsupers, Pr ); /* Number of local block rows. */ #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter pzgstrs()"); #endif stat->ops[SOLVE] = 0.0; Llu->SolveMsgSent = 0; /* Save the count to be altered so it can be used by subsequent call to PDGSTRS. */ if ( !(fmod = intMalloc_dist(nlb)) ) ABORT("Calloc fails for fmod[]."); for (i = 0; i < nlb; ++i) fmod[i] = Llu->fmod[i]; if ( !(frecv = intMalloc_dist(nlb)) ) ABORT("Malloc fails for frecv[]."); Llu->frecv = frecv; #ifdef ISEND_IRECV k = SUPERLU_MAX( Llu->nfsendx, Llu->nbsendx ) + nlb; if ( !(send_req = (MPI_Request*) SUPERLU_MALLOC(k*sizeof(MPI_Request))) ) ABORT("Malloc fails for send_req[]."); #endif #ifdef _CRAY ftcs1 = _cptofcd("L", strlen("L")); ftcs2 = _cptofcd("N", strlen("N")); ftcs3 = _cptofcd("U", strlen("U")); #endif /* Obtain ilsum[] and ldalsum for process column 0. */ ilsum = Llu->ilsum; ldalsum = Llu->ldalsum; /* Allocate working storage. */ knsupc = sp_ienv_dist(3); maxrecvsz = knsupc * nrhs + SUPERLU_MAX( XK_H, LSUM_H ); if ( !(lsum = doublecomplexCalloc_dist(((size_t)ldalsum)*nrhs + nlb*LSUM_H)) ) ABORT("Calloc fails for lsum[]."); if ( !(x = doublecomplexMalloc_dist(ldalsum * nrhs + nlb * XK_H)) ) ABORT("Malloc fails for x[]."); if ( !(recvbuf = doublecomplexMalloc_dist(maxrecvsz)) ) ABORT("Malloc fails for recvbuf[]."); if ( !(rtemp = doublecomplexCalloc_dist(maxrecvsz)) ) ABORT("Malloc fails for rtemp[]."); /*--------------------------------------------------- * Forward solve Ly = b. *---------------------------------------------------*/ /* Redistribute B into X on the diagonal processes. */ pzReDistribute_B_to_X(B, m_loc, nrhs, ldb, fst_row, ilsum, x, ScalePermstruct, Glu_persist, grid, SOLVEstruct); /* Set up the headers in lsum[]. */ ii = 0; for (k = 0; k < nsupers; ++k) { knsupc = SuperSize( k ); krow = PROW( k, grid ); if ( myrow == krow ) { lk = LBi( k, grid ); /* Local block number. 
*/ il = LSUM_BLK( lk ); lsum[il - LSUM_H].r = k;/* Block number prepended in the header.*/ lsum[il - LSUM_H].i = 0; } ii += knsupc; } /* * Compute frecv[] and nfrecvmod counts on the diagonal processes. */ { superlu_scope_t *scp = &grid->rscp; for (k = 0; k < nsupers; ++k) { krow = PROW( k, grid ); if ( myrow == krow ) { lk = LBi( k, grid ); /* Local block number. */ kcol = PCOL( k, grid ); /* Root process in this row scope. */ if ( mycol != kcol && fmod[lk] ) i = 1; /* Contribution from non-diagonal process. */ else i = 0; MPI_Reduce( &i, &frecv[lk], 1, mpi_int_t, MPI_SUM, kcol, scp->comm ); if ( mycol == kcol ) { /* Diagonal process. */ nfrecvmod += frecv[lk]; if ( !frecv[lk] && !fmod[lk] ) ++nleaf; #if ( DEBUGlevel>=2 ) printf("(%2d) frecv[%4d] %2d\n", iam, k, frecv[lk]); assert( frecv[lk] < Pc ); #endif } } } } /* --------------------------------------------------------- Solve the leaf nodes first by all the diagonal processes. --------------------------------------------------------- */ #if ( DEBUGlevel>=2 ) printf("(%2d) nleaf %4d\n", iam, nleaf); #endif for (k = 0; k < nsupers && nleaf; ++k) { krow = PROW( k, grid ); kcol = PCOL( k, grid ); if ( myrow == krow && mycol == kcol ) { /* Diagonal process */ knsupc = SuperSize( k ); lk = LBi( k, grid ); if ( frecv[lk]==0 && fmod[lk]==0 ) { fmod[lk] = -1; /* Do not solve X[k] in the future. */ ii = X_BLK( lk ); lk = LBj( k, grid ); /* Local block number, column-wise. */ lsub = Lrowind_bc_ptr[lk]; lusup = Lnzval_bc_ptr[lk]; nsupr = lsub[1]; #ifdef _CRAY CTRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc); #elif defined (USE_VENDOR_BLAS) ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); #else ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc); #endif stat->ops[SOLVE] += 4 * knsupc * (knsupc - 1) * nrhs + 10 * knsupc * nrhs; /* complex division */ --nleaf; #if ( DEBUGlevel>=2 ) printf("(%2d) Solve X[%2d]\n", iam, k); #endif /* * Send Xk to process column Pc[k]. */ for (p = 0; p < Pr; ++p) { if ( fsendx_plist[lk][p] != EMPTY ) { pi = PNUM( p, kcol, grid ); #ifdef ISEND_IRECV MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H, SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm, &send_req[Llu->SolveMsgSent++]); #else MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm ); #endif #if ( DEBUGlevel>=2 ) printf("(%2d) Sent X[%2.0f] to P %2d\n", iam, x[ii-XK_H], pi); #endif } } /* * Perform local block modifications: lsum[i] -= L_i,k * X[k] */ nb = lsub[0] - 1; lptr = BC_HEADER + LB_DESCRIPTOR + knsupc; luptr = knsupc; /* Skip diagonal block L(k,k). */ zlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k, fmod, nb, lptr, luptr, xsup, grid, Llu, send_req, stat); } } /* if diagonal process ... */ } /* for k ... */ /* ----------------------------------------------------------- Compute the internal nodes asynchronously by all processes. ----------------------------------------------------------- */ #if ( DEBUGlevel>=2 ) printf("(%2d) nfrecvx %4d, nfrecvmod %4d, nleaf %4d\n", iam, nfrecvx, nfrecvmod, nleaf); #endif while ( nfrecvx || nfrecvmod ) { /* While not finished. */ /* Receive a message. 
*/ #ifdef ISEND_IRECV /* -MPI- FATAL: Remote protocol queue full */ MPI_Irecv( recvbuf, maxrecvsz, SuperLU_MPI_DOUBLE_COMPLEX, MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &recv_req ); MPI_Wait( &recv_req, &status ); #else MPI_Recv( recvbuf, maxrecvsz, SuperLU_MPI_DOUBLE_COMPLEX, MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &status ); #endif k = (*recvbuf).r; #if ( DEBUGlevel>=2 ) printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG); #endif switch ( status.MPI_TAG ) { case Xk: --nfrecvx; lk = LBj( k, grid ); /* Local block number, column-wise. */ lsub = Lrowind_bc_ptr[lk]; lusup = Lnzval_bc_ptr[lk]; if ( lsub ) { nb = lsub[0]; lptr = BC_HEADER; luptr = 0; knsupc = SuperSize( k ); /* * Perform local block modifications: lsum[i] -= L_i,k * X[k] */ zlsum_fmod(lsum, x, &recvbuf[XK_H], rtemp, nrhs, knsupc, k, fmod, nb, lptr, luptr, xsup, grid, Llu, send_req, stat); } /* if lsub */ break; case LSUM: /* Receiver must be a diagonal process */ --nfrecvmod; lk = LBi( k, grid ); /* Local block number, row-wise. */ ii = X_BLK( lk ); knsupc = SuperSize( k ); tempv = &recvbuf[LSUM_H]; RHS_ITERATE(j) { for (i = 0; i < knsupc; ++i) z_add(&x[i + ii + j*knsupc], &x[i + ii + j*knsupc], &tempv[i + j*knsupc]); } if ( (--frecv[lk])==0 && fmod[lk]==0 ) { fmod[lk] = -1; /* Do not solve X[k] in the future. */ lk = LBj( k, grid ); /* Local block number, column-wise. */ lsub = Lrowind_bc_ptr[lk]; lusup = Lnzval_bc_ptr[lk]; nsupr = lsub[1]; #ifdef _CRAY CTRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc); #elif defined (USE_VENDOR_BLAS) ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); #else ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc); #endif stat->ops[SOLVE] += 4 * knsupc * (knsupc - 1) * nrhs + 10 * knsupc * nrhs; /* complex division */ #if ( DEBUGlevel>=2 ) printf("(%2d) Solve X[%2d]\n", iam, k); #endif /* * Send Xk to process column Pc[k]. */ kcol = PCOL( k, grid ); for (p = 0; p < Pr; ++p) { if ( fsendx_plist[lk][p] != EMPTY ) { pi = PNUM( p, kcol, grid ); #ifdef ISEND_IRECV MPI_Isend( &x[ii-XK_H], knsupc * nrhs + XK_H, SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm, &send_req[Llu->SolveMsgSent++]); #else MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm ); #endif #if ( DEBUGlevel>=2 ) printf("(%2d) Sent X[%2.0f] to P %2d\n", iam, x[ii-XK_H], pi); #endif } } /* * Perform local block modifications. */ nb = lsub[0] - 1; lptr = BC_HEADER + LB_DESCRIPTOR + knsupc; luptr = knsupc; /* Skip diagonal block L(k,k). */ zlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k, fmod, nb, lptr, luptr, xsup, grid, Llu, send_req, stat); } /* if */ break; #if ( DEBUGlevel>=2 ) default: printf("(%2d) Recv'd wrong message tag %4d\n", status.MPI_TAG); break; #endif } /* switch */ } /* while not finished ... */ #if ( PRNTlevel>=2 ) t = SuperLU_timer_() - t; if ( !iam ) printf(".. L-solve time\t%8.2f\n", t); t = SuperLU_timer_(); #endif #if ( DEBUGlevel==2 ) { printf("(%d) .. 
After L-solve: y =\n", iam); for (i = 0, k = 0; k < nsupers; ++k) { krow = PROW( k, grid ); kcol = PCOL( k, grid ); if ( myrow == krow && mycol == kcol ) { /* Diagonal process */ knsupc = SuperSize( k ); lk = LBi( k, grid ); ii = X_BLK( lk ); for (j = 0; j < knsupc; ++j) printf("\t(%d)\t%4d\t%.10f\n", iam, xsup[k]+j, x[ii+j]); fflush(stdout); } MPI_Barrier( grid->comm ); } } #endif SUPERLU_FREE(fmod); SUPERLU_FREE(frecv); SUPERLU_FREE(rtemp); #ifdef ISEND_IRECV for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]); Llu->SolveMsgSent = 0; #endif /*--------------------------------------------------- * Back solve Ux = y. * * The Y components from the forward solve is already * on the diagonal processes. *---------------------------------------------------*/ /* Save the count to be altered so it can be used by subsequent call to PZGSTRS. */ if ( !(bmod = intMalloc_dist(nlb)) ) ABORT("Calloc fails for bmod[]."); for (i = 0; i < nlb; ++i) bmod[i] = Llu->bmod[i]; if ( !(brecv = intMalloc_dist(nlb)) ) ABORT("Malloc fails for brecv[]."); Llu->brecv = brecv; /* * Compute brecv[] and nbrecvmod counts on the diagonal processes. */ { superlu_scope_t *scp = &grid->rscp; for (k = 0; k < nsupers; ++k) { krow = PROW( k, grid ); if ( myrow == krow ) { lk = LBi( k, grid ); /* Local block number. */ kcol = PCOL( k, grid ); /* Root process in this row scope. */ if ( mycol != kcol && bmod[lk] ) i = 1; /* Contribution from non-diagonal process. */ else i = 0; MPI_Reduce( &i, &brecv[lk], 1, mpi_int_t, MPI_SUM, kcol, scp->comm ); if ( mycol == kcol ) { /* Diagonal process. */ nbrecvmod += brecv[lk]; if ( !brecv[lk] && !bmod[lk] ) ++nroot; #if ( DEBUGlevel>=2 ) printf("(%2d) brecv[%4d] %2d\n", iam, k, brecv[lk]); assert( brecv[lk] < Pc ); #endif } } } } /* Re-initialize lsum to zero. Each block header is already in place. */ for (k = 0; k < nsupers; ++k) { krow = PROW( k, grid ); if ( myrow == krow ) { knsupc = SuperSize( k ); lk = LBi( k, grid ); il = LSUM_BLK( lk ); dest = &lsum[il]; RHS_ITERATE(j) { for (i = 0; i < knsupc; ++i) dest[i + j*knsupc] = zero; } } }
dart_ret_t dart_waitall_local( dart_handle_t * handle, size_t num_handles) { size_t i, r_n = 0; DART_LOG_DEBUG("dart_waitall_local()"); if (num_handles == 0) { DART_LOG_DEBUG("dart_waitall_local > number of handles = 0"); return DART_OK; } if (num_handles > INT_MAX) { DART_LOG_ERROR("dart_waitall_local ! number of handles > INT_MAX"); return DART_ERR_INVAL; } if (*handle != NULL) { MPI_Status *mpi_sta; MPI_Request *mpi_req; mpi_req = (MPI_Request *) malloc(num_handles * sizeof(MPI_Request)); mpi_sta = (MPI_Status *) malloc(num_handles * sizeof(MPI_Status)); for (i = 0; i < num_handles; i++) { if (handle[i] != NULL) { DART_LOG_TRACE("dart_waitall_local: -- handle[%d]: %p)", i, (void*)handle[i]); DART_LOG_TRACE("dart_waitall_local: handle[%d]->dest: %d", i, handle[i]->dest); DART_LOG_TRACE("dart_waitall_local: handle[%d]->win: %d", i, handle[i]->win); DART_LOG_TRACE("dart_waitall_local: handle[%d]->req: %d", i, handle[i]->request); mpi_req[r_n] = handle[i]->request; r_n++; } } /* * Wait for local completion of MPI requests: */ DART_LOG_DEBUG("dart_waitall_local: " "MPI_Waitall, %d requests from %zu handles", r_n, num_handles); if (r_n > 0) { if (MPI_Waitall(r_n, mpi_req, mpi_sta) == MPI_SUCCESS) { DART_LOG_DEBUG("dart_waitall_local: MPI_Waitall completed"); } else { DART_LOG_ERROR("dart_waitall_local: MPI_Waitall failed"); DART_LOG_TRACE("dart_waitall_local: free MPI_Request temporaries"); free(mpi_req); DART_LOG_TRACE("dart_waitall_local: free MPI_Status temporaries"); free(mpi_sta); return DART_ERR_INVAL; } } else { DART_LOG_DEBUG("dart_waitall_local > number of requests = 0"); return DART_OK; } /* * copy MPI requests back to DART handles: */ DART_LOG_TRACE("dart_waitall_local: " "copying MPI requests back to DART handles"); r_n = 0; for (i = 0; i < num_handles; i++) { if (handle[i]) { DART_LOG_TRACE("dart_waitall_local: -- mpi_sta[%d].MPI_SOURCE: %d", r_n, mpi_sta[r_n].MPI_SOURCE); DART_LOG_TRACE("dart_waitall_local: -- mpi_sta[%d].MPI_ERROR: %d:%s", r_n, mpi_sta[r_n].MPI_ERROR, DART__MPI__ERROR_STR(mpi_sta[r_n].MPI_ERROR)); if (mpi_req[r_n] != MPI_REQUEST_NULL) { if (mpi_sta[r_n].MPI_ERROR == MPI_SUCCESS) { DART_LOG_TRACE("dart_waitall_local: -- MPI_Request_free"); if (MPI_Request_free(&mpi_req[r_n]) != MPI_SUCCESS) { DART_LOG_TRACE("dart_waitall_local ! MPI_Request_free failed"); free(mpi_req); free(mpi_sta); return DART_ERR_INVAL; } } else { DART_LOG_TRACE("dart_waitall_local: cannot free request %d " "mpi_sta[%d] = %d (%s)", r_n, r_n, mpi_sta[r_n].MPI_ERROR, DART__MPI__ERROR_STR(mpi_sta[r_n].MPI_ERROR)); } } DART_LOG_DEBUG("dart_waitall_local: free handle[%d] %p", i, (void*)(handle[i])); free(handle[i]); handle[i] = NULL; r_n++; } } DART_LOG_TRACE("dart_waitall_local: free MPI_Request temporaries"); free(mpi_req); DART_LOG_TRACE("dart_waitall_local: free MPI_Status temporaries"); free(mpi_sta); } DART_LOG_DEBUG("dart_waitall_local > finished"); return DART_OK; }
int main (int argc, char **argv) { int nprocs = -1; int rank = -1; char processor_name[128]; int namelen = 128; int buf0[buf_size]; int buf1[buf_size]; MPI_Request aReq[2]; MPI_Status aStatus[2]; MPI_Status status; /* init */ MPI_Init (&argc, &argv); MPI_Comm_size (MPI_COMM_WORLD, &nprocs); MPI_Comm_rank (MPI_COMM_WORLD, &rank); MPI_Get_processor_name (processor_name, &namelen); printf ("(%d) is alive on %s\n", rank, processor_name); fflush (stdout); MPI_Barrier (MPI_COMM_WORLD); if (nprocs < 2) { printf ("not enough tasks\n"); } else { if (rank == 0) { memset (buf0, 0, buf_size); MPI_Send_init (buf0, buf_size, MPI_INT, 1, 0, MPI_COMM_WORLD, &aReq[0]); MPI_Recv_init (buf1, buf_size, MPI_INT, 1, 0, MPI_COMM_WORLD, &aReq[1]); MPI_Start (&aReq[0]); MPI_Start (&aReq[1]); MPI_Waitall (2, aReq, aStatus); memset (buf0, 1, buf_size); MPI_Startall (2, aReq); MPI_Waitall (2, aReq, aStatus); } else if (rank == 1) { memset (buf1, 1, buf_size); MPI_Recv_init (buf0, buf_size, MPI_INT, 0, 0, MPI_COMM_WORLD, &aReq[0]); MPI_Send_init (buf1, buf_size, MPI_INT, 0, 0, MPI_COMM_WORLD, &aReq[1]); MPI_Start (&aReq[0]); MPI_Start (&aReq[1]); MPI_Waitall (2, aReq, aStatus); memset (buf1, 0, buf_size); MPI_Startall (2, aReq); MPI_Waitall (2, aReq, aStatus); } } MPI_Barrier (MPI_COMM_WORLD); MPI_Request_free (&aReq[0]); MPI_Request_free (&aReq[1]); MPI_Finalize (); printf ("(%d) Finished normally\n", rank); }
int main (int argc, char** argv) { assert(argc == 2); int coordinate_steps = atoi(argv[1]); double length = 1.0; double total_time = 0.6; double sigma = 0.3; double lambda = 1.0; double boundary_value = 0.0; double coordinate_step = length / coordinate_steps; double time_step = sigma * coordinate_step / lambda; int time_steps = total_time / time_step; int rv = MPI_Init(&argc, &argv); assert(rv == MPI_SUCCESS); int size; rv = MPI_Comm_size(MPI_COMM_WORLD, &size); assert(rv == MPI_SUCCESS); int rank; rv = MPI_Comm_rank(MPI_COMM_WORLD, &rank); assert(rv == MPI_SUCCESS); MPI_Request request; MPI_Status status; std::vector<double> current_values(coordinate_steps + 1, 0.0); std::vector<double> previous_values(coordinate_steps + 1, 0.0); double start_time = MPI_Wtime(); if (rank == 0) { for (int s = 0; s <= coordinate_steps; s++) { double coordinate = static_cast<double>(s) / coordinate_steps * length; if (coordinate >= 0.1 && coordinate <= 0.2) { current_values[s] = 1.0; } } } if (rank == 0) { for (int s = 0; s <= coordinate_steps; s++) { rv = MPI_Isend(&current_values[s], 1, MPI_DOUBLE, Next(rank, size), s, MPI_COMM_WORLD, &request); assert(rv == MPI_SUCCESS); rv = MPI_Request_free(&request); assert(rv == MPI_SUCCESS); } } for (int i = (rank == 0) ? 1 : 0; i * size + rank <= time_steps; i++) { for (int s = 0; s <= coordinate_steps; s++) { rv = MPI_Recv(&previous_values[s], 1, MPI_DOUBLE, Prev(rank, size), s, MPI_COMM_WORLD, &status); assert(rv == MPI_SUCCESS); double left_value = (s == 0) ? boundary_value : current_values[s - 1]; double left_down_value = (s == 0) ? boundary_value : previous_values[s - 1]; double down_value = previous_values[s]; current_values[s] = CalculateValue(sigma, left_value, down_value, left_down_value); int receiver = (i * size + rank == time_steps) ? 0 : Next(rank, size); rv = MPI_Isend(&current_values[s], 1, MPI_DOUBLE, receiver, s, MPI_COMM_WORLD, &request); assert(rv == MPI_SUCCESS); rv = MPI_Request_free(&request); assert(rv == MPI_SUCCESS); } } if (rank == 0) { for (int s = 0; s <= coordinate_steps; s++) { int sender = time_steps % size; rv = MPI_Recv(&previous_values[s], 1, MPI_DOUBLE, sender, s, MPI_COMM_WORLD, &status); assert(rv == MPI_SUCCESS); } } if (rank == 0 && coordinate_steps <= 100) { for (int s = 0; s <= coordinate_steps; s++) { double coordinate = static_cast<double>(s) / coordinate_steps * length; printf("%lf %lf\n", coordinate, previous_values[s]); } } if (rank == 0) { double current_time = MPI_Wtime(); printf("%d %lf\n", size, current_time - start_time); } rv = MPI_Finalize(); assert(rv == MPI_SUCCESS); return EXIT_SUCCESS; }
int main(int argc, char *argv[]) { MPI_Request r; MPI_Status s; // int flag; int buf[10]; int rbuf[10]; int tag = 27; int dest = 0; int rank, size; MPI_Init( &argc, &argv ); MPI_Comm_size( MPI_COMM_WORLD, &size ); MPI_Comm_rank( MPI_COMM_WORLD, &rank ); /* Create a persistent send request */ // every rank prepares a send to rank 0 MPI_Send_init( buf, 10, MPI_INT, dest, tag, MPI_COMM_WORLD, &r ); /* Use that request */ if (rank == 0) { // allocate an array of 'size' requests for the irecvs MPI_Request *rr = (MPI_Request *)malloc(size * sizeof(MPI_Request)); for (int i=0; i<size; i++) { // rank 0 will receive from everyone MPI_Irecv( rbuf, 10, MPI_INT, i, tag, MPI_COMM_WORLD, &rr[i] ); } // rank 0 will send to rank 0 MPI_Start( &r ); // rank 0 sends to rank 0 MPI_Wait( &r, &s ); // rank 0 receives from everyone MPI_Waitall( size, rr, MPI_STATUSES_IGNORE ); free(rr); } else { // non-zero ranks will send to rank 0 MPI_Start( &r ); // non-zero ranks send to rank 0 MPI_Wait( &r, &s ); } MPI_Request_free( &r ); // if (rank == 0) // { // MPI_Request sr; // /* Create a persistent receive request */ // // rank 0 prepares the receive from everyone // MPI_Recv_init( rbuf, 10, MPI_INT, MPI_ANY_SOURCE, tag, MPI_COMM_WORLD, &r ); // // rank 0 will send to rank 0 // MPI_Isend( buf, 10, MPI_INT, 0, tag, MPI_COMM_WORLD, &sr ); // for (int i=0; i<size; i++) { // // rank 0 will receive from everyone // MPI_Start( &r ); // // rank 0 receives from everyone // MPI_Wait( &r, &s ); // } // // rank 0 sends to rank 0 // MPI_Wait( &sr, &s ); // MPI_Request_free( &r ); // } // else { // // non-zero ranks send to rank 0 // MPI_Send( buf, 10, MPI_INT, 0, tag, MPI_COMM_WORLD ); // } MPI_Finalize(); return 0; }
void _amps_wait_exchange(amps_Handle handle) { int i; int num; num = handle -> package -> num_send + handle -> package -> num_recv; if(num) { if(handle -> package -> num_recv) { for(i = 0; i < handle -> package -> num_recv; i++) { AMPS_CLEAR_INVOICE(handle -> package -> recv_invoices[i]); } } MPI_Waitall(num, handle -> package -> recv_requests, handle -> package -> status); } #ifdef AMPS_MPI_PACKAGE_LOWSTORAGE /* Needed by the DEC's; need better memory allocation strategy */ /* Need to uncommit packages when not in use */ /* amps_Commit followed by amps_UnCommit ????? */ if(handle -> package -> commited) { for(i = 0; i < handle -> package -> num_recv; i++) { if( handle -> package -> recv_invoices[i] -> mpi_type != MPI_DATATYPE_NULL ) { MPI_Type_free(&(handle -> package -> recv_invoices[i] -> mpi_type)); } MPI_Request_free(&(handle -> package -> recv_requests[i])); } for(i = 0; i < handle -> package -> num_send; i++) { if( handle -> package -> send_invoices[i] -> mpi_type != MPI_DATATYPE_NULL ) { MPI_Type_free(&handle -> package -> send_invoices[i] -> mpi_type); } MPI_Request_free(&(handle -> package -> send_requests[i])); } if(handle -> package -> recv_requests) { free(handle -> package -> recv_requests); handle -> package -> recv_requests = NULL; } if(handle -> package -> status) { free(handle -> package -> status); handle -> package -> status = NULL; } handle -> package -> commited = FALSE; } #endif }