/* portals developers' note: ARMCI fence is not guaranteed to be correct
 * unless PUT_START events are captured; PUT_ENDs do NOT guarantee order,
 * only PUT_STARTs do. */
void PARMCI_AllFence()
{
#if defined(CLUSTER)
    {
        int p;
        for (p = 0; p < armci_nproc; p++) PARMCI_Fence(p);
    }
#endif
    MEM_FENCE;
}
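/* Sketch of where AllFence sits in an application: scatter updates to many
 * processes, then fence everything before signalling completion.  This is a
 * minimal illustration, not code from this repository; the buffer layout and
 * the barrier placement are assumptions made for the example. */
void scatter_and_fence(double **bases, double *local, int nbytes)
{
    int p;
    for (p = 0; p < armci_nproc; p++)
        if (p != armci_me)
            PARMCI_Put(local, bases[p], nbytes, p);
    PARMCI_AllFence();   /* all prior puts are now remotely complete */
    PARMCI_Barrier();    /* everyone agrees the epoch is finished */
}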
/* profiling wrapper: only fences that leave the local cluster node are timed */
void ARMCI_Fence(int proc)
{
    if (!SAMECLUSNODE(proc)) armci_profile_start(ARMCI_PROF_FENCE);
    PARMCI_Fence(proc);
    if (!SAMECLUSNODE(proc)) armci_profile_stop(ARMCI_PROF_FENCE);
}
/* timing wrapper: accumulate total fence time in ARMCI_Fence_t */
void ARMCI_Fence(int proc)
{
    static double stime, etime;
    stime = TIME();
    PARMCI_Fence(proc);
    etime = TIME();
    ARMCI_Fence_t += etime - stime;
}
/** One-sided copy of data from the source to the destination.  Set a flag on
  * the remote process when the transfer is complete.
  *
  * @param[in] src   Source buffer
  * @param[in] dst   Destination buffer on proc
  * @param[in] size  Number of bytes to transfer
  * @param[in] flag  Address of the flag buffer on proc
  * @param[in] value Value to set the flag to
  * @param[in] proc  Process id of the target
  * @return          0 on success, non-zero on failure
  */
int PARMCI_Put_flag(void *src, void *dst, int size, int *flag, int value, int proc)
{
    /* TODO: This can be optimized with a more direct implementation,
     * especially in the case where RMA is ordered; in that case, the Fence
     * (Flush) is not necessary. */
    PARMCI_Put(src, dst, size, proc);
    PARMCI_Fence(proc);
    PARMCI_Put(&value, flag, sizeof(int), proc);
    return 0;
}
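/* Hypothetical producer/consumer sketch for PARMCI_Put_flag: rank 0 delivers
 * NELEM doubles to rank 1 and raises a flag; rank 1 spins on the flag before
 * reading.  The symmetric allocation, the flag placed after the data, and the
 * volatile spin are assumptions made for this example, not part of the source. */
#include <stdlib.h>
#include "armci.h"

#define NELEM 128

void put_flag_example(int me)
{
    void **ptrs = malloc(armci_nproc * sizeof(void *));
    /* allocate NELEM doubles plus one int flag on every process */
    PARMCI_Malloc(ptrs, NELEM * sizeof(double) + sizeof(int));
    volatile int *flag = (volatile int *)((double *)ptrs[me] + NELEM);
    *flag = 0;
    PARMCI_Barrier();

    if (me == 0) {
        double src[NELEM];
        int i;
        for (i = 0; i < NELEM; i++) src[i] = (double)i;
        int *rflag = (int *)((double *)ptrs[1] + NELEM);
        /* data put + fence + flag put, as implemented above */
        PARMCI_Put_flag(src, ptrs[1], NELEM * sizeof(double), rflag, 1, 1);
    } else if (me == 1) {
        while (*flag != 1) ;   /* spin until the producer's flag lands */
        /* ((double *)ptrs[1])[0..NELEM-1] now holds the data */
    }
    PARMCI_Barrier();
    PARMCI_Free(ptrs[me]);
    free((void *)ptrs);
}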
/** Blocking operation that transfers data from the calling process to the
  * memory of the remote process.  The data transfer is strided and blocking.
  * After the transfer completes, the given flag is set on the remote process.
  *
  * @param[in] src_ptr       Source starting address of the data block to put.
  * @param[in] src_stride_ar Source array of stride distances in bytes.
  * @param[in] dst_ptr       Destination starting address to put data.
  * @param[in] dst_stride_ar Destination array of stride distances in bytes.
  * @param[in] count         Block size in each dimension.  count[0] should be
  *                          the number of bytes of contiguous data in the
  *                          leading dimension.
  * @param[in] stride_levels The level of strides.
  * @param[in] flag          Location of the flag buffer
  * @param[in] value         Value to set the flag to
  * @param[in] proc          Remote process ID (destination).
  *
  * @return                  Zero on success, error code otherwise.
  */
int PARMCI_PutS_flag(void *src_ptr, int src_stride_ar[/*stride_levels*/],
                     void *dst_ptr, int dst_stride_ar[/*stride_levels*/],
                     int count[/*stride_levels+1*/], int stride_levels,
                     int *flag, int value, int proc)
{
    PARMCI_PutS(src_ptr, src_stride_ar, dst_ptr, dst_stride_ar, count,
                stride_levels, proc);
    PARMCI_Fence(proc);
    PARMCI_Put(&value, flag, sizeof(int), proc);
    return 0;
}
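/* Hypothetical 2-D example for PARMCI_PutS_flag: put an 8x8 sub-block of
 * doubles from a local 16x16 matrix into the same position of a remote 16x16
 * matrix, then raise rflag on the target.  Matrix sizes, pointer names, and
 * the flag address are assumptions for illustration. */
#define LDIM 16   /* leading dimension, in elements */

void puts_flag_example(double *local, double *remote, int *rflag, int proc)
{
    int src_stride[1] = { LDIM * (int)sizeof(double) };  /* row pitch, bytes */
    int dst_stride[1] = { LDIM * (int)sizeof(double) };
    int count[2]      = { 8 * (int)sizeof(double),       /* bytes per row  */
                          8 };                           /* number of rows */
    /* one stride level: count[0] contiguous bytes, repeated count[1] times */
    PARMCI_PutS_flag(local, src_stride, remote, dst_stride,
                     count, 1, rflag, 1, proc);
}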
void PARMCI_AllFence()
{
#if defined(ARMCIX)
    ARMCIX_AllFence();
#elif defined(BGML)
    BGML_WaitAll();
#elif defined(LAPI) || defined(CLUSTER)
    int p;
    for (p = 0; p < armci_nproc; p++) {
        PARMCI_Fence(p);
    }
#endif
    MEM_FENCE;
}
/*\ Send Request to Execute callback function in a global address space
 *  Arguments:
 *  h     - handle to the callback function
 *  p     - remote processor
 *  hdr   - header data - used to pack extra args for callback (local buffer)
 *  hlen  - size of header data < ARMCI_GPC_HLEN
 *  data  - bulk data passed to callback (local buffer)
 *  dlen  - length of bulk data
 *  rhdr  - ptr to reply header (return args from callback)
 *  rhlen - length of buffer to store reply header < ARMCI_GPC_HLEN
 *  rdata - ptr to where reply data from callback should be stored (local buf)
 *  rdlen - size of the buffer to store reply data
 *  nbh   - nonblocking handle
\*/
int ARMCI_Gpc_exec(int h, int p, void *hdr, int hlen, void *data, int dlen,
                   void *rhdr, int rhlen, void *rdata, int rdlen, gpc_hdl_t *nbh)
{
    int hnd = -h + GPC_OFFSET;
    int err = 0;
    armci_hdl_t *ahdl = (nbh ? &(nbh->ahdl) : NULL);

    if (hnd < 0 || hnd >= GPC_SLOTS)
        err += fprintf(stderr, "ARMCI_Gpc_exec: bad callback handle %d: %d\n",
                       hnd, GPC_SLOTS);
    if (!_table[hnd])
        err += fprintf(stderr, "ARMCI_Gpc_exec: NULL function %d\n", hnd);
    if (hlen < 0 || hlen >= ARMCI_Gpc_get_hlen())
        err += fprintf(stderr, "ARMCI_Gpc_exec: Invalid send header size %d %d\n",
                       hlen, ARMCI_Gpc_get_hlen());
    if (rhlen < 0 || rhlen >= ARMCI_Gpc_get_hlen())
        err += fprintf(stderr, "ARMCI_Gpc_exec: Invalid recv header size %d %d\n",
                       rhlen, ARMCI_Gpc_get_hlen());
    if (dlen < 0 || dlen >= ARMCI_Gpc_get_dlen())
        err += fprintf(stderr, "ARMCI_Gpc_exec: Invalid send data size %d %d\n",
                       dlen, ARMCI_Gpc_get_dlen());
    if (rdlen < 0 || rdlen >= ARMCI_Gpc_get_dlen())
        err += fprintf(stderr, "ARMCI_Gpc_exec: Invalid recv data size %d %d\n",
                       rdlen, ARMCI_Gpc_get_dlen());

    if (hlen > 0 && hdr == NULL)
        err += fprintf(stderr, "ARMCI_Gpc_exec: Null send header for non-zero header size %d\n", hlen);
    if (rhlen > 0 && rhdr == NULL)
        err += fprintf(stderr, "ARMCI_Gpc_exec: Null recv header for non-zero header size %d\n", rhlen);
    if (dlen > 0 && data == NULL)
        err += fprintf(stderr, "ARMCI_Gpc_exec: Null send data for non-zero data size %d\n", dlen);
    if (rdlen > 0 && rdata == NULL)
        err += fprintf(stderr, "ARMCI_Gpc_exec: Null recv data for non-zero data size %d\n", rdlen);

    if (p < 0 || p >= armci_nproc)
        err += fprintf(stderr, "ARMCI_Gpc_exec: Invalid target processor id %d (nproc=%d)\n",
                       p, armci_nproc);

    if (err)
        return FAIL;

    if (rhlen + rdlen == 0)
        armci_die("Zero reply header + data length not yet supported", 0);

    if (nbh)
        nbh->proc = p;

#if 1
    if (SAMECLUSNODE(p) && armci_nproc == 1) {
        int rhsize, rdsize;
        int (*func)();

        /* fprintf(stderr, "%d:: armci gpc exec. SAMECLUSNODE\n", armci_me); */

        func = _table[hnd];
        if (func(p, armci_me, hdr, hlen, data, dlen, rhdr, rhlen, &rhsize,
                 rdata, rdlen, &rdsize, GPC_INIT) != GPC_DONE) {
            func(p, armci_me, hdr, hlen, data, dlen, rhdr, rhlen, &rhsize,
                 rdata, rdlen, &rdsize, GPC_WAIT);
        }
#ifndef VAPI
        PARMCI_Fence(p);
#endif
        return 0;
    }
#endif

    /* fprintf(stderr, "%d:: armci gpc exec. invoking armci gpc\n", armci_me); */
    return armci_gpc(h, p, hdr, hlen, data, dlen, rhdr, rhlen, rdata, rdlen, ahdl);
}
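/* Hypothetical GPC usage sketch.  The callback signature is inferred from the
 * GPC_INIT/GPC_WAIT invocation above; ARMCI_Gpc_register is assumed to be the
 * registration call that returns the handle ARMCI_Gpc_exec expects.  Passing
 * nbh == NULL makes the call blocking. */
static int echo_cb(int to, int from, void *hdr, int hlen,
                   void *data, int dlen, void *rhdr, int rhlen, int *rhsize,
                   void *rdata, int rdlen, int *rdsize, int op)
{
    /* reply with a one-int header and no bulk data; other args are ignored */
    *(int *)rhdr = 42;
    *rhsize = sizeof(int);
    *rdsize = 0;
    return GPC_DONE;
}

void gpc_example(int target)
{
    int h = ARMCI_Gpc_register(echo_cb);
    int arg = 7, rhdr, rdata;
    ARMCI_Gpc_exec(h, target, &arg, sizeof(arg), NULL, 0,
                   &rhdr, sizeof(rhdr), &rdata, sizeof(rdata), NULL);
    /* rhdr now holds the callback's reply header (42) */
}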
/* plain pass-through wrapper for PARMCI_Fence */
void ARMCI_Fence(int proc)
{
    PARMCI_Fence(proc);
    return;
}
int main(int argc, char *argv[])
{
    int rank, size;
    int provided;

#if defined(__bgp__)
    MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
    assert(provided == MPI_THREAD_MULTIPLE);
#else
    MPI_Init_thread(&argc, &argv, MPI_THREAD_FUNNELED, &provided);
    //assert(provided>MPI_THREAD_SINGLE);
#endif

    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    assert(size > 1);

    PARMCI_Init_args(&argc, &argv);

    int w, maxwinsize = (argc > 1 ? atoi(argv[1]) : 1000000);

    if (rank == 0)
        printf("size = %d maxwinsize = %d doubles\n", size, maxwinsize);

    for (w = 1; w < maxwinsize; w *= 2) {
        double **window;
        window = (double **) PARMCI_Malloc_local(size * sizeof(double *));
        PARMCIX_Malloc_comm(MPI_COMM_WORLD, (void **) window, w * sizeof(double));
        for (int i = 0; i < w; i++)
            window[rank][i] = 0.0;

        double *buffer;
        buffer = (double *) PARMCI_Malloc_local(w * sizeof(double));

        PARMCIX_Barrier_comm(MPI_COMM_WORLD);

        /* correctness check: rank 0 puts to the odd ranks, fences, reads back */
        if (rank == 0)
            for (int t = 1; t < size; t += 2) {
                int bytes = w * sizeof(double);

                for (int i = 0; i < w; i++)
                    buffer[i] = (double) t;

                PARMCI_Put(buffer, window[t], bytes, t);
                PARMCI_Fence(t);

                for (int i = 0; i < w; i++)
                    buffer[i] = 0.0;

                PARMCI_Get(window[t], buffer, bytes, t);

                int errors = 0;
                for (int i = 0; i < w; i++)
                    if (buffer[i] != (double) t)
                        errors++;
                if (errors > 0)
                    for (int i = 0; i < w; i++)
                        printf("rank %d buffer[%d] = %lf \n", rank, i, buffer[i]);
            }

        PARMCIX_Barrier_comm(MPI_COMM_WORLD);

        /* only the odd ranks were targeted above, so only they validate */
        if (rank % 2 == 1) {
            int errors = 0;
            for (int i = 0; i < w; i++)
                if (window[rank][i] != (double) rank)
                    errors++;
            if (errors > 0)
                for (int i = 0; i < w; i++)
                    printf("rank %d window[%d][%d] = %lf \n", rank, rank, i, window[rank][i]);
        }

        PARMCIX_Barrier_comm(MPI_COMM_WORLD);

        /* bandwidth: time the put alone (local completion) and put+fence (remote) */
        if (rank == 0)
            for (int t = 1; t < size; t++) {
                int bytes = w * sizeof(double);
                double t0, t1, t2, dt1, dt2, bw1, bw2;

                for (int i = 0; i < w; i++)
                    buffer[i] = (double) (-1);

                t0 = MPI_Wtime();
                PARMCI_Put(buffer, window[t], bytes, t);
                t1 = MPI_Wtime();
                PARMCI_Fence(t);
                t2 = MPI_Wtime();

                dt1 = t1 - t0;
                dt2 = t2 - t0;
                bw1 = bytes / dt1;
                bw2 = bytes / dt2;
                bw1 /= 1000000.0;
                bw2 /= 1000000.0;
                printf("PARMCI_Put from rank %4d to rank %4d of %9d bytes - local: %lf s (%lf MB/s) remote: %lf s (%lf MB/s) \n",
                       0, t, bytes, dt1, bw1, dt2, bw2);
                fflush(stdout);
            }

        PARMCIX_Barrier_comm(MPI_COMM_WORLD);

        PARMCI_Free_local((void *) buffer);
        PARMCIX_Free_comm(MPI_COMM_WORLD, (void *) window[rank]);
        PARMCI_Free_local((void *) window);
    }

    PARMCI_Finalize();

    printf("%d: all done \n", rank);
    fflush(stdout);

    MPI_Finalize();

    return 0;
}
int PARMCI_WaitProc(int proc)
{
    fprintf(stderr, "WARNING: PARMCI_WaitProc(int proc) only synchronizes implicit nonblocking operations! \n");
    PARMCI_Fence(proc);
    return 0;
}
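/* Sketch of the pattern PARMCI_WaitProc is meant for: an implicit nonblocking
 * put (NULL handle, the implicit form in the classic ARMCI API) followed by a
 * per-process wait.  Buffer names are illustrative assumptions. */
void waitproc_example(double *src, double *remote_dst, int nbytes, int proc)
{
    PARMCI_NbPut(src, remote_dst, nbytes, proc, NULL); /* implicit handle */
    PARMCI_WaitProc(proc);  /* completes all implicit ops targeting proc */
}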