/** Perform atomic read-modify-write on the given integer or long location and * return the location's original value. * * \note ARMCI RMW operations are atomic with respect to other RMW operations, * but not with respect to other one-sided operations (get, put, acc, etc). * * @param[in] op Operation to be performed: * ARMCI_FETCH_AND_ADD (int) * ARMCI_FETCH_AND_ADD_LONG * ARMCI_SWAP (int) * ARMCI_SWAP_LONG * @param[out] ploc Location to store the original value. * @param[in] prem Location on which to perform atomic operation. * @param[in] value Value to add to remote location (ignored for swap). * @param[in] proc Process rank for the target buffer. */ int PARMCI_Rmw(int op, void *ploc, void *prem, int value, int proc) { int is_long; gmr_t *mreg; mreg = gmr_lookup(prem, proc); ARMCII_Assert_msg(mreg != NULL, "Invalid remote pointer"); if (op == ARMCI_SWAP_LONG || op == ARMCI_FETCH_AND_ADD_LONG) is_long = 1; else is_long = 0; if (op == ARMCI_SWAP || op == ARMCI_SWAP_LONG) { long swap_val_l; int swap_val_i; ARMCIX_Lock_hdl(mreg->rmw_mutex, 0, proc); PARMCI_Get(prem, is_long ? (void*) &swap_val_l : (void*) &swap_val_i, is_long ? sizeof(long) : sizeof(int), proc); PARMCI_Put(ploc, prem, is_long ? sizeof(long) : sizeof(int), proc); ARMCIX_Unlock_hdl(mreg->rmw_mutex, 0, proc); if (is_long) *(long*) ploc = swap_val_l; else *(int*) ploc = swap_val_i; } else if (op == ARMCI_FETCH_AND_ADD || op == ARMCI_FETCH_AND_ADD_LONG) { long fetch_val_l, new_val_l; int fetch_val_i, new_val_i; ARMCIX_Lock_hdl(mreg->rmw_mutex, 0, proc); PARMCI_Get(prem, is_long ? (void*) &fetch_val_l : (void*) &fetch_val_i, is_long ? sizeof(long) : sizeof(int), proc); if (is_long) new_val_l = fetch_val_l + value; else new_val_i = fetch_val_i + value; PARMCI_Put(is_long ? (void*) &new_val_l : (void*) &new_val_i, prem, is_long ? 
sizeof(long) : sizeof(int), proc); ARMCIX_Unlock_hdl(mreg->rmw_mutex, 0, proc); if (is_long) *(long*) ploc = fetch_val_l; else *(int*) ploc = fetch_val_i; } else { ARMCII_Error("invalid operation (%d)", op); } return 0; }
/** Contiguous one-sided put followed by a remote flag update.
  *
  * The payload is written with PARMCI_Put(), PARMCI_Fence() is called on the
  * target, and then the flag is written with a second PARMCI_Put().  The
  * return codes of those calls are not inspected.
  *
  * @param[in] src   Source buffer
  * @param[in] dst   Destination buffer on proc
  * @param[in] size  Number of bytes to transfer
  * @param[in] flag  Address of the flag buffer on proc
  * @param[in] value Value to set the flag to
  * @param[in] proc  Process id of the target
  * @return          Always 0.
  */
int PARMCI_Put_flag(void *src, void* dst, int size, int *flag, int value, int proc) {
  /* TODO: This can be optimized with a more direct implementation, especially in the
   *       case where RMA is ordered; in that case, the Fence (Flush) is not necessary. */
  int flag_value = value;

  /* Step 1: move the payload. */
  PARMCI_Put(src, dst, size, proc);

  /* Step 2: fence between the payload put and the flag put. */
  PARMCI_Fence(proc);

  /* Step 3: publish the flag. */
  PARMCI_Put(&flag_value, flag, sizeof(int), proc);

  return 0;
}
/* Accumulate doubles into memory on proc: dst[i] += scale * src[i].
 *
 * Implemented as get / local update / put using the staging buffer
 * l_state.acc_buf, bracketed by ARMCID_network_lock/unlock on proc.
 *
 * datatype  must be ARMCI_ACC_DBL (asserted)
 * scale     points to the double scale factor
 * src_ptr   local source data, read as doubles
 * dst_ptr   address on proc that is fetched, updated and written back
 * bytes     transfer size; must not exceed l_state.acc_buf_len (asserted).
 *           bytes/sizeof(double) elements are updated.
 * proc      target process
 *
 * Always returns 0.
 */
int PARMCI_Acc(int datatype, void *scale, void *src_ptr, void *dst_ptr, int bytes, int proc)
{
    double *staging = (double *)l_state.acc_buf;
    double *contrib = (double *)src_ptr;
    double factor = *(double *)scale;
    int idx, nelems;

    assert(bytes <= l_state.acc_buf_len);
    assert(datatype == ARMCI_ACC_DBL);
    assert(staging);

    ARMCID_network_lock(proc);

    /* Pull the current remote contents into the staging buffer. */
    PARMCI_Get(dst_ptr, staging, bytes, proc);

    nelems = bytes/sizeof(double);
    if (factor == 1.0) {
        /* Unit scale: plain element-wise addition. */
        for (idx = 0; idx < nelems; ++idx) {
            staging[idx] += contrib[idx];
        }
    }
    else {
        for (idx = 0; idx < nelems; ++idx) {
            staging[idx] += factor * contrib[idx];
        }
    }

    /* Write the updated values back to the target. */
    PARMCI_Put(staging, dst_ptr, bytes, proc);

    ARMCID_network_unlock(proc);

    return 0;
}
/* Profiling wrapper for contiguous put: brackets the PARMCI_Put call with
 * armci_profile_start_strided / armci_profile_stop_strided tagged
 * ARMCI_PROF_PUT and returns PARMCI_Put's result unchanged. */
int ARMCI_Put(void *src, void *dst, int bytes, int proc)
{
    int status;

    /* The byte count is passed by address together with 0 as the third-from-last
     * argument. NOTE(review): presumably a one-entry count array with zero
     * stride levels -- confirm against armci_profile_start_strided. */
    armci_profile_start_strided(&bytes, 0, proc, ARMCI_PROF_PUT);

    status = PARMCI_Put(src, dst, bytes, proc);

    armci_profile_stop_strided(ARMCI_PROF_PUT);

    return status;
}
/* Timing wrapper for contiguous put: samples TIME() before and after the
 * PARMCI_Put call, adds the elapsed interval to ARMCI_Put_t, and returns
 * PARMCI_Put's result unchanged. */
int ARMCI_Put(void *src, void *dst, int bytes, int proc)
{
    static double t_begin, t_end;
    int status;

    t_begin = TIME();
    status = PARMCI_Put(src, dst, bytes, proc);
    t_end = TIME();

    /* Accumulate the time spent inside this call. */
    ARMCI_Put_t += t_end - t_begin;

    return status;
}
/** Blocking operation that transfers data from the calling process to the
  * memory of the remote process.  The data transfer is strided and blocking.
  * After the transfer completes, the given flag is set on the remote process.
  *
  * The strided payload is written with PARMCI_PutS(), PARMCI_Fence() is
  * called on the target, and then the flag is written with PARMCI_Put().
  * The return codes of those calls are not inspected.
  *
  * @param[in] src_ptr         Source starting address of the data block to put.
  * @param[in] src_stride_ar   Source array of stride distances in bytes.
  * @param[in] dst_ptr         Destination starting address to put data.
  * @param[in] dst_stride_ar   Destination array of stride distances in bytes.
  * @param[in] count           Block size in each dimension. count[0] should be the
  *                            number of bytes of contiguous data in leading dimension.
  * @param[in] stride_levels   The level of strides.
  * @param[in] flag            Location of the flag buffer
  * @param[in] value           Value to set the flag to
  * @param[in] proc            Remote process ID (destination).
  *
  * @return                    Zero on success, error code otherwise.  The
  *                            current implementation always returns zero.
  */
int PARMCI_PutS_flag(void *src_ptr, int src_stride_ar[/*stride_levels*/],
                 void *dst_ptr, int dst_stride_ar[/*stride_levels*/],
                 int count[/*stride_levels+1*/], int stride_levels,
                 int *flag, int value, int proc) {

  PARMCI_PutS(src_ptr, src_stride_ar, dst_ptr, dst_stride_ar, count, stride_levels, proc);

  /* Fence between the payload put and the flag put. */
  PARMCI_Fence(proc);

  PARMCI_Put(&value, flag, sizeof(int), proc);

  /* Report success as zero, as documented above (the previous value of 1
   * read as an error code to callers that check the result). */
  return 0;
}
/* Release mutex number `mutex` on process `proc`.
 *
 * Increments the locally stored entry glob_mutex[proc].tickets[mutex] and
 * writes the incremented value with PARMCI_Put to
 * glob_mutex[proc].turn[mutex] on proc, followed by MEM_FENCE.
 * NOTE(review): this looks like the release half of a ticket lock -- confirm
 * against the matching lock routine.
 */
static void armci_generic_unlock(int mutex, int proc)
{
    int *turn_slot   = &glob_mutex[proc].turn[mutex];
    int *ticket_slot = &glob_mutex[proc].tickets[mutex];
    int nbytes = sizeof(int);

    /* advance to the ticket of the next process requesting this mutex */
    ++(*ticket_slot);

    /* publish the new ticket value into the turn slot */
    PARMCI_Put(ticket_slot, turn_slot, nbytes, proc);

    MEM_FENCE;
}
/* Public entry point for contiguous put: forwards all arguments unchanged
 * to PARMCI_Put and returns its result. */
int ARMCI_Put(void *src, void *dst, int size, int target)
{
  int status;

  status = PARMCI_Put(src, dst, size, target);

  return status;
}
/* Call-counting wrapper for contiguous put: bumps the parmci_calls counter,
 * then forwards all arguments unchanged to PARMCI_Put and returns its
 * result. */
int ARMCI_Put(void *src, void *dst, int size, int target)
{
  int status;

  ++parmci_calls;
  status = PARMCI_Put(src, dst, size, target);

  return status;
}
int main(int argc, char *argv[]) { int rank, size; int provided; #if defined(__bgp__) MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided); assert(provided==MPI_THREAD_MULTIPLE); #else MPI_Init_thread(&argc, &argv, MPI_THREAD_FUNNELED, &provided); //assert(provided>MPI_THREAD_SINGLE); #endif MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &size); assert( size > 1 ); PARMCI_Init_args(&argc, &argv); int w, maxwinsize = ( argc > 1 ? atoi(argv[1]) : 1000000 ); if ( rank == 0 ) printf( "size = %d maxwinsize = %d doubles\n", size, maxwinsize ); for ( w = 1 ; w < maxwinsize ; w *= 2 ) { double ** window; window = (double **) PARMCI_Malloc_local( size * sizeof(double *) ); PARMCIX_Malloc_comm(MPI_COMM_WORLD, (void **) window, w * sizeof(double) ); for (int i = 0; i < w; i++) window[rank][i] = 0.0; double * buffer; buffer = (double *) PARMCI_Malloc_local( w * sizeof(double) ); PARMCIX_Barrier_comm(MPI_COMM_WORLD); if (rank == 0) for (int t=1; t<size; t+=2) { int bytes = w * sizeof(double); for (int i = 0; i < w; i++) buffer[i] = (double)(t); PARMCI_Put( buffer, window[t], bytes, t ); PARMCI_Fence( t ); for (int i = 0; i < w; i++) buffer[i] = 0.0; PARMCI_Get( window[t], buffer, bytes, t ); int errors = 0; for (int i = 0; i < w; i++) if ( buffer[i] != (double)(t) ) errors++; if ( errors > 0 ) for (int i = 0; i < w; i++) printf("rank %d buffer[%d] = %lf \n", rank, i, buffer[i] ); } PARMCIX_Barrier_comm(MPI_COMM_WORLD); if (rank != 0) { int errors = 0; for (int i = 0; i < w; i++) if ( window[rank][i] != (double)(rank) ) errors++; if ( errors > 0 ) for (int i = 0; i < w; i++) printf("rank %d window[%d][%d] = %lf \n", rank, rank, i, window[rank][i] ); } PARMCIX_Barrier_comm(MPI_COMM_WORLD); if (rank == 0) for (int t=1; t<size; t++) { int bytes = w * sizeof(double); double t0, t1, t2, dt1, dt2, bw1, bw2; for (int i = 0; i < w; i++) buffer[i] = (double)(-1); t0 = MPI_Wtime(); PARMCI_Put( buffer, window[t], bytes, t ); t1 = MPI_Wtime(); PARMCI_Fence( t 
); t2 = MPI_Wtime(); dt1 = t1 - t0; dt2 = t2 - t0; bw1 = bytes / dt1; bw2 = bytes / dt2; bw1 /= 1000000.0; bw2 /= 1000000.0; printf("PARMCI_Put of from rank %4d to rank %4d of %9d bytes - local: %lf s (%lf MB/s) remote: %lf s (%lf MB/s) \n", t, 0, bytes, dt1, bw1, dt2, bw2); fflush(stdout); } PARMCIX_Barrier_comm(MPI_COMM_WORLD); PARMCI_Free_local( (void *) buffer ); PARMCIX_Free_comm(MPI_COMM_WORLD, (void *) window[rank] ); PARMCI_Free_local( (void *) window ); } PARMCI_Finalize(); printf("%d: all done \n", rank ); fflush(stdout); MPI_Finalize(); return 0; }
/* Handle-taking variant of contiguous put.  This implementation simply
 * forwards to PARMCI_Put and returns its result; the handle argument is
 * neither read nor written. */
int PARMCI_NbPut(void *src, void *dst, int bytes, int proc, armci_hdl_t *hdl)
{
  (void) hdl;   /* intentionally unused */

  return PARMCI_Put(src, dst, bytes, proc);
}
/* Process a vector (giov) descriptor array in chunks.
 *
 * Each pass calls armci_split_dscr_array() on the not-yet-processed tail of
 * darr, which reports in nlen how many descriptors to handle now and may
 * fill `extra` (remainder of a split set) and `save` (original value of a
 * split set).  The nlen descriptors are then handed to the transfer routine
 * selected at compile time (REMOTE_OP or local copy/accumulate).  The loop
 * stops when all len descriptors are consumed or a transfer returns non-zero.
 *
 * op         operation code; tested with ARMCI_ACC() and, under REMOTE_OP,
 *            compared against GET and PUT
 * scale      scale factor forwarded to the accumulate routines
 * darr       descriptor array; entries are overwritten in place while a set
 *            is being processed in pieces
 * len        number of descriptors in darr
 * proc       remote process
 * nb_handle  optional handle; when non-NULL its bufid is set to NB_MULTI
 *            whenever a set had to be split
 *
 * Returns the result of the last transfer call (0 if none was made).
 */
int armci_pack_vector(int op, void *scale, armci_giov_t darr[],int len,
                      int proc,armci_ihdl_t nb_handle)
{
    armci_giov_t extra; /* keeps data remainder of set to be processed in chunks */
    armci_giov_t save;  /* keeps original value of set to be processed in chunks */
    armci_giov_t *ndarr; /* points to first array element to be processed now */
    int rc=0, nlen;

    armcip_init_giov_t(&extra);
    armcip_init_giov_t(&save);
    ndarr = darr;

    save.src_ptr_array=NULL; /* indicates that save slot is empty */
    while(len){

       armci_split_dscr_array(ndarr, len, &extra, &nlen, &save);
#      if defined(REMOTE_OP)
       /* A problem will occur if len is 1 and nlen is 0. This corresponds to a
        * situation where the size of an individual element is found to exceed
        * BUFSIZE1. Treat this as a single transfer of contiguous data using
        * the standard PARMCI_Get/Put/Acc call */
       if (len == 1 && nlen == 0) {
         if(ARMCI_ACC(op))rc=PARMCI_Acc(op, scale, ndarr[0].src_ptr_array[0],
             ndarr[0].dst_ptr_array[0],ndarr[0].bytes, proc);
         else if(op == GET)rc=PARMCI_Get(ndarr[0].src_ptr_array[0],
             ndarr[0].dst_ptr_array[0],ndarr[0].bytes, proc);
         else if(op == PUT)rc=PARMCI_Put(ndarr[0].src_ptr_array[0],
             ndarr[0].dst_ptr_array[0],ndarr[0].bytes, proc);
         else armci_die("Unknown op in armci_pack_vector",op);
         nlen = 1; /* the single descriptor was handled in full */
       } else {
         rc = armci_rem_vector(op, scale, ndarr,nlen,proc,0,nb_handle);
       }
#      else
       if(ARMCI_ACC(op))rc=armci_acc_vector(op,scale,ndarr,nlen,proc);
       else rc = armci_copy_vector(op,ndarr,nlen,proc);
#      endif
       if(rc) break;

       /* non-NULL pointer indicates that set was split */
       if(extra.src_ptr_array){

          if(nb_handle) {
             nb_handle->bufid = NB_MULTI; /*can be set multiple times here; but not reset here*/
          }

          ndarr[nlen-1]=extra; /* set the pointer to remainder of last set */
          nlen--; /* since last set not done in full need to process it again */

       }else{

          if(save.src_ptr_array){
             ndarr[0]=save;
             save.src_ptr_array=NULL; /* indicates that save slot is empty */
          }

          /* no split and nothing processed: no progress is possible */
          if(nlen == 0)
            armci_die("vector packetization problem:buffer too small",BUFSIZE1);
       }

       /* advance past the descriptors that were completed in this pass */
       len -=nlen;
       ndarr +=nlen;
    }

    return rc;
}