Пример #1
0
/** Atomic read-modify-write on a remote int or long.
  *
  * The target word is fetched, updated according to \c op, written back, and
  * its previous contents are handed to the caller through \c ploc.  The whole
  * get/put sequence runs while holding the RMW mutex of the memory region that
  * contains \c prem.
  *
  * \note ARMCI RMW operations are atomic only with respect to other RMW
  * operations; they are not atomic with respect to other one-sided
  * operations (get, put, acc, etc).
  *
  * @param[in]     op    One of:
  *                        ARMCI_FETCH_AND_ADD      (int)
  *                        ARMCI_FETCH_AND_ADD_LONG (long)
  *                        ARMCI_SWAP               (int)
  *                        ARMCI_SWAP_LONG          (long)
  *                      Any other value is reported through ARMCII_Error.
  * @param[in,out] ploc  Receives the original remote value.  For swap
  *                      operations it also supplies the value written to the
  *                      remote location.
  * @param[in]     prem  Remote location to operate on.
  * @param[in]     value Increment for fetch-and-add (ignored for swap).
  * @param[in]     proc  Process rank owning the target buffer.
  * @return              Always zero.
  */
int PARMCI_Rmw(int op, void *ploc, void *prem, int value, int proc) {
  const int is_swap = (op == ARMCI_SWAP          || op == ARMCI_SWAP_LONG);
  const int is_fadd = (op == ARMCI_FETCH_AND_ADD || op == ARMCI_FETCH_AND_ADD_LONG);
  const int wide    = (op == ARMCI_SWAP_LONG     || op == ARMCI_FETCH_AND_ADD_LONG);
  const int nbytes  = wide ? (int) sizeof(long) : (int) sizeof(int);

  /* Scratch storage wide enough for either operand type. */
  union { long l; int i; } old_val, new_val;
  gmr_t *mreg;

  mreg = gmr_lookup(prem, proc);
  ARMCII_Assert_msg(mreg != NULL, "Invalid remote pointer");

  if (!is_swap && !is_fadd) {
    ARMCII_Error("invalid operation (%d)", op);
    return 0;
  }

  ARMCIX_Lock_hdl(mreg->rmw_mutex, 0, proc);

  /* Fetch the current remote contents. */
  PARMCI_Get(prem, (void*) &old_val, nbytes, proc);

  if (is_swap) {
    /* The caller's buffer still holds the value to install remotely. */
    PARMCI_Put(ploc, prem, nbytes, proc);
  } else {
    if (wide)
      new_val.l = old_val.l + value;
    else
      new_val.i = old_val.i + value;

    PARMCI_Put((void*) &new_val, prem, nbytes, proc);
  }

  ARMCIX_Unlock_hdl(mreg->rmw_mutex, 0, proc);

  /* Report the pre-update value back to the caller. */
  if (wide)
    *(long*) ploc = old_val.l;
  else
    *(int*) ploc = old_val.i;

  return 0;
}
Пример #2
0
/**
 * Accumulate a contiguous block of doubles into remote memory:
 * dst[i] += scale * src[i].
 *
 * Implemented as get / local update / put through the staging buffer
 * l_state.acc_buf, with the whole sequence bracketed by
 * ARMCID_network_lock / ARMCID_network_unlock on the target process.
 *
 * @param datatype  Must be ARMCI_ACC_DBL (asserted).
 * @param scale     Pointer to the double scale factor.
 * @param src_ptr   Local source buffer of doubles.
 * @param dst_ptr   Remote destination buffer on proc.
 * @param bytes     Transfer size in bytes; must not exceed
 *                  l_state.acc_buf_len (asserted).
 * @param proc      Target process rank.
 * @return          Always zero.
 */
int   PARMCI_Acc(int datatype, void *scale,
                 void *src_ptr,
                 void *dst_ptr,
                 int bytes, int proc)
{
    double *get_buf = (double *)l_state.acc_buf;
    const double *addend = (const double *)src_ptr;
    const double factor = *(double *)scale;
    int idx, count;

    assert(bytes <= l_state.acc_buf_len);
    assert(datatype == ARMCI_ACC_DBL);
    assert(get_buf);

    ARMCID_network_lock(proc);

    /* Stage the current remote contents locally. */
    PARMCI_Get(dst_ptr, get_buf, bytes, proc);

    count = bytes/sizeof(double);
    if (factor == 1.0) {
        /* Unit scale: plain element-wise addition. */
        for (idx = 0; idx < count; ++idx)
            get_buf[idx] += addend[idx];
    }
    else {
        for (idx = 0; idx < count; ++idx)
            get_buf[idx] += factor * addend[idx];
    }

    /* Write the updated values back to the target. */
    PARMCI_Put(get_buf, dst_ptr, bytes, proc);

    ARMCID_network_unlock(proc);

    return 0;
}
Пример #3
0
/**
 * Acquire ticket mutex number `mutex` hosted on process `proc`.
 *
 * The caller first obtains its ticket from register_in_mutex_queue(), then
 * repeatedly fetches the "now serving" counter (glob_mutex[proc].turn[mutex])
 * with PARMCI_Get until it equals that ticket.  Between unsuccessful polls a
 * linearly growing busy-wait is inserted.  The ticket is finally recorded in
 * glob_mutex[proc].tickets[mutex].
 *
 * Calls armci_die2() if the counter is observed to be past the caller's
 * ticket.
 *
 * @param mutex  Index of the mutex on the hosting process.
 * @param proc   Rank of the process hosting the mutex.
 */
static void armci_generic_lock(int mutex, int proc)
{
    int i, myturn, factor=0, len=sizeof(int);
    int  *mutex_ticket, next_in_line;

    mutex_ticket= glob_mutex[proc].turn + mutex;
    myturn = register_in_mutex_queue(mutex, proc);

    /* code to reduce cost of unlocking mutex on the same SMP node goes here
     * lockinfo_node[me].ticket = mutex_ticket;
     * lockinfo_node[me].mutex  = mutex;
     */

    _dummy_work_ = 0.; /* must be global to fool the compiler */
    for (;;) {

        PARMCI_Get(mutex_ticket, &next_in_line, len, proc);

        /* Our turn: stop immediately instead of paying one more backoff
         * delay after the lock has already been observed as ours. */
        if(next_in_line == myturn)
            break;

        if(next_in_line > myturn)
            armci_die2("armci: problem with tickets",myturn,next_in_line);

        /* apply a linear backoff delay before retrying  */
        for(i=0; i<  SPINMAX * factor; i++) _dummy_work_ += 1.;

        factor += 1;
    }

    glob_mutex[proc].tickets[mutex] = myturn; /* save ticket value */
}
Пример #4
0
/**
 * Profiled front end for PARMCI_Get.
 *
 * Brackets the underlying get with the strided-profile start/stop hooks
 * tagged ARMCI_PROF_GET and returns whatever PARMCI_Get returned.
 *
 * @param src    Source address passed through to PARMCI_Get.
 * @param dst    Destination address passed through to PARMCI_Get.
 * @param bytes  Number of bytes to transfer.
 * @param proc   Process rank passed through to PARMCI_Get.
 * @return       Result of PARMCI_Get.
 */
int ARMCI_Get(void *src, void *dst, int bytes, int proc)
{
    int status;

    armci_profile_start_strided(&bytes, 0, proc, ARMCI_PROF_GET);

    status = PARMCI_Get(src, dst, bytes, proc);

    armci_profile_stop_strided(ARMCI_PROF_GET);

    return status;
}
Пример #5
0
/**
 * Timed front end for PARMCI_Get.
 *
 * Measures the duration of the underlying get with TIME() and adds it to the
 * running total ARMCI_Get_t.
 *
 * @param src    Source address passed through to PARMCI_Get.
 * @param dst    Destination address passed through to PARMCI_Get.
 * @param bytes  Number of bytes to transfer.
 * @param proc   Process rank passed through to PARMCI_Get.
 * @return       Result of PARMCI_Get.
 */
int ARMCI_Get(void *src, void *dst, int bytes, int proc)
{
    /* Timestamps are per-call values; they need no static storage. */
    double stime, etime;
    int rval;

    stime = TIME();
    rval = PARMCI_Get(src, dst, bytes, proc);
    etime = TIME();

    ARMCI_Get_t += etime - stime;
    return rval;
}
Пример #6
0
/**
 * Public entry point that forwards unchanged to PARMCI_Get.
 *
 * @param src     Source address.
 * @param dst     Destination address.
 * @param size    Number of bytes to transfer.
 * @param target  Process rank passed through to PARMCI_Get.
 * @return        Result of PARMCI_Get.
 */
int ARMCI_Get(void *src, void *dst, int size, int target) {
  const int rc = PARMCI_Get(src, dst, size, target);

  return rc;
}
Пример #7
0
/**
 * Call-counting front end for PARMCI_Get.
 *
 * Bumps the parmci_calls counter, then forwards to PARMCI_Get.
 *
 * @param src     Source address.
 * @param dst     Destination address.
 * @param size    Number of bytes to transfer.
 * @param target  Process rank passed through to PARMCI_Get.
 * @return        Result of PARMCI_Get.
 */
int ARMCI_Get(void *src, void *dst, int size, int target) {
    int rc;

    ++parmci_calls;
    rc = PARMCI_Get(src, dst, size, target);

    return rc;
}
Пример #8
0
/**
 * Put/get correctness check and put-bandwidth measurement for PARMCI.
 *
 * For every window size w = 1, 2, 4, ... below maxwinsize (in doubles):
 *   1. rank 0 puts a buffer filled with the target's rank number into each
 *      other rank's window, fences, reads it back with a get and compares;
 *   2. every non-zero rank checks its own window holds its rank number;
 *   3. rank 0 times a put ("local") and put+fence ("remote") to each other
 *      rank and prints the elapsed time and bandwidth.
 *
 * Usage: <prog> [maxwinsize]      (default 1000000)
 * At least two MPI processes are required (asserted).
 *
 * @return 0 on completion.
 */
int main(int argc, char *argv[])
{
    int rank, size;
    int provided;

#if defined(__bgp__)
    MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
    assert(provided==MPI_THREAD_MULTIPLE);
#else
    /* The granted thread level is not checked on this path. */
    MPI_Init_thread(&argc, &argv, MPI_THREAD_FUNNELED, &provided);
#endif
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    assert( size > 1 );

    PARMCI_Init_args(&argc, &argv);

    int w, maxwinsize = ( argc > 1 ? atoi(argv[1]) : 1000000 );

    if ( rank == 0 ) printf( "size = %d maxwinsize = %d doubles\n", size, maxwinsize );

    for ( w = 1 ; w < maxwinsize ; w *= 2 )
    {
        /* window[r] is the base address of rank r's w-double window. */
        double ** window;
        window  = (double **) PARMCI_Malloc_local( size * sizeof(double *) );
        PARMCIX_Malloc_comm(MPI_COMM_WORLD, (void **) window, w * sizeof(double) );
        for (int i = 0; i < w; i++) window[rank][i] = 0.0;

        /* Local staging buffer used by rank 0 for puts and gets. */
        double * buffer;
        buffer = (double *) PARMCI_Malloc_local(  w * sizeof(double) );

        PARMCIX_Barrier_comm(MPI_COMM_WORLD);

        /* Phase 1: put / fence / get round trip to every other rank.
         * Every non-zero rank must be written here because all of them
         * verify their window in phase 2; skipping ranks would make the
         * untouched ones report spurious errors. */
        if (rank == 0)
            for (int t=1; t<size; t++)
            {
                int bytes = w * sizeof(double);

                for (int i = 0; i < w; i++) buffer[i] = (double)(t);

                PARMCI_Put( buffer, window[t], bytes, t );
                PARMCI_Fence( t );

                /* Clear the buffer so the get result cannot be stale data. */
                for (int i = 0; i < w; i++) buffer[i] = 0.0;

                PARMCI_Get( window[t], buffer, bytes, t );

                int errors = 0;

                for (int i = 0; i < w; i++) 
                    if ( buffer[i] != (double)(t) ) errors++;

                if ( errors > 0 )
                    for (int i = 0; i < w; i++) 
                        printf("rank %d buffer[%d] = %lf \n", rank, i, buffer[i] );
            }

        PARMCIX_Barrier_comm(MPI_COMM_WORLD);

        /* Phase 2: each target checks the data that landed in its window. */
        if (rank != 0)
        {
           int errors = 0;

           for (int i = 0; i < w; i++) 
               if ( window[rank][i] != (double)(rank) ) errors++;

           if ( errors > 0 )
               for (int i = 0; i < w; i++) 
                   printf("rank %d window[%d][%d] = %lf \n", rank, rank, i, window[rank][i] );
        }

        PARMCIX_Barrier_comm(MPI_COMM_WORLD);

        /* Phase 3: time the put alone and the put followed by a fence. */
        if (rank == 0)
            for (int t=1; t<size; t++)
            {
                int bytes = w * sizeof(double);

                double t0, t1, t2, dt1, dt2, bw1, bw2;

                for (int i = 0; i < w; i++) buffer[i] = (double)(-1);

                t0 = MPI_Wtime();
                PARMCI_Put( buffer, window[t], bytes, t );
                t1 = MPI_Wtime();
                PARMCI_Fence( t );
                t2 = MPI_Wtime();

                dt1  = t1 - t0;
                dt2  = t2 - t0;
                bw1  = bytes / dt1;
                bw2  = bytes / dt2;
                bw1 /= 1000000.0;
                bw2 /= 1000000.0;
                /* The put originates on rank 0 and targets rank t. */
                printf("PARMCI_Put from rank %4d to rank %4d of %9d bytes - local: %lf s (%lf MB/s) remote: %lf s (%lf MB/s) \n",
                       0, t, bytes, dt1, bw1, dt2, bw2);
                fflush(stdout);
            }

        PARMCIX_Barrier_comm(MPI_COMM_WORLD);

        PARMCI_Free_local( (void *) buffer );

        PARMCIX_Free_comm(MPI_COMM_WORLD, (void *) window[rank] );
        PARMCI_Free_local( (void *) window );
    }

    PARMCI_Finalize();

    printf("%d: all done \n", rank );
    fflush(stdout);

    MPI_Finalize();

    return 0;
}
Пример #9
0
/**
 * "Non-blocking" get implemented as a blocking PARMCI_Get.
 *
 * The transfer has been issued through PARMCI_Get by the time this function
 * returns; the handle is not touched.
 *
 * @param src    Source address passed through to PARMCI_Get.
 * @param dst    Destination address passed through to PARMCI_Get.
 * @param bytes  Number of bytes to transfer.
 * @param proc   Process rank passed through to PARMCI_Get.
 * @param hdl    Request handle (unused by this implementation).
 * @return       Result of PARMCI_Get, so failures reach the caller.
 */
int   PARMCI_NbGet(void *src, void *dst, int bytes, int proc, armci_hdl_t *hdl)
{
    int rc;

    (void) hdl; /* no outstanding request is recorded */

    rc = PARMCI_Get(src, dst, bytes, proc);
    return rc;
}
Пример #10
0
/*
 * Process an array of vector (giov) descriptors in chunks.
 *
 * Each pass calls armci_split_dscr_array() to obtain nlen, the number of
 * leading descriptors to transfer in this pass, issues the transfer for
 * them, and then advances past the descriptors that were fully handled.
 * The loop ends when all len descriptors are consumed or a transfer returns
 * a non-zero code.
 *
 * NOTE: darr is modified in place while a descriptor is being processed in
 * pieces (an element is overwritten with `extra`, and later restored from
 * `save`).
 *
 * op        - operation code; tested with ARMCI_ACC(op) and, under
 *             REMOTE_OP, compared against GET / PUT
 * scale     - scale factor forwarded to the accumulate routines
 * darr      - array of descriptors to process
 * len       - number of descriptors in darr
 * proc      - target process rank
 * nb_handle - optional non-blocking handle; may be NULL.  Its bufid is set
 *             to NB_MULTI whenever a descriptor had to be split.
 *
 * Returns 0 when every pass succeeded, otherwise the first non-zero code
 * returned by a transfer routine.
 */
int armci_pack_vector(int op, void *scale, armci_giov_t darr[],int len,
                      int proc,armci_ihdl_t nb_handle)
{
armci_giov_t extra; /* keeps data remainder of set to be processed in chunks */
armci_giov_t save;  /* keeps original value of set to be processed in chunks */
armci_giov_t *ndarr; /* points to first array element to be processed now */
int rc=0, nlen, count=0; /* NOTE(review): count is incremented but never read here */

    armcip_init_giov_t(&extra);
    armcip_init_giov_t(&save);
    ndarr = darr;

    save.src_ptr_array=NULL; /* indicates that save slot is empty */
    while(len){

       /* decide how many descriptors (nlen) to handle in this pass; may fill
        * in extra and save when the last one has to be split */
       armci_split_dscr_array(ndarr, len, &extra, &nlen, &save); 
#  if defined(REMOTE_OP) 
       /* A problem will occur if len is 1 and nlen is 0. This corresponds to a
        * situation where the size of an individual element is found to exceed
        * BUFSIZE1. Treat this as a single transfer of contiguous data using
        * the standard PARMCI_Get/Put/Acc call */
       if (len == 1 && nlen == 0) {
         if(ARMCI_ACC(op))rc=PARMCI_Acc(op, scale, ndarr[0].src_ptr_array[0],
            ndarr[0].dst_ptr_array[0],ndarr[0].bytes, proc);
         else if(op == GET)rc=PARMCI_Get(ndarr[0].src_ptr_array[0],
            ndarr[0].dst_ptr_array[0],ndarr[0].bytes, proc);
         else if(op == PUT)rc=PARMCI_Put(ndarr[0].src_ptr_array[0],
            ndarr[0].dst_ptr_array[0],ndarr[0].bytes, proc);
         else armci_die("Unknown op in armci_pack_vector",op);
         nlen = 1; /* count the descriptor as consumed so the loop advances */
       } else {
         rc = armci_rem_vector(op, scale, ndarr,nlen,proc,0,nb_handle);
       }
#  else
       /* without REMOTE_OP: accumulate or plain copy of the first nlen descriptors */
       if(ARMCI_ACC(op))rc=armci_acc_vector(op,scale,ndarr,nlen,proc);
       else rc = armci_copy_vector(op,ndarr,nlen,proc);
#  endif
       if(rc) break; /* stop at the first failing transfer */

       /* non-NULL pointer indicates that set was split */
       if(extra.src_ptr_array){

	 if(nb_handle) {
	   nb_handle->bufid = NB_MULTI; /*can be set multiple times here; but not reset here*/
	 }

          ndarr[nlen-1]=extra; /* set the pointer to remainder of last set */
          nlen--; /* since last set not done in full need to process it again */

       }else{

          /* a previously split descriptor is now complete: restore its
           * original contents in the caller's array */
          if(save.src_ptr_array){
             ndarr[0]=save;
             save.src_ptr_array=NULL; /* indicates that save slot is empty */
          }

          /* nothing was split and nothing was processed: no progress possible */
          if(nlen == 0)
            armci_die("vector packetization problem:buffer too small",BUFSIZE1);
       }

       /* advance past the descriptors fully handled in this pass */
       len -=nlen;
       ndarr +=nlen;
       count ++;
    }

    return rc;
}