Example #1
/*
    Portals developers' note:
    ARMCI fence is not guaranteed to be correct unless PUT_START events
    are captured; PUT_END events do NOT guarantee ordering, only
    PUT_STARTs do.
*/
void PARMCI_AllFence()
{
#if defined(CLUSTER)
     {
         int p;
         /* All-fence on clusters: fence each process in turn. */
         for (p = 0; p < armci_nproc; p++)
             PARMCI_Fence(p);
     }
#endif
     MEM_FENCE;
}
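A hedged usage sketch may help fix the intent: PARMCI_AllFence is what makes a batch of puts to many targets remotely complete before other processes are told to read. Everything in the helper below is hypothetical except the ARMCI calls themselves; armci_me, armci_nproc, and armci_msg_barrier() are the usual ARMCI identity globals and message-layer barrier.

void scatter_and_publish(void *local, void **remote_buf, int bytes)
{
    int t;

    /* Issue a put to every other process; remote_buf[t] is assumed to
       point at registered memory on process t (e.g. from ARMCI_Malloc). */
    for (t = 0; t < armci_nproc; t++)
        if (t != armci_me)
            PARMCI_Put(local, remote_buf[t], bytes, t);

    PARMCI_AllFence();    /* all outstanding puts complete at their targets */
    armci_msg_barrier();  /* readers may now access remote_buf safely */
}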
Example #2
void ARMCI_Fence(int proc)
{
    /* Profile only fences that target a remote cluster node. */
    if (!SAMECLUSNODE(proc))
        armci_profile_start(ARMCI_PROF_FENCE);
    PARMCI_Fence(proc);
    if (!SAMECLUSNODE(proc))
        armci_profile_stop(ARMCI_PROF_FENCE);
}
Example #3
void ARMCI_Fence(int proc)
{

    /* Time the fence and accumulate into the global ARMCI_Fence_t. */
    double stime, etime;
    stime = TIME();
    PARMCI_Fence(proc);
    etime = TIME();
    ARMCI_Fence_t += etime - stime;
}
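For context, a minimal sketch of how the accumulator above might be declared and reported; TIME() and ARMCI_Fence_t come from the example, while report_fence_time() is a hypothetical helper (assumes <stdio.h>).

double ARMCI_Fence_t = 0.0;   /* cumulative seconds spent in ARMCI_Fence */

/* Hypothetical reporting helper, called once at shutdown. */
void report_fence_time(void)
{
    printf("rank %d: total ARMCI_Fence time = %f s\n", armci_me, ARMCI_Fence_t);
}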
Example #4
/** One-sided copy of data from the source to the destination.  Set a flag on
  * the remote process when the transfer is complete.
  *
  * @param[in] src   Source buffer
  * @param[in] dst   Destination buffer on proc
  * @param[in] size  Number of bytes to transfer
  * @param[in] flag  Address of the flag buffer on proc
  * @param[in] value Value to set the flag to
  * @param[in] proc  Process id of the target
  * @return          0 on success, non-zero on failure
  */
int PARMCI_Put_flag(void *src, void* dst, int size, int *flag, int value, int proc) {
  /* TODO: This can be optimized with a more direct implementation, especially in the
   *       case where RMA is ordered; in that case, the Fence (Flush) is not necessary. */
  PARMCI_Put(src, dst, size, proc);
  PARMCI_Fence(proc);
  PARMCI_Put(&value, flag, sizeof(int), proc);

  return 0;
}
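As a usage illustration, here is a hedged producer/consumer sketch (hypothetical, not from the source): the producer puts a payload and sets the remote flag via PARMCI_Put_flag; the consumer spins on its local copy of the flag. A 2-process run and symmetric allocation through ARMCI_Malloc are assumed.

#define NELEM 1024

/* Hypothetical sketch.  ARMCI_Malloc fills ptr[p] with the address of
 * the allocation on process p; a process's own slot is local memory it
 * can poll directly. */
void produce_consume(void)
{
    void *data[2], *flag[2];
    ARMCI_Malloc(data, NELEM * sizeof(double));
    ARMCI_Malloc(flag, sizeof(int));
    *(volatile int *)flag[armci_me] = 0;
    armci_msg_barrier();

    if (armci_me == 0) {
        double src[NELEM];
        for (int i = 0; i < NELEM; i++) src[i] = (double)i;
        /* payload first, fence, then flag := 1 on process 1 */
        PARMCI_Put_flag(src, data[1], NELEM * sizeof(double),
                        (int *)flag[1], 1, 1);
    } else {
        /* spin until the flag lands; data[1] (local on rank 1) then
         * holds the payload */
        while (*(volatile int *)flag[1] == 0)
            ;
    }
}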
Example #5
/** Blocking operation that transfers data from the calling process to the
  * memory of the remote process.  The data transfer is strided and blocking.
  * After the transfer completes, the given flag is set on the remote process.
  *
  * @param[in] src_ptr         Source starting address of the data block to put.
  * @param[in] src_stride_ar   Source array of stride distances in bytes.
  * @param[in] dst_ptr         Destination starting address to put data.
  * @param[in] dst_stride_ar   Destination array of stride distances in bytes.
  * @param[in] count           Block size in each dimension. count[0] should be the
  *                            number of bytes of contiguous data in leading dimension.
  * @param[in] stride_levels   The level of strides.
  * @param[in] flag            Location of the flag buffer
  * @param[in] value           Value to set the flag to
  * @param[in] proc            Remote process ID (destination).
  *
  * @return                    Zero on success, error code otherwise.
  */
int PARMCI_PutS_flag(void *src_ptr, int src_stride_ar[/*stride_levels*/],
                 void *dst_ptr, int dst_stride_ar[/*stride_levels*/], 
                 int count[/*stride_levels+1*/], int stride_levels, 
                 int *flag, int value, int proc) {

  PARMCI_PutS(src_ptr, src_stride_ar, dst_ptr, dst_stride_ar, count, stride_levels, proc);
  PARMCI_Fence(proc);
  PARMCI_Put(&value, flag, sizeof(int), proc);

  return 0;
}
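To make the stride parameters concrete, a hypothetical sketch: putting an 8x8 block of doubles out of a 100x100 row-major matrix uses one stride level, with count[0] holding the contiguous bytes per row and count[1] the number of rows. The addresses src, dst, and flag are assumed to be valid registered remote addresses; nothing here comes from the source.

void put_block_with_flag(double *src, double *dst, int *flag, int proc)
{
    int src_stride = 100 * sizeof(double);  /* bytes between row starts */
    int dst_stride = 100 * sizeof(double);
    int count[2];
    count[0] = 8 * sizeof(double);          /* contiguous bytes per row */
    count[1] = 8;                           /* number of rows           */

    PARMCI_PutS_flag(src, &src_stride, dst, &dst_stride,
                     count, 1 /* stride_levels */, flag, 1, proc);
}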
Example #6
void PARMCI_AllFence()
{
#if defined(ARMCIX)
    ARMCIX_AllFence ();
#elif defined(BGML)
    BGML_WaitAll();
#elif defined(LAPI) || defined(CLUSTER)
    int p;

    /* No native all-fence on these ports: fence each process in turn. */
    for (p = 0; p < armci_nproc; p++) {
        PARMCI_Fence(p);
    }
#endif
    MEM_FENCE;
}
Example #7
File: gpc.c Project: bcernohous/ga
/*\ Send Request to Execute callback function in a global address space 
 *  Arguments:
 *  f     - handle to the callback function
 *  p     - remote processor
 *  hdr   - header data - used to pack extra args for callback (local buffer) 
 *  hlen  - size of header data < ARMCI_GPC_HLEN
 *  data  - bulk data passed to callback (local buffer)
 *  dlen  - length of bulk data
 *  rhdr  - ptr to reply header (return args from callback)
 *  rhlen - length of buffer to store reply header < ARMCI_GPC_HLEN  
 *  rdata - ptr to where reply data from callback should be stored (local buf)
 *  rdlen - size of the buffer to store reply data  
 *  nbh   - nonblocking handle
 *  
\*/
int ARMCI_Gpc_exec(int h, int p, void  *hdr, int hlen,  void *data,  int dlen,
		   void *rhdr, int rhlen, void *rdata, int rdlen, gpc_hdl_t* nbh)
{
  int hnd = -h + GPC_OFFSET;
  int err = 0;
  armci_hdl_t *ahdl = (nbh ? &(nbh->ahdl): NULL);

  if(hnd <0 || hnd>= GPC_SLOTS) 
    err += fprintf(stderr, "ARMCI_Gpc_exec: bad callback handle %d: %d\n",hnd,GPC_SLOTS);
  if(!_table[hnd]) 
    err += fprintf(stderr, "ARMCI_Gpc_exec: NULL function %d",hnd);

  if(hlen<0 || hlen>=ARMCI_Gpc_get_hlen())
    err += fprintf(stderr, "ARMCI_Gpc_exec: Invalid send header size %d %d\n", hlen, ARMCI_Gpc_get_hlen());
  if(rhlen<0 || rhlen>=ARMCI_Gpc_get_hlen())
    err += fprintf(stderr, "ARMCI_Gpc_exec: Invalid recv header size %d %d\n", rhlen, ARMCI_Gpc_get_hlen());
  if(dlen<0 || dlen>=ARMCI_Gpc_get_dlen()) 
    err += fprintf(stderr, "ARMCI_Gpc_exec: Invalid send data size %d %d\n", dlen, ARMCI_Gpc_get_dlen());
  if(rdlen<0 || rdlen>=ARMCI_Gpc_get_dlen()) 
    err += fprintf(stderr, "ARMCI_Gpc_exec: Invalid recv data size %d %d\n", rdlen, ARMCI_Gpc_get_dlen());

  if(hlen>0 && hdr==NULL) 
    err += fprintf(stderr, "ARMCI_Gpc_exec: Null send header for non-zero header size %d\n", hlen);
  if(rhlen>0 && rhdr==NULL) 
    err += fprintf(stderr, "ARMCI_Gpc_exec: Null recv header for non-zero header size %d\n", rhlen);
  if(dlen>0 && data==NULL) 
    err += fprintf(stderr, "ARMCI_Gpc_exec: Null send data for non-zero data size %d\n", dlen);
  if(rdlen>0 && rdata==NULL) 
    err += fprintf(stderr, "ARMCI_Gpc_exec: Null recv data for non-zero header size %d\n", rdlen);

  if(p<0 || p >= armci_nproc)
    err += fprintf(stderr, "ARMCI_Gpc_exec: Invalid target processor id %d\n", p, armci_nproc);

  if(err)
    return FAIL;

  if(rhlen + rdlen == 0)
    armci_die("Zero reply header + data length not yet supported", 0);

  if(nbh)
    nbh->proc = p;
#if 1
  if(SAMECLUSNODE(p) && armci_nproc==1) {
    int rhsize, rdsize;
    int (*func)();

/*      fprintf(stderr, "%d:: armci gpc exec. SAMECLUSNODE\n", armci_me); */

    func = _table[hnd];
    if(func(p, armci_me, hdr, hlen, data, dlen, rhdr, rhlen, &rhsize,
	    rdata, rdlen, &rdsize, GPC_INIT) != GPC_DONE) {
      func(p, armci_me, hdr, hlen, data, dlen, rhdr, rhlen, &rhsize,
	   rdata, rdlen, &rdsize, GPC_WAIT);
    } 
#ifndef VAPI
    PARMCI_Fence(p);
#endif
    return 0;
  }
#endif

/*    fprintf(stderr, "%d:: armci gpc exec. invoking armci gpc\n", armci_me); */
  return armci_gpc(h, p, hdr, hlen,  data,  dlen,
		 rhdr, rhlen, rdata, rdlen, ahdl); 
}
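For orientation, here is a hedged sketch of the callback side. The 13-argument handler prototype is inferred from the func(...) invocations inside ARMCI_Gpc_exec above; GPC_DONE and ARMCI_Gpc_register appear in gpc.c, but the echo handler itself is hypothetical.

#include <string.h>  /* memcpy */

/* Hypothetical echo handler: bounds the reply sizes by the receive
 * buffers and copies the request back to the caller. */
int echo_handler(int to, int from, void *hdr, int hlen,
                 void *data, int dlen,
                 void *rhdr, int rhlen, int *rhsize,
                 void *rdata, int rdlen, int *rdsize, int op)
{
    *rhsize = (hlen <= rhlen) ? hlen : rhlen;
    memcpy(rhdr, hdr, *rhsize);             /* echo header back */
    *rdsize = (dlen <= rdlen) ? dlen : rdlen;
    memcpy(rdata, data, *rdsize);           /* echo data back   */
    return GPC_DONE;
}

/* Elsewhere, during initialization:
 *     int h = ARMCI_Gpc_register(echo_handler);
 * and h is the handle later passed as the first argument of
 * ARMCI_Gpc_exec(). */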
Example #8
0
void ARMCI_Fence(int proc) {
  PARMCI_Fence(proc);
  return;
}
Example #9
int main(int argc, char *argv[])
{
    int rank, size;
    int provided;

#if defined(__bgp__)
    MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
    assert(provided==MPI_THREAD_MULTIPLE);
#else
    MPI_Init_thread(&argc, &argv, MPI_THREAD_FUNNELED, &provided);
    //assert(provided>MPI_THREAD_SINGLE);
#endif
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    assert( size > 1 );

    PARMCI_Init_args(&argc, &argv);

    int w, maxwinsize = ( argc > 1 ? atoi(argv[1]) : 1000000 );

    if ( rank == 0 ) printf( "size = %d maxwinsize = %d doubles\n", size, maxwinsize );

    for ( w = 1 ; w < maxwinsize ; w *= 2 )
    {
        double ** window;
        window  = (double **) PARMCI_Malloc_local( size * sizeof(double *) );
        PARMCIX_Malloc_comm(MPI_COMM_WORLD, (void **) window, w * sizeof(double) );
        for (int i = 0; i < w; i++) window[rank][i] = 0.0;

        double * buffer;
        buffer = (double *) PARMCI_Malloc_local(  w * sizeof(double) );

        PARMCIX_Barrier_comm(MPI_COMM_WORLD);

        if (rank == 0)
            for (int t=1; t<size; t+=2)
            {
                int bytes = w * sizeof(double);

                for (int i = 0; i < w; i++) buffer[i] = (double)(t);

                PARMCI_Put( buffer, window[t], bytes, t );
                PARMCI_Fence( t );

                for (int i = 0; i < w; i++) buffer[i] = 0.0;

                PARMCI_Get( window[t], buffer, bytes, t );

                int errors = 0;

                for (int i = 0; i < w; i++) 
                    if ( buffer[i] != (double)(t) ) errors++;

                if ( errors > 0 )
                    for (int i = 0; i < w; i++) 
                        printf("rank %d buffer[%d] = %lf \n", rank, i, buffer[i] );
            }

        PARMCIX_Barrier_comm(MPI_COMM_WORLD);

        if (rank != 0)
        {
           int errors = 0;

           for (int i = 0; i < w; i++) 
               if ( window[rank][i] != (double)(rank) ) errors++;

           if ( errors > 0 )
               for (int i = 0; i < w; i++) 
                   printf("rank %d window[%d][%d] = %lf \n", rank, rank, i, window[rank][i] );
        }

        PARMCIX_Barrier_comm(MPI_COMM_WORLD);

        if (rank == 0)
            for (int t=1; t<size; t++)
            {
                int bytes = w * sizeof(double);

                double t0, t1, t2, dt1, dt2, bw1, bw2;

                for (int i = 0; i < w; i++) buffer[i] = (double)(-1);

                t0 = MPI_Wtime();
                PARMCI_Put( buffer, window[t], bytes, t );
                t1 = MPI_Wtime();
                PARMCI_Fence( t );
                t2 = MPI_Wtime();

                dt1  = t1 - t0;
                dt2  = t2 - t0;
                bw1  = bytes / dt1;
                bw2  = bytes / dt2;
                bw1 /= 1000000.0;
                bw2 /= 1000000.0;
                printf("PARMCI_Put of from rank %4d to rank %4d of %9d bytes - local: %lf s (%lf MB/s) remote: %lf s (%lf MB/s) \n",
                       t, 0, bytes, dt1, bw1, dt2, bw2);
                fflush(stdout);
            }

        PARMCIX_Barrier_comm(MPI_COMM_WORLD);

        PARMCI_Free_local( (void *) buffer );

        PARMCIX_Free_comm(MPI_COMM_WORLD, (void *) window[rank] );
        PARMCI_Free_local( (void *) window );
    }

    PARMCI_Finalize();

    printf("%d: all done \n", rank );
    fflush(stdout);

    MPI_Finalize();

    return 0;
}
Example #10
int PARMCI_WaitProc(int proc)
{
    fprintf(stderr,"WARNING: PARMCI_WaitProc(int proc) only synchronizes implicit nonblocking operations! \n");
    PARMCI_Fence(proc);
    return(0);
}