Example #1
0
/** One-sided copy of data from the source to the destination.  Set a flag on
  * the remote process when the transfer is complete.
  *
  * The payload put is followed by a fence on the target before the flag is
  * written, so the flag put is only issued once the payload put has been
  * fenced.  If the payload put reports an error the flag is NOT written.
  *
  * @param[in] src   Source buffer
  * @param[in] dst   Destination buffer on proc
  * @param[in] size  Number of bytes to transfer
  * @param[in] flag  Address of the flag buffer on proc
  * @param[in] value Value to set the flag to
  * @param[in] proc  Process id of the target
  * @return          0 on success, non-zero on failure (the status of the
  *                  first put that failed)
  */
int ARMCI_Put_flag(void *src, void* dst, int size, int *flag, int value, int proc) {
  int rc;

  /* Transfer the payload first; never raise the flag for a failed transfer. */
  rc = ARMCI_Put(src, dst, size, proc);
  if (rc != 0)
    return rc;

  /* Fence the target so the payload is ordered ahead of the flag update. */
  ARMCI_Fence(proc);

  /* The flag put's status is the status of the whole operation. */
  return ARMCI_Put(&value, flag, sizeof(int), proc);
}
Example #2
0
/* Mutex-protected read-modify-write test.
 *
 * Process 0 hosts one mutex and one shared int.  Every process performs NITER
 * locked get/add/put cycles on that int, adding ADDIN each time.  Process 0
 * finally compares the stored value with ADDIN*nproc*NITER and prints
 * PASS or FAIL.  The exit status is 0 either way. */
int main(int argc, char ** argv) {
  int    me, nranks, counter, iter;
  void **bases;

  MPI_Init(&argc, &argv);
  ARMCI_Init();

  MPI_Comm_rank(MPI_COMM_WORLD, &me);
  MPI_Comm_size(MPI_COMM_WORLD, &nranks);

  if (me == 0) {
    printf("Starting ARMCI mutex read-modify-write test with %d processes\n", nranks);
  }

  bases = malloc(nranks*sizeof(void*));

  /* Only process 0 contributes a mutex and storage for the shared int. */
  if (me == 0) {
    ARMCI_Create_mutexes(1);
    ARMCI_Malloc(bases, sizeof(int));
  } else {
    ARMCI_Create_mutexes(0);
    ARMCI_Malloc(bases, 0);
  }

  /* Process 0 zeroes the shared counter before anyone updates it. */
  if (me == 0) {
    counter = 0;
    ARMCI_Put(&counter, bases[0], sizeof(int), 0);
  }

  ARMCI_Barrier();

  /* Each pass: take mutex 0 on process 0, fetch, add ADDIN, store, release. */
  iter = 0;
  while (iter < NITER) {
    ARMCI_Lock(0, 0);

    ARMCI_Get(bases[0], &counter, sizeof(int), 0);
    counter += ADDIN;
    ARMCI_Put(&counter, bases[0], sizeof(int), 0);

    ARMCI_Unlock(0, 0);
    iter++;
  }

  printf(" + %3d done\n", me);
  fflush(NULL);

  ARMCI_Barrier();

  /* Process 0 reads the final total back and reports the verdict. */
  if (me == 0) {
    ARMCI_Get(bases[0], &counter, sizeof(int), 0);

    if (counter != ADDIN*nranks*NITER)
      printf("Test complete: FAIL.  Got %d, expected %d.\n", counter, ADDIN*nranks*NITER);
    else
      printf("Test complete: PASS.\n");
  }

  ARMCI_Free(bases[me]);
  ARMCI_Destroy_mutexes();
  free(bases);

  ARMCI_Finalize();
  MPI_Finalize();

  return 0;
}
Example #3
0
/* Contiguous-put test inside a single ARMCI group.
 *
 * Group rank 0 puts ELEMS doubles into the buffer owned by the last group
 * rank; that rank then checks every element against the value the sender
 * generated and calls ARMCI_Error on a mismatch.
 *
 * group    - group to run the test in
 * pid_list - not referenced by this function
 */
void test_one_group(ARMCI_Group *group, int *pid_list) {
  int my_grp_rank, n_grp;
  int k, sender, receiver;
  double *remote[MAXPROC];
  double local[ELEMS];
  int nbytes, world_me;

  MP_MYID(&world_me);
  ARMCI_Group_rank(group, &my_grp_rank);
  ARMCI_Group_size(group, &n_grp);
  if (my_grp_rank == 0) {
    printf("GROUP SIZE = %d\n", n_grp);
  }
  printf("%d:group rank = %d\n", me, my_grp_rank);

  sender   = 0;
  receiver = n_grp-1;

  nbytes = ELEMS*sizeof(double);
  ARMCI_Malloc_group((void **)remote, nbytes, group);

  /* Seed the outgoing data and poison the local landing buffer. */
  for (k = 0; k < ELEMS; k++) {
    local[k] = k*1.001*(my_grp_rank+1);
    remote[my_grp_rank][k] = -1.0;
  }

  armci_msg_group_barrier(group);

  /* ARMCI calls take absolute process ids, so translate the group rank. */
  if (my_grp_rank == sender) {
    ARMCI_Put(local, &remote[receiver][0], nbytes,
              ARMCI_Absolute_id(group, receiver));
  }

  armci_msg_group_barrier(group);
  /* Again an absolute id is required here. */
  ARMCI_Fence(ARMCI_Absolute_id(group, receiver));
  sleep(1);

  /* The receiver verifies what landed in its buffer. */
  if (my_grp_rank == receiver) {
    for (k = 0; k < ELEMS; k++) {
      if (ARMCI_ABS(remote[my_grp_rank][k]-k*1.001*(sender+1)) > 0.1) {
        printf("\t%d: ddst_put[%d][%d] = %lf and expected value is %lf\n",
               me, my_grp_rank, k, remote[my_grp_rank][k], k*1.001*(sender+1));
        ARMCI_Error("groups: armci put failed...1", 0);
      }
    }
    printf("\n%d(%d): Test O.K. Verified\n", receiver, world_me);
  }

  armci_msg_group_barrier(group);
  ARMCI_Free_group(remote[my_grp_rank], group);
}
Example #4
0
/* Put benchmark: image 0 times `iter` back-to-back ARMCI_Put calls to image 1
 * for message sizes 1, 2, 4, ... up to MAX_BUF_SIZE bytes, fencing once per
 * size, and prints one line per size.
 *
 * Image 0 puts to baseAddress[1], so at least two images are needed.  Images
 * other than 0 take no part in the timing but still join every collective
 * call. */
int main(int argc, char * argv[]) {
  void *baseAddress[MAX_PROCESSORS];
  char *local;
  int thisImage;

  int iter = 100, size;
  double startTime, endTime;
  int i;

  // initialize
  ARMCI_Init();
  ARMCI_Myid(&thisImage);

  // allocate data (collective operation)
  ARMCI_Malloc(baseAddress, MAX_BUF_SIZE*sizeof(char));
  local = (char *)ARMCI_Malloc_local(MAX_BUF_SIZE*sizeof(char));

  ARMCI_Barrier();
  ARMCI_Migrate();

  if (thisImage == 0) {
    for(size = 1; size <= MAX_BUF_SIZE; size = size<<1){
      startTime = CkWallTimer();
      for(i = 0; i < iter; i++){
        ARMCI_Put(local, baseAddress[1], size, 1);
      }
      ARMCI_Fence(1);
      endTime = CkWallTimer();
      // NOTE(review): the label says "us" but the value is the elapsed time
      // for all `iter` puts multiplied by 1000; if CkWallTimer() returns
      // seconds this is total milliseconds, not microseconds per put.
      // Confirm the intended unit before changing the output format.
      printf("%d: %f us\n", size, (endTime-startTime)*1000);
    }
  }

  // Every image must reach this barrier.  Calling it only from images 0 and
  // 1 leaves the barrier mismatched when more than two images are running.
  ARMCI_Barrier();

  ARMCI_Free(baseAddress[thisImage]);
  ARMCI_Free_local(local);
  // finalize
  ARMCI_Finalize();
  return 0;
}
Example #5
0
/*
 * Reverse a block-distributed 1-D integer array using one-sided puts.
 *
 * Array A holds the values 1..N spread unevenly over all processors.  Each
 * processor reverses its local chunk into a scratch buffer and puts the
 * pieces into the processors that own the mirrored index range of array B.
 * The result is handed to VERIFY().  All allocations made here are released
 * before returning.  Allocation and put failures are caught with assert(),
 * matching the checks already used for ARMCI_Malloc.
 */
void TRANSPOSE1D() {

    int dims[1];
    int nelem, i, ierr, min, max, cmin, cmax, lmin, lmax, pmin, pmax;
    int src_offset, dst_offset, length;
    int *buf, *map;
    void *src_ptr, *dst_ptr;
    void **a_ptr, **b_ptr;
    int *a, *b;

    /* Find local processor ID and number of processors */
    int me, nprocs;
    me     = armci_msg_me();
    nprocs = armci_msg_nproc();

    /* Allocate pointers to data on all processors */
    a_ptr = (void**)malloc(nprocs*sizeof(*a_ptr));
    b_ptr = (void**)malloc(nprocs*sizeof(*b_ptr));
    map = (int*)malloc(nprocs*sizeof(*map));
    assert(a_ptr != NULL);
    assert(b_ptr != NULL);
    assert(map != NULL);

    /* Configure array dimensions. Force an unequal data distribution */
    dims[0]  = nprocs*TOTALELEMS + nprocs/2;
    if (me == 0) printf("Size of array: %d\n\n",dims[0]);
    /* Find first (zero-based) index of chunk owned by each processor and
       store it in map array */
    for (i=0; i<nprocs; i++) {
      map[i] = (int)(((double)i)*(((double)dims[0])/((double)nprocs)));
    }

    /* Figure out what size my portion of array is */
    if (me<nprocs-1) {
      nelem = map[me+1]-map[me];
    } else {
      nelem = dims[0]-map[me];
    }

    /* Allocate memory for array A */
    ierr = ARMCI_Malloc(a_ptr, nelem*sizeof(int));
    assert(ierr == 0);
    assert(a_ptr[me]);

    /* Allocate memory for array B */
    ierr = ARMCI_Malloc(b_ptr, nelem*sizeof(int));
    assert(ierr == 0);
    assert(b_ptr[me]);

    /* initialize data in array A and zero data in array B */
    a = (int*)a_ptr[me];
    b = (int*)b_ptr[me];
    for (i=0; i<nelem; i++) {
      a[i] = i + map[me] + 1;
      b[i] = 0;
    }

    /* Synchronize all processors to guarantee that everyone has data
       before proceeding to the next step. */
    armci_msg_barrier();

    /* Create local buffer for performing inversion */
    buf = (int*)malloc(nelem*sizeof(*buf));
    assert(buf != NULL || nelem == 0);

    /* Copy inverted data into local buffer */
    for (i=0; i<nelem; i++) {
      buf[i] = a[nelem-i-1];
    }

    /* Find out which blocks of array B inverted block should be copied to.
       Start by finding min and max indices of data in array B*/
    min = dims[0] - (map[me] + nelem);
    max = dims[0] - map[me] - 1;

    /* Locate processors containing the endpoints */
    pmin = 0;
    for (i=0; i<nprocs; i++) {
      if (min >= map[i]) {
        pmin = i;
      } else {
        break;
      }
    }
    pmax = nprocs-1;
    for (i=nprocs-2; i>=0; i--) {
      if (max < map[i+1]) {
        pmax = i;
      } else {
        break;
      }
    }

    /* Loop over processors that will receive data and copy inverted data to
       processors */
    for (i=pmin; i<=pmax; i++) {
      /* Find min and max indices owned by processor i */
      lmin = map[i];
      if (i<nprocs-1) {
        lmax = map[i+1]-1;
      } else {
        lmax = dims[0]-1;
      }

      /* Clip my target range [min,max] to the range owned by processor i */
      cmin = (lmin > min) ? lmin : min;
      cmax = (lmax < max) ? lmax : max;

      /* Find offsets on source and destination processors */
      src_offset = cmin - min;
      src_ptr = (void*)(buf + src_offset);
      dst_offset = cmin - lmin;
      dst_ptr = ((char*)b_ptr[i]) + sizeof(int)*dst_offset;

      /* Find length of data (in bytes) to be sent to processor i */
      length = sizeof(int)*(cmax-cmin+1);

      /* Send data to processor; a failed put would make VERIFY meaningless */
      ierr = ARMCI_Put(src_ptr, dst_ptr, length, i);
      assert(ierr == 0);
    }
    ARMCI_AllFence();
    armci_msg_barrier();

    free(buf);

    VERIFY(b_ptr, dims, map);

    free(map);
    armci_msg_barrier();
    ARMCI_Free(a_ptr[me]);
    ARMCI_Free(b_ptr[me]);
    free(a_ptr);
    free(b_ptr);
}
Example #6
0
/*
 * Time `loop` put operations of a chunk x chunk block of doubles to `proc`.
 *
 * src_buf - local source buffer (rows are SIZE doubles apart)
 * dst_buf - remote destination buffer on proc (same row stride)
 * chunk   - block edge length in doubles
 * loop    - number of timed iterations
 * proc    - target process
 * levels  - 0 for a contiguous put of one row, non-zero for a strided put
 *
 * When FORCE_1D is defined and levels > 0, the block is sent as one
 * contiguous put per row instead of a single strided put.  When CHECK_RESULT
 * is non-zero each iteration reads the data back and compares it with the
 * source.  Between iterations the buffers alternate by +/-128 doubles.
 *
 * Returns the elapsed TIMER() time divided by `loop`.  A measured time of
 * exactly zero is replaced by 0.000001 and warn_accuracy is incremented.
 */
double time_put(double *src_buf, double *dst_buf, int chunk, int loop,
                int proc, int levels)
{
    int i, bal = 0;

    int stride[2];
    int count[2];
    int stride_levels = levels;
    double *tmp_buf = NULL;

    double start_time, stop_time, total_time = 0;

    stride[0] = SIZE * sizeof(double);
    count[0] = chunk * sizeof(double); count[1] = chunk;

    if(CHECK_RESULT) {
        tmp_buf = (double *)malloc(SIZE * SIZE * sizeof(double));
        assert(tmp_buf != NULL);
    }

    start_time = TIMER();
    for(i=0; i<loop; i++) {

#ifdef FORCE_1D
        int j;
        if(levels>0)for(j=0; j< count[1]; j++){
           char *s = (char*) src_buf, *d= (char*)dst_buf;
           s += j*stride[0]; d += j*stride[0];
           /* Send row j through the advanced pointers; passing src_buf and
              dst_buf here would resend row 0 on every pass. */
           ARMCI_Put(s, d, count[0],proc);
        }
        else
#endif
        if(levels)
           ARMCI_PutS(src_buf, stride, dst_buf, stride, count, stride_levels,proc);
        else
           ARMCI_Put(src_buf, dst_buf,count[0], proc);

        if(CHECK_RESULT) {
            ARMCI_GetS(dst_buf, stride, tmp_buf, stride, count,
                       stride_levels, proc);

            sprintf(check_type, "ARMCI_PutS:");
            check_result(tmp_buf, src_buf, stride, count, stride_levels);
        }

        /* prepare next src and dst ptrs: avoid cache locality */
        if(bal == 0) {
            src_buf += 128;
            dst_buf += 128;
            bal = 1;
        } else {
            src_buf -= 128;
            dst_buf -= 128;
            bal = 0;
        }
    }
    stop_time = TIMER();
    total_time = (stop_time - start_time);

    if(CHECK_RESULT) free(tmp_buf);

    if(total_time == 0.0){
       total_time=0.000001; /* workaround for inaccurate timers */
       warn_accuracy++;
    }
    return(total_time/loop);
}
/*
 * Read a sparse matrix from "Sparse-MPI/av41092.rua.data" on process 0 and
 * distribute it to all processes.
 *
 * n        - out: matrix order, read on process 0 and broadcast
 * non_zero - out: number of non-zeros, read on process 0 and broadcast
 * row_ind  - out: set to a malloc'd array of *n+1 row indices (broadcast to
 *            every process); it is not freed here
 * col_ind  - table filled by create_array(); process 0 puts each process's
 *            column indices into col_ind[j]
 * values   - table filled by create_array(); process 0 puts each process's
 *            non-zero values into values[j]
 * vec      - table filled by create_array(); the local part is initialised
 * svec     - table filled by create_array(); not initialised here
 *
 * Always returns 0.  Errors are reported through ARMCI_Error().
 *
 * NOTE(review): none of the fscanf() or malloc() results are checked; a
 * short or malformed input file would go unnoticed.
 */
static int sparse_initialize(int *n, int *non_zero, int **row_ind,
                             int **col_ind, double **values, double **vec,
                             double **svec) {

    int i, j, rc, max, *row_ind_tmp=NULL, *tmp_indices=NULL;
    double *tmp_values=NULL;
    unsigned long len;
    FILE *fp=NULL;

    /* Broadcast order of matrix */
    if(me==0) {
        if((fp=fopen("Sparse-MPI/av41092.rua.data", "r")) == NULL)
            ARMCI_Error("Error: Input file not found", me);
        fortran_indexing = 1; /* This is 1 for Harwell-Boeing format matrices */
        fscanf(fp, "%d", n);
        if(*n%nproc)
            ARMCI_Error("# of rows is not divisible by # of processors", nproc);
        if(*n > ROW)
            ARMCI_Error("order is greater than defined variable ROW", ROW);
    }
    len = sizeof(int);
    armci_msg_brdcst(n, len, 0);

    /* Broadcast number of non-zeros */
    if(me==0) fscanf(fp, "%d", non_zero);
    armci_msg_brdcst(non_zero, len, 0);

    /* Broadcast row indices (converted to zero-based when fortran_indexing
       is set) */
    len = (*n+1)*sizeof(int);
    row_ind_tmp = (int *)malloc(len);
    if(me==0)for(i=0; i<*n+1; i++) {
            fscanf(fp, "%d", &row_ind_tmp[i]);
            if(fortran_indexing) --row_ind_tmp[i];
        }
    armci_msg_brdcst(row_ind_tmp, len, 0);

    load_balance(*n, *non_zero, row_ind_tmp);

    /* find how much temporary storage is needed at the maximum; max is only
       assigned on process 0 here, the other processes get it from the
       broadcast below */
    if(me==0) {
        for(max=-1,j=0; j<nproc; j++) if(max<proc_nz_list[j]) max=proc_nz_list[j];
        if(max<0) ARMCI_Error(" max cannot be negative", max);
    }

    /* Broadcast the maximum number of elements */
    len = sizeof(int);
    armci_msg_brdcst(&max, len, 0);

    /* create the column index array (max ints); NOTE(review): the progress
       message printed here names the value array instead */
    if(me==0) printf("  Creating ValueArray (CompressedSparseMatrix) ...\n\n");
    create_array((void**)col_ind, sizeof(int), 1, &max);

    /* create the non-zero value array (max doubles); NOTE(review): the
       progress message printed here names the column subscript array */
    if(me==0) printf("  Creating Column Subscript Array ... \n\n");
    create_array((void**)values, sizeof(double), 1, &max);

    /* create the x-vector and the solution vector */
    if(me==0) printf("  Creating Vectors ... \n\n");
    create_array((void**)vec,  sizeof(double),1, &max);
    create_array((void**)svec, sizeof(double),1, &max);
    armci_msg_barrier();


    /* Process 0 distributes the column indices and non_zero values to
       respective processors.  The file is read sequentially: first all
       column indices, one process-sized batch at a time, then all values. */
    if(me == 0) {
        tmp_indices = (int *)malloc(max*sizeof(int));
        tmp_values  = (double *)malloc(max*sizeof(double));

        for(j=0; j<nproc; j++) {
            for(i=0; i<proc_nz_list[j]; i++) {
                fscanf(fp, "%d", &tmp_indices[i]);
                if(fortran_indexing) --tmp_indices[i];
            }
            /* rc = fread(tmp_indices, sizeof(int), proc_nz_list[j], fp); */
            /* The call is ARMCI_Put even though the error text says nbput */
            if((rc=ARMCI_Put(tmp_indices, col_ind[j], proc_nz_list[j]*sizeof(int), j)))
                ARMCI_Error("armci_nbput failed\n",rc);
        }
        for(j=0; j<nproc; j++) {
            for(i=0; i<proc_nz_list[j]; i++) fscanf(fp, "%lf", &tmp_values[i]);
            if((rc=ARMCI_Put(tmp_values, values[j], proc_nz_list[j]*sizeof(double), j)))
                ARMCI_Error("armci_nbput failed\n",rc);
        }
    }
    ARMCI_AllFence();
    armci_msg_barrier();
    ARMCI_AllFence();

    /* initializing x-vector: process 0 stores 1..k; every other process
       offsets by me times the previous process's non-zero count */
    if(me==0) for(i=0; i<proc_nz_list[me]; i++) vec[me][i] = (i+1);
    else for(i=0; i<proc_nz_list[me]; i++) vec[me][i]=me*proc_nz_list[me-1]+(i+1);

#if 0
    if(me==0) {
        printf("max = %d\n", max);
        for(i=0; i<max; i++)  printf("%.1f ", values[me][i]);
        printf("\n");
    }
#endif

    /* Hand the broadcast row index array to the caller */
    *row_ind = row_ind_tmp;
    if(me==0) {
        free(tmp_indices);
        free(tmp_values);
        fclose(fp);
    }
    return 0;
}
Example #8
0
/** Put with the non-blocking call signature.
  *
  * This implementation is not actually non-blocking: it forwards to
  * ARMCI_Put and returns its status.  The handle argument is not used.
  */
int ARMCI_NbPut(void *src, void *dst, int bytes, int proc, armci_hdl_t *handle) {
  const int status = ARMCI_Put(src, dst, bytes, proc);

  return status;
}
Example #9
0
/*
 * Time contiguous ARMCI operations from process 0 to process 1.
 *
 * buffer_size - size in bytes of each collectively allocated buffer; also
 *               the largest message size tested
 * op          - PUT, GET or ACC; any other value calls ARMCI_Error
 *
 * Message sizes double from 16 bytes up to buffer_size.  For each size
 * process 0 runs WARMUP untimed iterations followed by ITER_SMALL or
 * ITER_LARGE timed ones and prints the size, the dclock() time per
 * iteration and size*iterations divided by the elapsed time.  All processes
 * take part in the allocations, barriers and frees.
 */
static void contig_test(size_t buffer_size, int op)
{
    void **dst_ptr;
    void **put_buf;
    void **get_buf;

    dst_ptr = malloc(nproc * sizeof *dst_ptr);
    put_buf = malloc(nproc * sizeof *put_buf);
    get_buf = malloc(nproc * sizeof *get_buf);
    /* ARMCI_Malloc writes one pointer per process into these tables. */
    if (NULL == dst_ptr || NULL == put_buf || NULL == get_buf) {
        ARMCI_Error("contig_test: malloc failed", 1);
    }
    ARMCI_Malloc(dst_ptr, buffer_size);
    ARMCI_Malloc(put_buf, buffer_size);
    ARMCI_Malloc(get_buf, buffer_size);

    /* initialize what we're putting */
    fill_array((double*)put_buf[me], buffer_size/sizeof(double), me);

    size_t msg_size;

    int dst = 1;
    double scale = 1.0;
    for (msg_size = 16; msg_size <= buffer_size; msg_size *= 2) {

        int j;
        int iter = msg_size > MEDIUM_MESSAGE_SIZE ? ITER_LARGE : ITER_SMALL;

        /* t_start is only set and read on process 0; give it a defined
           value on the other processes as well. */
        double t_start = 0.0, t_end;
        if (0 == me) {
            for (j= 0; j < iter + WARMUP; ++j) {

                /* start the clock once the warm-up iterations are done */
                if (WARMUP == j) {
                    t_start = dclock();
                }

                switch (op) {
                    case PUT:
                        ARMCI_Put(put_buf[me], dst_ptr[dst], msg_size,
                                dst);
                        break;
                    case GET:
                        ARMCI_Get(dst_ptr[dst], get_buf[me], msg_size,
                                dst);
                        break;
                    case ACC:
                        ARMCI_Acc(ARMCI_ACC_DBL, &scale,
                                put_buf[me], dst_ptr[dst], msg_size,
                                dst);
                        break;
                    default:
                        ARMCI_Error("oops", 1);
                }

            }
        }
        /* calculate total time and average time */
        t_end = dclock();
        ARMCI_Barrier();


        if (0 == me) {
            printf("%8zu\t\t%6.2f\t\t%10.2f\n",
                    msg_size,
                    ((t_end  - t_start))/iter,
                    msg_size*iter/((t_end - t_start)));
        }
    }
    ARMCI_Free(dst_ptr[me]);
    ARMCI_Free(put_buf[me]);
    ARMCI_Free(get_buf[me]);
    free(dst_ptr);
    free(put_buf);
    free(get_buf);
}
Example #10
0
/*
 * Compare blocking and non-blocking ARMCI put, get and accumulate timings
 * for 1-D sections of 1, 2, 4, ... MAXELEMS doubles.
 *
 * dry_run - when non-zero, all communication is still performed but the
 *           table header, the timing rows and the final "O.K." are not
 *           printed
 *
 * Process 0 drives the put/get phases against every other process; all
 * processes take part in the accumulate phases, which target process 0.
 * Timers: t1 put, t2 nbput issue, t3 nbput wait, t4 get, t5 nbget issue,
 * t6 nbget wait, t7 acc, t8 nbacc issue, t9 nbacc wait.
 * Uses ddst, me, nproc, VERIFY and verify_results from outside this block.
 */
void test_perf_nb(int dry_run) {
  
    int i, j, loop, rc, bytes, elems[2] = {MAXPROC, MAXELEMS};
    int stride, k=0, ntimes;
    double stime, t1, t2, t3, t4, t5, t6, t7, t8, t9;
    double *dsrc[MAXPROC], scale=1.0;
    armci_hdl_t hdl_get, hdl_put, hdl_acc;
        
    create_array((void**)ddst, sizeof(double),2, elems);
    create_array((void**)dsrc, sizeof(double),1, &elems[1]);

    if(!dry_run)if(me == 0) {
      printf("\n\t\t\tRemote 1-D Array Section\n");
      printf("section    get      nbget    wait     put     nbput  ");
      printf("   wait     acc     nbacc     wait\n");
      printf("-------  -------- -------- -------- -------- --------");
      printf(" -------- -------- -------- --------\n");
      fflush(stdout);
    }

    /* One pass per section length; k counts passes but is not read */
    for(loop=1; loop<=MAXELEMS; loop*=2, k++) {

      elems[1] = loop;
      /* Fewer repetitions for longer sections: sqrt(MAXELEMS/length) */
      ntimes = (int)sqrt((double)(MAXELEMS/elems[1]));
      if(ntimes <1) ntimes=1;

      /* -------------------------- SETUP --------------------------- */
      /*initializing non-blocking handles,time,src & dst buffers*/
      ARMCI_INIT_HANDLE(&hdl_put);
      ARMCI_INIT_HANDLE(&hdl_get);
      ARMCI_INIT_HANDLE(&hdl_acc);
      t1 = t2 = t3 = t4 = t5 = t6 = t7 = t8 = t9 = 0.0;
      for(i=0; i<elems[1]; i++) dsrc[me][i]=i*1.001*(me+1);
      for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0;    
      MP_BARRIER();
      
      /* bytes transferred */
      bytes = sizeof(double)*elems[1]; 
      MP_BARRIER();
      
      /* -------------------------- PUT/GET -------------------------- */    
      /* Blocking put from process 0 to every other process (timer t1).
         The error text says nbput although the call is ARMCI_Put. */
      if(me == 0) {
	for(i=1; i<nproc; i++) {
	  stime=MP_TIMER();
	  for(j=0; j<ntimes; j++)
	    if((rc=ARMCI_Put(&dsrc[me][0], &ddst[i][me*elems[1]], bytes,i)))
	      ARMCI_Error("armci_nbput failed\n",rc);
	  t1 += MP_TIMER()-stime;
	}
      }
      MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER();
      if(VERIFY) verify_results(PUT, elems);
      for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0;
      MP_BARRIER();
      
      /* Blocking get by process 0 from every other process (timer t4) */
      if(me == 0) { 
	for(i=1; i<nproc; i++) {
	  stime=MP_TIMER();    
	  for(j=0; j<ntimes; j++)
	    if((rc=ARMCI_Get(&dsrc[i][0], &ddst[me][i*elems[1]], bytes,i)))
	      ARMCI_Error("armci_nbget failed\n",rc);
	  t4 += MP_TIMER()-stime;	
	}
      }    
      MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER();
      if(VERIFY) verify_results(GET, elems);
      for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0;
      MP_BARRIER();

      /* ------------------------ nb PUT/GET ------------------------- */    
      /* Non-blocking put: t2 times the issue, t3 times the wait */
      if(me == 0) {
	for(i=1; i<nproc; i++) {
	  for(j=0; j<ntimes; j++) {
	    stime=MP_TIMER();    
	    if((rc=ARMCI_NbPut(&dsrc[me][0], &ddst[i][me*elems[1]], bytes,
			       i, &hdl_put)))
	      ARMCI_Error("armci_nbput failed\n",rc);
	    t2 += MP_TIMER()-stime;	stime=MP_TIMER();
	    ARMCI_Wait(&hdl_put);
	    t3 += MP_TIMER()-stime;
	  } 
	}
      }
      MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER();
      if(VERIFY) verify_results(PUT, elems);
      for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0;
      MP_BARRIER();

      /* Non-blocking get: t5 times the issue, t6 times the wait */
      if(me == 0) {
	for(i=1; i<nproc; i++) {
	  for(j=0; j<ntimes; j++) {
	    stime=MP_TIMER();    
	    if((rc=ARMCI_NbGet(&dsrc[i][0], &ddst[me][i*elems[1]], bytes,
			       i, &hdl_get)))
	      ARMCI_Error("armci_nbget failed\n",rc);
	    t5 += MP_TIMER()-stime;	stime=MP_TIMER();
	    ARMCI_Wait(&hdl_get);
	    t6 += MP_TIMER()-stime;
	  }
	}
      }
      MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER();
      if(VERIFY) verify_results(GET, elems);
      for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0; 
      MP_BARRIER();


      /* ------------------------ Accumulate ------------------------- */    
      /* Every process accumulates its section into ddst[0] on process 0
         using a strided call with zero stride levels (timer t7) */
      for(i=0; i<elems[1]; i++) dsrc[me][i]=1.0;  MP_BARRIER();
      stride = elems[1]*sizeof(double); scale  = 1.0;
      for(j=0; j<ntimes; j++) {
	stime=MP_TIMER();
	if((rc=ARMCI_AccS(ARMCI_ACC_DBL, &scale, &dsrc[me][0], &stride, 
			  &ddst[0][0], &stride, &bytes, 0, 0)))
	  ARMCI_Error("armci_acc failed\n",rc);
	t7 += MP_TIMER()-stime;
	
	MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER();
	if(VERIFY) verify_results(ACC, elems);
	for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0;
	MP_BARRIER();
      }

#if 1
      /* NOTE: an earlier comment described this part as disabled, but the
         #if 1 guard above compiles it in. */
      /* ---------------------- nb-Accumulate ------------------------ */    
      /* Same as above with the non-blocking call: t8 issue, t9 wait */
      for(i=0; i<elems[1]; i++) dsrc[me][i]=1.0;  MP_BARRIER();
      stride = elems[1]*sizeof(double); scale  = 1.0;
      for(j=0; j<ntimes; j++) {
	stime=MP_TIMER();    
	if((rc=ARMCI_NbAccS(ARMCI_ACC_DBL, &scale, &dsrc[me][0], &stride, 
			    &ddst[0][0], &stride, &bytes, 0, 0, &hdl_acc)))
	  ARMCI_Error("armci_nbacc failed\n",rc);
	t8 += MP_TIMER()-stime; stime=MP_TIMER();
	ARMCI_Wait(&hdl_acc);
	t9 += MP_TIMER()-stime;
      
	MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER();
	if(VERIFY) verify_results(ACC, elems);
	for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0;
	MP_BARRIER();
      }
#endif

      /* print timings in header order: get, nbget, wait, put, nbput, wait,
         acc, nbacc, wait; each accumulated timer is divided by ntimes */
     if(!dry_run) if(me==0) printf("%d\t %.2e %.2e %.2e %.2e %.2e %.2e %.2e %.2e %.2e\n", 
		       bytes, t4/ntimes, t5/ntimes, t6/ntimes, t1/ntimes, 
		       t2/ntimes, t3/ntimes, t7/ntimes, t8/ntimes, t9/ntimes);
    }

    ARMCI_AllFence();
    MP_BARRIER();
    
    if(!dry_run)if(me==0){printf("O.K.\n"); fflush(stdout);}
    destroy_array((void **)ddst);
    destroy_array((void **)dsrc);
}
/* Ring test of ARMCI get, put and accumulate.
 *
 * For NUM_ITERATIONS rounds each process allocates a shared array of
 * DATA_NELTS ints and then:
 *   1. fills it with rank*test_iter, gets the right neighbour's array and
 *      checks it;
 *   2. puts rank*test_iter into the left neighbour's array and checks what
 *      its own right neighbour wrote;
 *   3. sets its array to rank, accumulates rank scaled by test_iter into the
 *      left neighbour and checks the sum.
 * GET and PUT mismatches abort the job.  ACC mismatches are only reported.
 *
 * NOTE(review): because ACC mismatches do not abort, rank 0 still prints
 * "Test complete: PASS." after one has been reported; confirm whether that
 * is intended.
 */
int main(int argc, char ** argv) {
  int    rank, nproc, i, test_iter;
  int    left, right, expected;
  int   *my_data, *buf;
  void **base_ptrs;

  MPI_Init(&argc, &argv);
  ARMCI_Init();

  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &nproc);

  if (rank == 0) printf("Starting ARMCI test with %d processes\n", nproc);

  buf = malloc(DATA_SZ);
  base_ptrs = malloc(sizeof(void*)*nproc);
  if (buf == NULL || base_ptrs == NULL) {
    printf("%d: unable to allocate local buffers\n", rank);
    MPI_Abort(MPI_COMM_WORLD, 1);
  }

  right = (rank+1) % nproc;
  left  = (rank+nproc-1) % nproc;

  for (test_iter = 0; test_iter < NUM_ITERATIONS; test_iter++) {
    if (rank == 0) printf(" + iteration %d\n", test_iter);

    /*** Allocate the shared array ***/
    ARMCI_Malloc(base_ptrs, DATA_SZ);
    my_data = base_ptrs[rank];

    /*** Get from our right neighbor and verify correct data ***/
    ARMCI_Access_begin(my_data);
    for (i = 0; i < DATA_NELTS; i++) my_data[i] = rank*test_iter;
    ARMCI_Access_end(my_data);

    ARMCI_Barrier(); // Wait for all updates to data to complete

    ARMCI_Get(base_ptrs[right], buf, DATA_SZ, right);

    expected = right*test_iter;
    for (i = 0; i < DATA_NELTS; i++) {
      if (buf[i] != expected) {
        /* Report the value that was compared against, not the rank */
        printf("%d: GET expected %d, got %d\n", rank, expected, buf[i]);
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
    }

    ARMCI_Barrier(); // Wait for all gets to complete

    /*** Put to our left neighbor and verify correct data ***/
    for (i = 0; i < DATA_NELTS; i++) buf[i] = rank*test_iter;
    ARMCI_Put(buf, base_ptrs[left], DATA_SZ, left);

    ARMCI_Barrier(); // Wait for all updates to data to complete

    /* Our array was written by the right neighbor */
    expected = right*test_iter;
    ARMCI_Access_begin(my_data);
    for (i = 0; i < DATA_NELTS; i++) {
      if (my_data[i] != expected) {
        printf("%d: PUT expected %d, got %d\n", rank, expected, my_data[i]);
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
    }
    ARMCI_Access_end(my_data);

    ARMCI_Barrier(); // Wait for all gets to complete

    /*** Accumulate to our left neighbor and verify correct data ***/
    for (i = 0; i < DATA_NELTS; i++) buf[i] = rank;

    ARMCI_Access_begin(my_data);
    for (i = 0; i < DATA_NELTS; i++) my_data[i] = rank;
    ARMCI_Access_end(my_data);
    ARMCI_Barrier();

    int scale = test_iter;
    ARMCI_Acc(ARMCI_ACC_INT, &scale, buf, base_ptrs[left], DATA_SZ, left);

    ARMCI_Barrier(); // Wait for all updates to data to complete

    /* Own value plus the right neighbor's scaled contribution */
    expected = rank + right*test_iter;
    ARMCI_Access_begin(my_data);
    for (i = 0; i < DATA_NELTS; i++) {
      if (my_data[i] != expected) {
        printf("%d: ACC expected %d, got %d\n", rank, expected, my_data[i]);
        //MPI_Abort(MPI_COMM_WORLD, 1);
      }
    }
    ARMCI_Access_end(my_data);

    ARMCI_Free(my_data);
  }

  free(buf);
  free(base_ptrs);

  if (rank == 0) printf("Test complete: PASS.\n");

  ARMCI_Finalize();
  MPI_Finalize();

  return 0;
}