Example #1
0
/*
 * ARMCI mutex read-modify-write test.
 *
 * Process 0 owns one shared int and one mutex.  Every process performs
 * NITER lock / get / add ADDIN / put / unlock cycles on that int, after
 * which process 0 reads it back and compares it with ADDIN*nproc*NITER.
 * The process exit status is 0 whether the comparison passes or fails.
 */
int main(int argc, char ** argv) {
  int    my_rank, world_size, counter, iter;
  int    is_root;
  void **segments;

  MPI_Init(&argc, &argv);
  ARMCI_Init();

  MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
  MPI_Comm_size(MPI_COMM_WORLD, &world_size);
  is_root = (my_rank == 0);

  if (is_root)
    printf("Starting ARMCI mutex read-modify-write test with %d processes\n", world_size);

  segments = malloc(world_size*sizeof(void*));

  /* Only process 0 contributes a mutex and a non-empty segment. */
  ARMCI_Create_mutexes(is_root ? 1 : 0);
  ARMCI_Malloc(segments, is_root ? sizeof(int) : 0);

  /* Process 0 zeroes the shared counter before anyone touches it. */
  if (is_root) {
    counter = 0;
    ARMCI_Put(&counter, segments[0], sizeof(int), 0);
  }

  ARMCI_Barrier();

  /* Critical section: fetch, bump locally, write back. */
  for (iter = 0; iter < NITER; iter++) {
    ARMCI_Lock(0, 0);

    ARMCI_Get(segments[0], &counter, sizeof(int), 0);
    counter += ADDIN;
    ARMCI_Put(&counter, segments[0], sizeof(int), 0);

    ARMCI_Unlock(0, 0);
  }

  printf(" + %3d done\n", my_rank);
  fflush(NULL);

  ARMCI_Barrier();

  /* Process 0 validates the accumulated total. */
  if (is_root) {
    ARMCI_Get(segments[0], &counter, sizeof(int), 0);

    if (counter != ADDIN*world_size*NITER)
      printf("Test complete: FAIL.  Got %d, expected %d.\n", counter, ADDIN*world_size*NITER);
    else
      printf("Test complete: PASS.\n");
  }

  ARMCI_Free(segments[my_rank]);
  ARMCI_Destroy_mutexes();
  free(segments);

  ARMCI_Finalize();
  MPI_Finalize();

  return 0;
}
Example #2
0
/*
 * Blocking vs. non-blocking ARMCI_Get latency benchmark.
 *
 * For each of 10 rounds every process collectively allocates a fresh ARMCI
 * segment, fills the first LOOP doubles of its own part, and then reads
 * single doubles from process (me+1)%nprocs three times over: once untimed,
 * once timed around blocking ARMCI_Get, and once with ARMCI_NbGet and
 * ARMCI_Wait timed separately.  Every process prints its own per-call
 * averages, scaled by 1e6.
 *
 * NOTE(review): ARMCI_Init and ARMCI_Finalize are each invoked twice;
 * presumably deliberate (exercising repeated init/finalize) -- confirm
 * before removing either call.
 */
int main(int argc, char **argv)
{
    int k, i;
    double **myptrs[10];
    double t0, t1, t2 = 0, tnbget = 0, tnbwait = 0;
#if PORTALS
    ARMCI_NetInit();
#endif
    MPI_Init(&argc,&argv);
    MPI_Comm_rank(MPI_COMM_WORLD,&me);
    MPI_Comm_size(MPI_COMM_WORLD,&nprocs);
    ARMCI_Init();
    ARMCI_Init();
    for(k=0;k<10;k++){
      /* one base pointer per process, filled in by the collective ARMCI_Malloc */
      myptrs[k] = (double **)malloc(sizeof(double *)*nprocs);
      ARMCI_Malloc((void **)myptrs[k],400000*LOOP*sizeof(double));
      for(i=0;i<LOOP;i++)myptrs[k][me][i]=me+0.414;
      MPI_Barrier(MPI_COMM_WORLD);

      /* untimed pass over the same elements that are timed below */
      for(i=0;i<LOOP;i++){
        ARMCI_Get(myptrs[k][(me+1)%nprocs]+i,myptrs[k][me]+i,sizeof(double),(me+1)%nprocs);
        /*if(myptrs[k][me][i]!=0.414+(me+1)%nprocs)ARMCI_Error("errr",myptrs[k][me][i]);*/
      }

      /* blocking get: one timing interval around the whole loop */
      t0 = t1 = t2 = tnbget = tnbwait = 0;
      t0 = MPI_Wtime();
      for(i=0;i<LOOP;i++){
        ARMCI_Get(myptrs[k][(me+1)%nprocs]+i,myptrs[k][me]+i,sizeof(double),(me+1)%nprocs);
      }
      t1 = MPI_Wtime();
      printf("\nGet Latency=%lf\n",1e6*(t1-t0)/LOOP);fflush(stdout);

      /* non-blocking get: issue time and wait time are accumulated separately */
      t1=t0=0;
      for(i=0;i<LOOP;i++){
        armci_hdl_t nbh;
        ARMCI_INIT_HANDLE(&nbh);
        t0 = MPI_Wtime();
        ARMCI_NbGet(myptrs[k][(me+1)%nprocs]+i,myptrs[k][me]+i,sizeof(double),(me+1)%nprocs,&nbh);
        t1 = MPI_Wtime();
        ARMCI_Wait(&nbh);
        t2 = MPI_Wtime();
        tnbget+=(t1-t0);
        tnbwait+=(t2-t1);
      }
      printf("\nNb Get Latency=%lf Nb Wait=%lf\n",1e6*tnbget/LOOP,1e6*tnbwait/LOOP);fflush(stdout);
      MPI_Barrier(MPI_COMM_WORLD);
    }
    for(k=0;k<10;k++){
      ARMCI_Free(myptrs[k][me]);
      /* release the local pointer table as well as the ARMCI segment */
      free(myptrs[k]);
    }
    MPI_Barrier(MPI_COMM_WORLD);
    ARMCI_Finalize();
    ARMCI_Finalize();
    MPI_Finalize();

    return 0;
}
Example #3
0
/*
 * Pairwise get test for the calling thread.
 *
 * Looks up this thread's partner in pairs[], then for each of `iters`
 * iterations fetches one block from the partner's ptrs1 area into this
 * thread's ptrs2 area with ARMCI_Get, and finally hands the received data
 * to check_result().  Barriers bracket the transfer phase.
 *
 * th_idx: thread index, used here for debug output (prndbg).
 */
void test_pairs(int th_idx)
{
    int rem_th, rem_proc;
    int i;
    int rc;
    void *src, *dst;

    rem_th = pairs[TH_ME];
    rem_proc = TH2PROC(rem_th);

    prndbg(th_idx, "test_pair: %d<->%d(%d)\n", TH_ME, rem_th, rem_proc);

    MT_BARRIER();
#if 0
    print_array(th_idx, "before", &AELEM(ptrs2[TH_ME],rem_th,0,0), ASIZExITERS);
#endif
    for (i = 0; i < iters; i++) {
        /* src - addr of my thread block on remote proc/thread */
        src = &AELEM(ptrs1[rem_th],TH_ME,i,0);
        /* dst - addr of remote thread block on my proc/thread */
        dst = &AELEM(ptrs2[TH_ME],rem_th,i,0);
        /* get from my pair; the call stays outside assert() so that it is
         * still executed when assertions are compiled out with NDEBUG */
        rc = ARMCI_Get(src, dst, ASIZE_BYTES, rem_proc);
        assert(rc == 0);
        (void)rc;
    }

    MT_BARRIER();
#if 0
    print_array(th_idx, "rcvd", &AELEM(ptrs2[TH_ME],rem_th,0,0), ASIZExITERS);
#endif
    /* check results */
    check_result(&AELEM(ptrs2[TH_ME],rem_th,0,0), rem_th);

}
Example #4
0
/*
 * Copy block (I, J) into buf with a single ARMCI_Get from the process
 * reported by block_owner().
 *
 * The transfer covers block_size x block_size doubles, except that any
 * index equal to nblocks-1 contributes the edge length n % block_size
 * (or block_size when that remainder is zero) instead.
 */
void get_remote(double *buf, int I, int J)
{
    int owner;
    int edge_len, extent_i, extent_j, nbytes;

    owner = block_owner(I, J);

    edge_len = n%block_size;
    if (edge_len == 0)
        edge_len = block_size;

    /* each index independently selects the short or the full extent */
    extent_i = (I == nblocks-1) ? edge_len : block_size;
    extent_j = (J == nblocks-1) ? edge_len : block_size;

    nbytes = extent_i*extent_j;
    nbytes = nbytes * sizeof(double);

    ARMCI_Get(a[I+J*nblocks], buf, nbytes, owner);
}
Example #5
0
/*
 * Copy block (I, J) into buf: memcpy when the owning process is this one,
 * ARMCI_Get otherwise.  The owning process is block_owner(I, J) divided
 * by th_per_p.
 *
 * The block holds block_size x block_size doubles, except that any index
 * equal to nblocks-1 contributes the edge length n % block_size (or
 * block_size when that remainder is zero) instead.
 *
 * When USE_MUTEX is defined the whole function body runs with `mutex` held.
 */
void get_remote(double *buf, int I, int J)
{
    int owner_proc;
    int edge_len, extent_i, extent_j, nbytes;

#ifdef USE_MUTEX
    THREAD_LOCK(mutex);
#endif

    owner_proc = block_owner(I, J) / th_per_p;

    edge_len = n%block_size;
    if (edge_len == 0)
        edge_len = block_size;

    /* each index independently selects the short or the full extent */
    extent_i = (I == nblocks-1) ? edge_len : block_size;
    extent_j = (J == nblocks-1) ? edge_len : block_size;

    nbytes = extent_i*extent_j;
    nbytes = nbytes * sizeof(double);

    if (proc_owner_is_local(owner_proc, me))
        memcpy(buf, a[I+J*nblocks], nbytes);
    else
        ARMCI_Get(a[I+J*nblocks], buf, nbytes, owner_proc);

#ifdef USE_MUTEX
    THREAD_UNLOCK(mutex);
#endif
}
/*
 * Placeholder for gathering the solution vector.
 *
 * Currently a no-op: the only code in the body is disabled with #if 0 and
 * refers to identifiers that are not declared in this function, so it could
 * not be re-enabled as is.  svec is neither read nor written.
 */
static void gather_solution_vector(double **svec) {
    (void)svec; /* not referenced while the body below is compiled out */
#if 0
    double y[COL];
    if((rc=ARMCI_Get(&vec[i][idx_start-offset], &vec_local[idx_start],
                     bytes, i)))
        ARMCI_Error("armci_nbget failed\n",rc);
#endif
}
Example #7
0
/*
 * Minimal driver: initialises MPI and ARMCI, issues one 1-byte ARMCI_Get
 * whose source and destination are both NULL against process 0, then
 * shuts both libraries down.
 *
 * NOTE(review): presumably this exercises how the library reacts to NULL
 * buffers -- confirm the intended outcome.
 */
int main(int argc, char ** argv) {
  void     *remote_src = NULL;
  void     *local_dst  = NULL;
  const int nbytes     = 1;
  const int target     = 0;

  MPI_Init(&argc, &argv);
  ARMCI_Init();

  ARMCI_Get(remote_src, local_dst, nbytes, target);

  ARMCI_Finalize();
  MPI_Finalize();

  return 0;
}
Example #8
0
/*
 * Verify the distributed result array on process 0.
 *
 * b_ptr: per-process base addresses of the array pieces
 * dims:  dims[0] is the total number of elements
 * map:   map[i] is the first global index owned by process i
 *
 * Process 0 fetches every process's piece with ARMCI_Get and checks that
 * global element k holds dims[0] - k, printing one line per mismatch and a
 * final success/failure line.  All other processes return immediately.
 */
void VERIFY(void **b_ptr, int *dims, int *map) {
    int i, j, count, icnt, ichk, lmin, lmax, maxcount;
    int *buf;
    int me, nprocs;

    /* Find local processor ID and number of processors */
    me     = armci_msg_me();
    nprocs = armci_msg_nproc();

    /* Only process 0 verifies that inversion is correct */
    if (me != 0) return;

    /* Size the receive buffer from the largest piece described by map[];
       an even split of dims[0] is not guaranteed to be large enough. */
    maxcount = 1;
    for (i=0; i<nprocs; i++) {
      lmax = (i<nprocs-1) ? map[i+1]-1 : dims[0]-1;
      count = lmax-map[i]+1;
      if (count > maxcount) maxcount = count;
    }
    buf = (int*)malloc((size_t)maxcount*sizeof(int));
    if (buf == NULL) {
      ARMCI_Error("VERIFY: unable to allocate verification buffer", maxcount);
      return;
    }

    icnt = 0;
    ichk = 0;
    for (i=0; i<nprocs; i++) {
      /* Find min and max indices owned by processor i */
      lmin = map[i];
      if (i<nprocs-1) {
        lmax = map[i+1]-1;
      } else {
        lmax = dims[0]-1;
      }
      count = lmax-lmin+1;
      /* nothing to fetch or check for a process that owns no elements */
      if (count <= 0) continue;

      ARMCI_Get(b_ptr[i], (void*)buf, (int)(sizeof(int)*count), i);

      /* check values in buffer */
      for (j=0; j<count; j++) {
        /* printf("p[%d] b[%d]: %d\n",me,icnt,buf[j]); */
        if (buf[j] != dims[0] - icnt) {
          printf("Error found for element %d b: %d != a: %d\n",
              icnt,buf[j],dims[0]-icnt);
          ichk = 1;
        }
        icnt++;
      }
    }
    if (ichk == 0) {
      printf("1D transpose successful. No errors found\n");
    } else {
      printf("1D transpose failed\n");
    }
    free(buf);
}
/*
 * Bring elements [start, end] of the distributed vector into vec_local.
 *
 * n:         not used in this function
 * start,end: inclusive global index range to fetch
 * vec_local: destination, indexed by global index
 * vec:       per-process base pointers of the distributed vector
 *
 * The owning processes are found by scanning proc_row_list for the first
 * entries greater than start and end.  The part owned by `me` is copied
 * directly; every other part is requested with ARMCI_NbGet on
 * gHandle[count].  Whenever the process being visited differs from
 * prev_proc, count is advanced and a fresh aggregate handle is initialised.
 * No wait is issued here.
 */
static void get_data(int n, int start, int end, double *vec_local,
                     double **vec) {
    int p, k, status, nbytes, base;
    int first_proc = -1, last_proc = -1;
    int lo, hi;

    /* locate the first process whose upper bound exceeds start / end */
    for (p = 0; p < nproc; p++) {
        if (first_proc < 0 && proc_row_list[p] > start) first_proc = p;
        if (last_proc < 0 && proc_row_list[p] > end) last_proc = p;
    }
    if (first_proc < 0 || last_proc < 0) ARMCI_Error("Invalid Process Ids", -1);

    for (p = first_proc; p <= last_proc; p++) {
        /* first global index owned by process p */
        base = (p == 0) ? 0 : proc_row_list[p-1];

        /* clip the requested range to what process p owns */
        lo = (p == first_proc) ? start : base;
        hi = (p == last_proc) ? end : proc_row_list[p] - 1;

        if (p != prev_proc) {
            ++count;
            prev_proc = p;
            ARMCI_INIT_HANDLE(&gHandle[count]);
            ARMCI_SET_AGGREGATE_HANDLE(&gHandle[count]);
        }

        if (p == me) {
            /* local */
            for (k = lo; k <= hi; k++) vec_local[k] = vec[me][k - base];
            continue;
        }

        /* remote */
        nbytes = (hi - lo + 1) * sizeof(double);
        vec_local[lo] = -1;
        status = ARMCI_NbGet(&vec[p][lo - base], &vec_local[lo],
                             nbytes, p, &gHandle[count]);
        if (status)
            ARMCI_Error("armci_nbget failed\n", status);
    }
}
Example #10
0
/*
 * Copy block (I, J) from the process reported by block_owner() into buf,
 * adding the elapsed wall-clock time to comm_time and bumping get_cntr.
 *
 * The block holds block_size x block_size doubles, except that any index
 * equal to nblocks-1 contributes the edge length n % block_size (or
 * block_size when that remainder is zero) instead.
 *
 * With MPI2_ONESIDED defined the transfer is an MPI_Get inside an exclusive
 * lock epoch on `win`, addressed by the byte offset of the block from
 * ptr[proc_owner]; otherwise it is a single ARMCI_Get.
 */
void get_remote(double *buf, int I, int J)
{
    int proc_owner;
    int edge, size;
    double t1;

    proc_owner = block_owner(I, J);

    edge = n%block_size;
    if (edge == 0) {
        edge = block_size;
    }

    if ((I == nblocks-1) && (J == nblocks-1)) {
        size = edge*edge;
    }
    else if ((I == nblocks-1) || (J == nblocks-1)) {
        size = edge*block_size;
    }
    else {
        size = block_size*block_size;
    }
    size = size * sizeof(double);

    t1 = MPI_Wtime();
#ifdef MPI2_ONESIDED
    {
        /* MPI_Get takes the displacement as MPI_Aint; keeping the pointer
           difference in that type avoids truncating it to int */
        MPI_Aint target_disp = (MPI_Aint)( ((char*)(a[I+J*nblocks])) -
                                           ((char*)(ptr[proc_owner])) );
        if(target_disp<0) {
            printf("ERROR!: target disp is < 0, target_disp= %lld\n",
                   (long long)target_disp);
            MPI_Abort(MPI_COMM_WORLD, 1);
        }
        MPI_Win_lock(MPI_LOCK_EXCLUSIVE, proc_owner, 0, win);
        MPI_Get(buf, size, MPI_CHAR, proc_owner, target_disp, size,
                MPI_CHAR, win);
        MPI_Win_unlock(proc_owner, win);
    }
#else
    ARMCI_Get(a[I+J*nblocks], buf, size, proc_owner);
#endif
    comm_time += MPI_Wtime() - t1;
    get_cntr++;
}
/*
 * ARMCI get / put / accumulate ring test.
 *
 * Each of NUM_ITERATIONS iterations collectively allocates DATA_SZ bytes
 * per process and then:
 *   1. fills the local segment with rank*test_iter, gets the right
 *      neighbour's segment and checks it;
 *   2. puts rank*test_iter into the left neighbour's segment and checks the
 *      values written locally by the right neighbour;
 *   3. sets the local segment to rank and accumulates rank scaled by
 *      test_iter into the left neighbour, then checks the local sum.
 * GET and PUT mismatches abort the job.  ACC mismatches are only printed.
 *
 * NOTE(review): because the abort in the ACC check is commented out, the
 * final "PASS" line is printed even after ACC mismatches -- confirm whether
 * that is intended.
 */
int main(int argc, char ** argv) {
  int    rank, nproc, i, test_iter;
  int    left, right, expected;
  int   *my_data, *buf;
  void **base_ptrs;

  MPI_Init(&argc, &argv);
  ARMCI_Init();

  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &nproc);

  right = (rank+1) % nproc;
  left  = (rank+nproc-1) % nproc;

  if (rank == 0) printf("Starting ARMCI test with %d processes\n", nproc);

  buf = malloc(DATA_SZ);
  base_ptrs = malloc(sizeof(void*)*nproc);

  for (test_iter = 0; test_iter < NUM_ITERATIONS; test_iter++) {
    if (rank == 0) printf(" + iteration %d\n", test_iter);

    /*** Allocate the shared array ***/
    ARMCI_Malloc(base_ptrs, DATA_SZ);
    my_data = base_ptrs[rank];

    /*** Get from our right neighbor and verify correct data ***/
    ARMCI_Access_begin(my_data);
    for (i = 0; i < DATA_NELTS; i++) my_data[i] = rank*test_iter;
    ARMCI_Access_end(my_data);

    ARMCI_Barrier(); // Wait for all updates to data to complete

    ARMCI_Get(base_ptrs[right], buf, DATA_SZ, right);

    expected = right*test_iter;
    for (i = 0; i < DATA_NELTS; i++) {
      if (buf[i] != expected) {
        /* report the value that was actually compared against */
        printf("%d: GET expected %d, got %d\n", rank, expected, buf[i]);
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
    }

    ARMCI_Barrier(); // Wait for all gets to complete

    /*** Put to our left neighbor and verify correct data ***/
    for (i = 0; i < DATA_NELTS; i++) buf[i] = rank*test_iter;
    ARMCI_Put(buf, base_ptrs[left], DATA_SZ, left);

    ARMCI_Barrier(); // Wait for all updates to data to complete

    /* our segment was written by the right neighbor */
    expected = right*test_iter;
    ARMCI_Access_begin(my_data);
    for (i = 0; i < DATA_NELTS; i++) {
      if (my_data[i] != expected) {
        printf("%d: PUT expected %d, got %d\n", rank, expected, my_data[i]);
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
    }
    ARMCI_Access_end(my_data);

    ARMCI_Barrier(); // Wait for all checks to complete

    /*** Accumulate to our left neighbor and verify correct data ***/
    for (i = 0; i < DATA_NELTS; i++) buf[i] = rank;

    ARMCI_Access_begin(my_data);
    for (i = 0; i < DATA_NELTS; i++) my_data[i] = rank;
    ARMCI_Access_end(my_data);
    ARMCI_Barrier();

    int scale = test_iter;
    ARMCI_Acc(ARMCI_ACC_INT, &scale, buf, base_ptrs[left], DATA_SZ, left);

    ARMCI_Barrier(); // Wait for all updates to data to complete

    /* local value plus the right neighbor's scaled contribution */
    expected = rank + right*test_iter;
    ARMCI_Access_begin(my_data);
    for (i = 0; i < DATA_NELTS; i++) {
      if (my_data[i] != expected) {
        printf("%d: ACC expected %d, got %d\n", rank, expected, my_data[i]);
        //MPI_Abort(MPI_COMM_WORLD, 1);
      }
    }
    ARMCI_Access_end(my_data);

    ARMCI_Free(my_data);
  }

  free(buf);
  free(base_ptrs);

  if (rank == 0) printf("Test complete: PASS.\n");

  ARMCI_Finalize();
  MPI_Finalize();

  return 0;
}
Example #12
0
/*
 * Get-latency benchmark between process 0 and process 1.
 *
 * Every process collectively allocates LOOP doubles.  Process 0 then reads
 * single doubles from process me+1: a short untimed warm-up, LOOP timed
 * blocking ARMCI_Get calls, and LOOP ARMCI_NbGet/ARMCI_Wait pairs whose
 * issue and wait times are accumulated separately.  It prints the per-call
 * averages scaled by 1e6.  All other processes sleep for one second and
 * then join the closing barrier.
 *
 * Argument order reminder: ARMCI_Get(src on `proc`, local dst, bytes, proc).
 * Passing the local buffer as src and the remote one as dst is a bug.
 */
int main(int argc, char **argv) {
  int i;
  double **myptrs;
  double t0, t1, tnbget=0, tnbwait=0, t2=0;

  MP_INIT(argc,argv);
  ARMCI_Init();

  MP_PROCS(&nprocs);
  MP_MYID(&me);

  if (nprocs < 2)
    ARMCI_Error("This program requires at least two processes", 1);

  myptrs = (double **)malloc(sizeof(double *)*nprocs);
  ARMCI_Malloc((void **)myptrs, LOOP*sizeof(double));

  MP_BARRIER();

  if(me == 0) {
    /* warm-up; bounded by LOOP so it never steps past the allocation */
    for(i = 0; i < 10 && i < LOOP; i++) {
      ARMCI_Get(myptrs[me+1]+i, myptrs[me]+i, sizeof(double), me+1);
    }

    t0 = MP_TIMER();
    for(i = 0; i < LOOP; i++) {
      /* element i on both sides, matching the warm-up and NbGet loops */
      ARMCI_Get(myptrs[me+1]+i, myptrs[me]+i, sizeof(double), me+1);
    }
    t1 = MP_TIMER();

    printf("\nGet Latency=%lf\n", 1e6*(t1-t0)/LOOP);
    fflush(stdout);

    t1 = t0 = 0;

    for(i = 0; i < LOOP; i++) {
      armci_hdl_t nbh;
      ARMCI_INIT_HANDLE(&nbh);

      t0 = MP_TIMER();
      ARMCI_NbGet(myptrs[me+1]+i, myptrs[me]+i, sizeof(double), me+1, &nbh);
      t1 = MP_TIMER();
      ARMCI_Wait(&nbh);
      t2 = MP_TIMER();

      tnbget  += (t1-t0);
      tnbwait += (t2-t1);
    }

    printf("\nNb Get Latency=%lf Nb Wait=%lf\n",1e6*tnbget/LOOP,1e6*tnbwait/LOOP);fflush(stdout);
  }

  else
    sleep(1);

  MP_BARRIER();

  /* release the ARMCI segment and the local pointer table */
  ARMCI_Free(myptrs[me]);
  free(myptrs);

  ARMCI_Finalize();
  MP_FINALIZE();

  return 0;
}
Example #13
0
/*
 * Time `loop` get operations from process `proc` and return the average
 * time per operation.
 *
 * src_buf: source address on `proc`
 * dst_buf: local destination address
 * chunk:   edge length of the section; chunk*sizeof(double) bytes per row,
 *          chunk rows
 * loop:    number of timed operations
 * proc:    process to read from
 * levels:  0 selects a contiguous ARMCI_Get of one row, non-zero a strided
 *          ARMCI_GetS (or, with FORCE_1D and levels > 0, one ARMCI_Get per
 *          row)
 *
 * Source and destination alternate between the given addresses and ones
 * 128 doubles further on.  When CHECK_RESULT is set, each transfer is
 * compared against a locally generated reference array.  A measured total
 * of exactly zero is replaced by 1e-6 and counted in warn_accuracy.
 */
double time_get(double *src_buf, double *dst_buf, int chunk, int loop,
                int proc, int levels)
{
    int i, bal = 0;

    int stride[2];
    int count[2];
    int stride_levels = levels;
    double *tmp_buf = NULL, *tmp_buf_ptr = NULL;

    double start_time, stop_time, total_time = 0;

    stride[0] = SIZE * sizeof(double);
    count[0] = chunk * sizeof(double); count[1] = chunk;

    if(CHECK_RESULT) {
        tmp_buf = (double *)malloc(SIZE * SIZE * sizeof(double));
        assert(tmp_buf != NULL);

        fill_array(tmp_buf, SIZE*SIZE, proc);
        tmp_buf_ptr = tmp_buf;
    }

    start_time = TIMER();
    for(i=0; i<loop; i++) {

#ifdef FORCE_1D
        int j;
        if(levels>0)for(j=0; j< count[1]; j++){
           char *s = (char*) src_buf, *d= (char*)dst_buf;
           s += j*stride[0]; d += j*stride[0];
           /* transfer row j; passing src_buf/dst_buf here would move row 0
              count[1] times */
           ARMCI_Get(s, d, count[0],proc);
        }
        else
#endif
        if(levels)
           ARMCI_GetS(src_buf, stride, dst_buf, stride, count, stride_levels,proc);
        else
           ARMCI_Get(src_buf, dst_buf,count[0], proc);

        if(CHECK_RESULT) {
            sprintf(check_type, "ARMCI_GetS:");
            check_result(tmp_buf_ptr, dst_buf, stride, count, stride_levels);
        }

        /* prepare next src and dst ptrs: avoid cache locality */
        if(bal == 0) {
            src_buf += 128;
            dst_buf += 128;
            if(CHECK_RESULT) tmp_buf_ptr += 128;
            bal = 1;
        } else {
            src_buf -= 128;
            dst_buf -= 128;
            if(CHECK_RESULT) tmp_buf_ptr -= 128;
            bal = 0;
        }
    }
    stop_time = TIMER();
    total_time = (stop_time - start_time);

    if(CHECK_RESULT) free(tmp_buf);

    if(total_time == 0.0){
       total_time=0.000001; /* workaround for inaccurate timers */
       warn_accuracy++;
    }
    return(total_time/loop);
}
Example #14
0
/*
 * Times blocking and non-blocking put, get and accumulate on contiguous
 * 1-D sections whose length doubles from 1 to MAXELEMS doubles.
 *
 * dry_run: when non-zero, everything is executed but nothing is printed.
 *
 * For each section size:
 *   - process 0 issues ntimes puts/gets to/from each of processes
 *     1..nproc-1, first blocking, then non-blocking with the issue and the
 *     ARMCI_Wait timed separately;
 *   - every process issues ntimes accumulates into ddst[0] on process 0,
 *     blocking and then non-blocking.
 * After each phase the processes synchronise (barrier, ARMCI_AllFence,
 * barrier), optionally verify, and zero their ddst piece.  Process 0 prints
 * one line per size with each accumulated time divided by ntimes, in the
 * column order of the header.
 */
void test_perf_nb(int dry_run) {

    int i, j, loop, rc, bytes, elems[2] = {MAXPROC, MAXELEMS};
    int stride, ntimes;
    double stime, t1, t2, t3, t4, t5, t6, t7, t8, t9;
    double *dsrc[MAXPROC], scale=1.0;
    armci_hdl_t hdl_get, hdl_put, hdl_acc;

    create_array((void**)ddst, sizeof(double),2, elems);
    create_array((void**)dsrc, sizeof(double),1, &elems[1]);

    if(!dry_run && me == 0) {
        printf("\n\t\t\tRemote 1-D Array Section\n");
        printf("section    get      nbget    wait     put     nbput  ");
        printf("   wait     acc     nbacc     wait\n");
        printf("-------  -------- -------- -------- -------- --------");
        printf(" -------- -------- -------- --------\n");
        fflush(stdout);
    }

    for(loop=1; loop<=MAXELEMS; loop*=2) {

        elems[1] = loop;
        ntimes = (int)sqrt((double)(MAXELEMS/elems[1]));
        if(ntimes <1) ntimes=1;

        /* -------------------------- SETUP --------------------------- */
        /* initializing non-blocking handles, time, src & dst buffers */
        ARMCI_INIT_HANDLE(&hdl_put);
        ARMCI_INIT_HANDLE(&hdl_get);
        ARMCI_INIT_HANDLE(&hdl_acc);
        t1 = t2 = t3 = t4 = t5 = t6 = t7 = t8 = t9 = 0.0;
        for(i=0; i<elems[1]; i++) dsrc[me][i]=i*1.001*(me+1);
        for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0;
        MP_BARRIER();

        /* bytes transferred */
        bytes = sizeof(double)*elems[1];
        MP_BARRIER();

        /* -------------------------- PUT/GET -------------------------- */
        if(me == 0) {
            for(i=1; i<nproc; i++) {
                stime=MP_TIMER();
                for(j=0; j<ntimes; j++)
                    if((rc=ARMCI_Put(&dsrc[me][0], &ddst[i][me*elems[1]], bytes,i)))
                        ARMCI_Error("armci_put failed\n",rc);
                t1 += MP_TIMER()-stime;
            }
        }
        MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER();
        if(VERIFY) verify_results(PUT, elems);
        for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0;
        MP_BARRIER();

        if(me == 0) {
            for(i=1; i<nproc; i++) {
                stime=MP_TIMER();
                for(j=0; j<ntimes; j++)
                    if((rc=ARMCI_Get(&dsrc[i][0], &ddst[me][i*elems[1]], bytes,i)))
                        ARMCI_Error("armci_get failed\n",rc);
                t4 += MP_TIMER()-stime;
            }
        }
        MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER();
        if(VERIFY) verify_results(GET, elems);
        for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0;
        MP_BARRIER();

        /* ------------------------ nb PUT/GET ------------------------- */
        if(me == 0) {
            for(i=1; i<nproc; i++) {
                for(j=0; j<ntimes; j++) {
                    stime=MP_TIMER();
                    if((rc=ARMCI_NbPut(&dsrc[me][0], &ddst[i][me*elems[1]], bytes,
                                       i, &hdl_put)))
                        ARMCI_Error("armci_nbput failed\n",rc);
                    t2 += MP_TIMER()-stime;
                    stime=MP_TIMER();
                    ARMCI_Wait(&hdl_put);
                    t3 += MP_TIMER()-stime;
                }
            }
        }
        MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER();
        if(VERIFY) verify_results(PUT, elems);
        for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0;
        MP_BARRIER();

        if(me == 0) {
            for(i=1; i<nproc; i++) {
                for(j=0; j<ntimes; j++) {
                    stime=MP_TIMER();
                    if((rc=ARMCI_NbGet(&dsrc[i][0], &ddst[me][i*elems[1]], bytes,
                                       i, &hdl_get)))
                        ARMCI_Error("armci_nbget failed\n",rc);
                    t5 += MP_TIMER()-stime;
                    stime=MP_TIMER();
                    ARMCI_Wait(&hdl_get);
                    t6 += MP_TIMER()-stime;
                }
            }
        }
        MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER();
        if(VERIFY) verify_results(GET, elems);
        for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0;
        MP_BARRIER();


        /* ------------------------ Accumulate ------------------------- */
        for(i=0; i<elems[1]; i++) dsrc[me][i]=1.0;
        MP_BARRIER();
        stride = elems[1]*sizeof(double); scale  = 1.0;
        for(j=0; j<ntimes; j++) {
            stime=MP_TIMER();
            if((rc=ARMCI_AccS(ARMCI_ACC_DBL, &scale, &dsrc[me][0], &stride,
                              &ddst[0][0], &stride, &bytes, 0, 0)))
                ARMCI_Error("armci_acc failed\n",rc);
            t7 += MP_TIMER()-stime;

            MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER();
            if(VERIFY) verify_results(ACC, elems);
            for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0;
            MP_BARRIER();
        }

#if 1
        /* This section is currently compiled in; switch the guard to 0 to
           leave out the non-blocking accumulate measurements. */
        /* ---------------------- nb-Accumulate ------------------------ */
        for(i=0; i<elems[1]; i++) dsrc[me][i]=1.0;
        MP_BARRIER();
        stride = elems[1]*sizeof(double); scale  = 1.0;
        for(j=0; j<ntimes; j++) {
            stime=MP_TIMER();
            if((rc=ARMCI_NbAccS(ARMCI_ACC_DBL, &scale, &dsrc[me][0], &stride,
                                &ddst[0][0], &stride, &bytes, 0, 0, &hdl_acc)))
                ARMCI_Error("armci_nbacc failed\n",rc);
            t8 += MP_TIMER()-stime;
            stime=MP_TIMER();
            ARMCI_Wait(&hdl_acc);
            t9 += MP_TIMER()-stime;

            MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER();
            if(VERIFY) verify_results(ACC, elems);
            for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0;
            MP_BARRIER();
        }
#endif

        /* print timings: get, nbget, wait, put, nbput, wait, acc, nbacc, wait */
        if(!dry_run && me==0)
            printf("%d\t %.2e %.2e %.2e %.2e %.2e %.2e %.2e %.2e %.2e\n",
                   bytes, t4/ntimes, t5/ntimes, t6/ntimes, t1/ntimes,
                   t2/ntimes, t3/ntimes, t7/ntimes, t8/ntimes, t9/ntimes);
    }

    ARMCI_AllFence();
    MP_BARRIER();

    if(!dry_run && me==0){printf("O.K.\n"); fflush(stdout);}
    destroy_array((void **)ddst);
    destroy_array((void **)dsrc);
}
Example #15
0
/*
 * ARMCI_Get latency benchmark between process 0 and process 1.
 *
 * Every process collectively allocates MAX_MSG_SIZE * (ITERATIONS + SKIP)
 * bytes and fills its own part with 1.0 + rank.  For message sizes from
 * sizeof(double) up to MAX_MSG_SIZE (doubling each step) process 0 performs
 * ITERATIONS + SKIP gets from process `dest` at consecutive offsets, timing
 * all but the first SKIP, prints the average scaled to microseconds,
 * checks that the received doubles equal 1.0 + dest, and refills its
 * buffer.
 *
 * Returns 0 on success, 1 when fewer than two processes are available, and
 * -1 from process 0 when validation fails.
 *
 * NOTE(review): the validation-failure path returns without calling
 * ARMCI_Finalize or joining the closing barrier -- confirm that the
 * launcher is expected to tear the job down in that case.
 */
int main(int argc, char **argv)
{

    int i, rank, nranks, msgsize, dest;
    long bufsize;
    double **buffer;
    double t_start = 0.0, t_stop;

    ARMCI_Init_args(&argc, &argv);

    rank = A1_Process_id(A1_GROUP_WORLD);
    nranks = A1_Process_total(A1_GROUP_WORLD);

    /* process 0 reads from process 1, so that process has to exist */
    if (nranks < 2)
    {
        if (rank == 0)
        {
            printf("This benchmark requires at least 2 processes\n");
            fflush(stdout);
        }
        ARMCI_Finalize();
        return 1;
    }

    bufsize = MAX_MSG_SIZE * (ITERATIONS + SKIP);
    buffer = (double **) malloc(sizeof(double *) * nranks);
    ARMCI_Malloc((void **) buffer, bufsize);

    for (i = 0; i < bufsize / sizeof(double); i++)
    {
        *(buffer[rank] + i) = 1.0 + rank;
    }

    A1_Barrier_group(A1_GROUP_WORLD);

    if (rank == 0)
    {

        printf("ARMCI_Get Latency in usec \n");
        printf("%20s %22s \n", "Message Size", "Latency");
        fflush(stdout);

        dest = 1;

        for (msgsize = sizeof(double); msgsize <= MAX_MSG_SIZE; msgsize *= 2)
        {

            for (i = 0; i < ITERATIONS + SKIP; i++)
            {
                /* byte offset computed in size_t so it cannot overflow int */
                size_t offset = (size_t) i * (size_t) msgsize;

                if (i == SKIP) t_start = A1_Time_seconds();

                ARMCI_Get((void *) ((size_t) buffer[dest] + offset),
                          (void *) ((size_t) buffer[rank] + offset),
                          msgsize, dest);

            }
            t_stop = A1_Time_seconds();
            printf("%20d %20.2f \n", msgsize, ((t_stop - t_start) * 1000000)
                    / ITERATIONS);
            fflush(stdout);

            for (i = 0; i < ((ITERATIONS + SKIP) * msgsize) / sizeof(double); i++)
            {
                if (*(buffer[rank] + i) != (1.0 + dest))
                {
                    printf("Data validation failed At displacement : %d Expected : %f Actual : %f \n",
                           i,
                           (1.0 + dest),
                           *(buffer[rank] + i));
                    fflush(stdout);
                    return -1;
                }
            }

            /* restore the local pattern before the next message size */
            for (i = 0; i < bufsize / sizeof(double); i++)
            {
                *(buffer[rank] + i) = 1.0 + rank;
            }
        }

    }

    A1_Barrier_group(A1_GROUP_WORLD);

    ARMCI_Free(buffer[rank]);
    free(buffer);

    ARMCI_Finalize();

    return 0;
}
Example #16
0
/*
 * Contiguous put/get/accumulate timing sweep.
 *
 * buffer_size: bytes allocated per process for each of the three ARMCI
 *              segments; also the largest message size measured
 * op:          PUT, GET or ACC; any other value is reported through
 *              ARMCI_Error
 *
 * For message sizes 16, 32, ... up to buffer_size, process 0 performs
 * WARMUP untimed plus ITER_SMALL (or ITER_LARGE above MEDIUM_MESSAGE_SIZE)
 * timed operations against process 1 and prints the size, the time per
 * operation and the bytes moved per unit of time.  All processes meet at a
 * barrier after every size.
 */
static void contig_test(size_t buffer_size, int op)
{
    void **remote_seg = malloc(nproc * sizeof(void*));
    void **outgoing   = malloc(nproc * sizeof(void*));
    void **incoming   = malloc(nproc * sizeof(void*));
    double *times     = malloc(nproc * sizeof(double));
    const int target  = 1;
    double scale      = 1.0;
    size_t msg_size;

    ARMCI_Malloc(remote_seg, buffer_size);
    ARMCI_Malloc(outgoing, buffer_size);
    ARMCI_Malloc(incoming, buffer_size);

    /* initialize what we're putting */
    fill_array((double*)outgoing[me], buffer_size/sizeof(double), me);

    for (msg_size = 16; msg_size <= buffer_size; msg_size *= 2) {
        int reps = msg_size > MEDIUM_MESSAGE_SIZE ? ITER_LARGE : ITER_SMALL;
        int rep;
        double t_start, t_end;

        if (0 == me) {
            for (rep = 0; rep < reps + WARMUP; ++rep) {

                /* the clock starts once the warm-up operations are done */
                if (rep == WARMUP)
                    t_start = dclock();

                if (op == PUT) {
                    ARMCI_Put(outgoing[me], remote_seg[target], msg_size,
                            target);
                } else if (op == GET) {
                    ARMCI_Get(remote_seg[target], incoming[me], msg_size,
                            target);
                } else if (op == ACC) {
                    ARMCI_Acc(ARMCI_ACC_DBL, &scale,
                            outgoing[me], remote_seg[target], msg_size,
                            target);
                } else {
                    ARMCI_Error("oops", 1);
                }
            }
        }
        /* calculate total time and average time */
        t_end = dclock();
        ARMCI_Barrier();

        if (0 == me) {
            double elapsed = t_end - t_start;

            printf("%8zu\t\t%6.2f\t\t%10.2f\n",
                    msg_size,
                    elapsed/reps,
                    msg_size*reps/elapsed);
        }
    }

    ARMCI_Free(remote_seg[me]);
    ARMCI_Free(outgoing[me]);
    ARMCI_Free(incoming[me]);
    free(remote_seg);
    free(outgoing);
    free(incoming);
    free(times);
}
Example #17
0
/** Non-blocking get operation.
  *
  * Note: this implementation is not actually non-blocking.  The request is
  * forwarded to ARMCI_Get and that call's result is returned unchanged.
  *
  * @param src    Source address, forwarded to ARMCI_Get
  * @param dst    Destination address, forwarded to ARMCI_Get
  * @param bytes  Number of bytes to transfer
  * @param proc   Process to read from
  * @param handle Not read or written by this function
  * @return       Whatever ARMCI_Get returns
  */
int ARMCI_NbGet(void *src, void *dst, int bytes, int proc, armci_hdl_t *handle) {
  int status;

  status = ARMCI_Get(src, dst, bytes, proc);
  return status;
}