Example #1
0
/*
 * ARMCI strided put test.
 *
 * Every rank allocates an XDIM x YDIM array of doubles through ARMCI_Malloc
 * and then, ITERATIONS times, fills a local source buffer with
 * (rank + iteration) and writes it with ARMCI_PutS into the array owned by
 * its right-hand neighbour, (rank + 1) % nranks.  After a barrier each rank
 * checks its own array against the value its left-hand neighbour wrote in
 * the final iteration.
 *
 * Returns 0 when the local array validates, 1 otherwise.
 */
int main(int argc, char **argv) {
    int i, j, rank, nranks, peer, left, bufsize, errors;
    double **buffer, *src_buf;
    double expected;
    int count[2], src_stride, trg_stride, stride_level;

    MPI_Init(&argc, &argv);
    ARMCI_Init();

    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nranks);

    /* One base pointer per rank; ARMCI_Malloc fills the whole table. */
    buffer = malloc(sizeof(*buffer) * nranks);
    if (buffer == NULL) {
        fprintf(stderr, "%d: unable to allocate the pointer table\n", rank);
        MPI_Abort(MPI_COMM_WORLD, 1);
    }

    bufsize = XDIM * YDIM * sizeof(double);
    ARMCI_Malloc((void **) buffer, bufsize);
    src_buf = ARMCI_Malloc_local(bufsize);

    if (rank == 0)
        printf("ARMCI Strided Put Test:\n");

    /* Source and target are both dense arrays of rows of XDIM doubles. */
    src_stride = XDIM * sizeof(double);
    trg_stride = XDIM * sizeof(double);
    stride_level = 1;

    count[1] = YDIM;                  /* number of rows         */
    count[0] = XDIM * sizeof(double); /* bytes per row          */

    ARMCI_Barrier();

    peer = (rank + 1) % nranks;

    for (i = 0; i < ITERATIONS; i++) {

        for (j = 0; j < XDIM * YDIM; j++) {
            src_buf[j] = rank + i;
        }

        ARMCI_PutS(
            src_buf,
            &src_stride,
            (void *) buffer[peer],
            &trg_stride,
            count,
            stride_level,
            peer);
    }

    ARMCI_Barrier();

    /*
     * The only writer of buffer[rank] is the left-hand neighbour, and a put
     * overwrites rather than accumulates, so what must remain is the value
     * from that neighbour's last iteration.
     * NOTE(review): relies on ARMCI completing puts from one origin to one
     * target in issue order and on ARMCI_Barrier fencing outstanding puts.
     */
    left = (rank + nranks - 1) % nranks;
    expected = (double) left + (ITERATIONS - 1);

    ARMCI_Access_begin(buffer[rank]);
    for (i = errors = 0; i < XDIM; i++) {
        for (j = 0; j < YDIM; j++) {
            const double actual = *(buffer[rank] + i + j * XDIM);
            const double diff   = actual - expected;

            /* Two-sided tolerance: a value that is too small is an error too. */
            if (diff > 1e-10 || diff < -1e-10) {
                printf("%d: Data validation failed at [%d, %d] expected=%f actual=%f\n",
                       rank, j, i, expected, actual);
                errors++;
                fflush(stdout);
            }
        }
    }
    ARMCI_Access_end(buffer[rank]);

    ARMCI_Free((void *) buffer[rank]);
    ARMCI_Free_local(src_buf);
    free(buffer);

    ARMCI_Finalize();
    MPI_Finalize();

    if (errors == 0) {
        printf("%d: Success\n", rank);
        return 0;
    } else {
        printf("%d: Fail\n", rank);
        return 1;
    }
}
Example #2
0
void test_notify(int ndim)
{
  int lo[MAXDIMS], hi[MAXDIMS], count[MAXDIMS];
  int stride[MAXDIMS];
  int dim, elems;
  int i, Idx = 1, idx = 0;
  void *b[MAXPROC], *a[MAXPROC];
  int left = (me + nproc - 1) % nproc;
  int right = (me + 1) % nproc;
  int loopcnt = 1, less = 2, strl; /* less>1 takes a partial plane */


  /* create shared and local arrays */
  create_array(b, sizeof(double), ndim, dimsB);
  create_array(a, sizeof(double), ndim, dimsB);

  elems = get_elems(ndim, stride, dimsB, sizeof(double));
  init((double *)a[me], ndim, elems, dimsB);

  for (i = 0; i < ndim; i++) {
    lo[i] = 0;
    hi[i] = (less > dimsB[i]) ? dimsB[i] - 1 : dimsB[i] - less;
    count[i] = hi[i] - lo[i] + 1;
  }
  count[0] *= sizeof(double);

  for (i = 0; i < ndim - 1; i++) {
    Idx *= dimsB[i];
  }

  ARMCI_Barrier();
  if (me == 0) {
    printf("--------array[%d", dimsB[0]);
    for (dim = 1; dim < ndim; dim++) {
      printf(",%d", dimsB[dim]);
    }
    printf("]--------\n");
    fflush(stdout);
  }

  ARMCI_Barrier();
  loopcnt = (ndim > 1) ? dimsB[ndim-1] : 1;
  strl    = (ndim > 1) ? ndim - 2 : 0; /* strides of the subpatch to transfer */

  for (i = 0; i < loopcnt; i++) {
    int wc;

    if (me == 0) {

      ARMCI_PutS((double *)a[me] + idx, stride,
                 (double *)b[left] + idx, stride, count, strl, left);
#if DEBUG_
      printf("%d-%d: ps=%p pd=%p i=%d idx=%d count=%d\n", me, left, (double *)
             a[me] + idx, (double *)b[left] + idx, i, idx, count[0]);
      fflush(stdout);
#endif
      (void)armci_notify(left);
      (void)armci_notify_wait(right, &wc);

    }
    else {


      (void)armci_notify_wait(right, &wc);
      ARMCI_PutS((double *)b[me] + idx, stride,
                 (double *)b[left] + idx, stride, count, strl, left);
#if DEBUG_
      printf("%d: ps=%p pd=%p i=%d idx=%d count=%d\n", me, (double *)b[me] + idx,
             (double *)b[left] + idx, i, idx, count[0]);
      fflush(stdout);
#endif
      (void)armci_notify(left);
    }

    idx += Idx; /* advance to the next slab */
  }

  ARMCI_Barrier();

  if (me == 0) {
    compare_patches(0., ndim, (double *)a[0], lo, hi, dimsB,
                    (double *)b[0], lo, hi, dimsB);
    printf("OK\n");
  }

  ARMCI_Barrier();
  destroy_array(b);
  destroy_array(a);
}
Example #3
0
/* test Put/Get/Acc sequence regardless of communication pattern
 *  th_idx -- thread index forwarded to init_array/print_array/check_PutGetAcc
 *  tgt -- remote target for put/get/acc (none if -1)
 *  rmt -- list of remote threads that put/acc to here (correctness is checked here)
 *  rmt_cnt -- # of threads in rmt
 *
 * The ARMCI calls are issued outside assert() so that the communication
 * still happens when the file is compiled with NDEBUG; only the check of
 * the return code is compiled out in that case.
 */
void test_PutGetAcc(int th_idx, int tgt, int *rmt, int rmt_cnt)
{
    /* a - local thread, b - remote thread */
    int a, b, b_proc, stride[2], count[2];
    int i, j;
    int rc = 0;
    void *src, *dst;
#ifdef DEBUG
    for (i = 0, cbufl = 0; i < rmt_cnt; i++)
        cbufl += sprintf(cbuf+cbufl, " %d", rmt[i]);
    prndbg(th_idx, "test_PutGetAcc: put/acc to %d, get from %d, check put/acc from %s\n",
           tgt, tgt, rmt_cnt ? cbuf : "none");
#endif
    a = TH_ME;
    /* One stride level with a single row of ASIZE_BYTES bytes. */
    stride[0] = ASIZE_BYTES;
    count[0] = ASIZE_BYTES; count[1] = 1;

    /* init arrays */
    init_array(th_idx, ptrs1[TH_ME]);
    init_array(th_idx, ptrs2[TH_ME]);
    MT_BARRIER();

    /* put - put a.ptrs1[b] into b.ptrs2[a] */
    if (tgt != -1) {
        b = tgt;
        b_proc = TH2PROC(b);
        for (i = 0; i < iters; i++) {
            src = &AELEM(ptrs1[a], b, i, 0); /* a.ptrs1[b] */
            dst = &AELEM(ptrs2[b], a, i, 0); /* b.ptrs2[a] */
            rc = ARMCI_PutS(src, stride, dst, stride, count, 1, b_proc);
            assert(rc == 0);
        }
        ARMCI_Fence(b_proc);
    }
    MT_BARRIER();
    print_array(th_idx, "PUT:ptrs1[TH_ME]", ptrs1[TH_ME]);
    print_array(th_idx, "PUT:ptrs2[TH_ME]", ptrs2[TH_ME]);
    MT_BARRIER();

    /* chk put(s) from b(s): a.ptrs2[b] */
    for (j = 0; j < rmt_cnt; j++) {
        b = rmt[j];
        check_PutGetAcc(th_idx, b, PUT, &AELEM(ptrs2[a], b, 0, 0));
    }

    /* init arrays */
    init_array(th_idx, ptrs1[TH_ME]);
    init_array(th_idx, ptrs2[TH_ME]);
    MT_BARRIER();

    /* get - get b.ptrs1[a] into a.ptrs2[b] */
    if (tgt != -1) {
        b = tgt;
        b_proc = TH2PROC(b);
        for (i = 0; i < iters; i++) {
            src = &AELEM(ptrs1[b], a, i, 0); /* b.ptrs1[a] */
            dst = &AELEM(ptrs2[a], b, i, 0); /* a.ptrs2[b] */
            rc = ARMCI_GetS(src, stride, dst, stride, count, 1, b_proc);
            assert(rc == 0);
        }
    }
    print_array(th_idx, "GET:ptrs1[TH_ME]", ptrs1[TH_ME]);
    print_array(th_idx, "GET:ptrs2[TH_ME]", ptrs2[TH_ME]);
    MT_BARRIER();

    /* chk get from b: a.ptrs2[b] (b was reset to tgt by the get section) */
    if (tgt != -1) {
        check_PutGetAcc(th_idx, b, GET, &AELEM(ptrs2[a], b, 0, 0));
    }

#if 1
    /* init arrays */
    init_array(th_idx, ptrs1[TH_ME]);
    init_array(th_idx, ptrs2[TH_ME]);
    MT_BARRIER();

    /* acc - acc a.ptrs1[b] * scale + b.ptrs2[a] into b.ptrs2[a] */
    if (tgt != -1) {
        b = tgt;
        b_proc = TH2PROC(b);
        for (i = 0; i < iters; i++) {
            src = &AELEM(ptrs1[a], b, i, 0); /* a.ptrs1[b] */
            dst = &AELEM(ptrs2[b], a, i, 0); /* b.ptrs2[a] */
            rc = ARMCI_AccS(ARMCI_ACC_DBL, &scale, src, stride, dst, stride,
                            count, 1, b_proc);
            assert(rc == 0);
        }
        ARMCI_Fence(b_proc);
    }
    MT_BARRIER();
    print_array(th_idx, "ACC:ptrs1[TH_ME]", ptrs1[TH_ME]);
    print_array(th_idx, "ACC:ptrs2[TH_ME]", ptrs2[TH_ME]);
    MT_BARRIER();

    /* chk acc(s) from b(s): a.ptrs2[b] */
    for (j = 0; j < rmt_cnt; j++) {
        b = rmt[j];
        check_PutGetAcc(th_idx, b, ACC, &AELEM(ptrs2[a], b, 0, 0));
    }

#endif
    (void)rc; /* only read by assert(); silences unused warnings under NDEBUG */
    MT_BARRIER();
}
Example #4
0
/*
 * Time `loop` put operations from src_buf to dst_buf on process `proc`.
 *
 *  src_buf, dst_buf -- local source and remote destination; both are shifted
 *                      by +/-128 doubles on alternate iterations
 *  chunk            -- patch edge: count[0] is chunk doubles (in bytes),
 *                      count[1] is chunk rows
 *  loop             -- number of timed iterations
 *  proc             -- target process
 *  levels           -- stride levels; 0 selects a contiguous ARMCI_Put of
 *                      count[0] bytes, otherwise ARMCI_PutS is used
 *
 * Rows are SIZE doubles apart in both buffers.  When CHECK_RESULT is set the
 * data is read back with ARMCI_GetS and verified inside the timed region.
 *
 * Returns the elapsed TIMER() interval divided by `loop`.
 */
double time_put(double *src_buf, double *dst_buf, int chunk, int loop,
                int proc, int levels)
{
    int i, bal = 0;

    int stride[2];
    int count[2];
    int stride_levels = levels;
    double *tmp_buf = NULL;

    double start_time, stop_time, total_time = 0;

    stride[0] = SIZE * sizeof(double);
    count[0] = chunk * sizeof(double); count[1] = chunk;

    if(CHECK_RESULT) {
        tmp_buf = (double *)malloc(SIZE * SIZE * sizeof(double));
        assert(tmp_buf != NULL);
    }

    start_time = TIMER();
    for(i=0; i<loop; i++) {

#ifdef FORCE_1D
        /* Send the patch as count[1] separate contiguous rows. */
        if(levels>0) {
            int j;
            for(j=0; j< count[1]; j++){
                char *s = (char*) src_buf, *d= (char*)dst_buf;
                s += j*stride[0]; d += j*stride[0];
                /* Use the per-row pointers, not the patch base. */
                ARMCI_Put(s, d, count[0], proc);
            }
        }
        else
#endif
        if(levels)
           ARMCI_PutS(src_buf, stride, dst_buf, stride, count, stride_levels,proc);
        else
           ARMCI_Put(src_buf, dst_buf,count[0], proc);

        if(CHECK_RESULT) {
            ARMCI_GetS(dst_buf, stride, tmp_buf, stride, count,
                       stride_levels, proc);

            sprintf(check_type, "ARMCI_PutS:");
            check_result(tmp_buf, src_buf, stride, count, stride_levels);
        }

        /* prepare next src and dst ptrs: avoid cache locality */
        if(bal == 0) {
            src_buf += 128;
            dst_buf += 128;
            bal = 1;
        } else {
            src_buf -= 128;
            dst_buf -= 128;
            bal = 0;
        }
    }
    stop_time = TIMER();
    total_time = (stop_time - start_time);

    if(CHECK_RESULT) free(tmp_buf);

    if(total_time == 0.0){
       total_time=0.000001; /* workaround for inaccurate timers */
       warn_accuracy++;
    }
    return(total_time/loop);
}
/* Set the first `nelems` doubles of `buf` to `value`. */
static void puts_lat_fill(double *buf, size_t nelems, double value)
{
    size_t k;

    for (k = 0; k < nelems; k++) {
        buf[k] = value;
    }
}

/*
 * Check the xdim x ydim patch at the start of `buf` (rows MAX_YDIM doubles
 * apart) against `expected`; report the first mismatch and call ARMCI_Error.
 */
static void puts_lat_validate(const double *buf, int xdim, int ydim,
                              double expected)
{
    int i, j;

    for (i = 0; i < xdim; i++) {
        for (j = 0; j < ydim; j++) {
            const double actual = *(buf + i*MAX_YDIM + j);

            if (actual != expected) {
                printf("Data validation failed at X: %d Y: %d Expected : %f Actual : %f \n",
                       i, j, expected, actual);
                fflush(stdout);
                ARMCI_Error("Bailing out", 1);
            }
        }
    }
}

/*
 * ARMCI_PutS latency benchmark.
 *
 * For every power-of-two patch size up to MAX_XDIM x MAX_YDIM, rank 0 times
 * ITERATIONS strided puts into rank 1's buffer (after SKIP warm-up puts),
 * first stopping the clock before a single trailing ARMCI_Fence and then
 * with an ARMCI_Fence after every put.  Rank 1 validates the received patch
 * after each phase and resets its buffer.  Any further ranks only take part
 * in the barriers.  Requires at least two ranks.
 */
int main(int argc, char *argv[]) {

    int i, rank, nranks;
    int xdim, ydim;
    long bufsize;
    size_t nelems;
    double **buffer;
    double t_start=0.0, t_stop=0.0;
    int count[2], src_stride, trg_stride, stride_level, peer;
    double expected;
    int provided;

    MPI_Init_thread(&argc, &argv, MPI_THREAD_SINGLE, &provided);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nranks);

    if (nranks < 2) {
        printf("%s: Must be run with at least 2 processes\n", argv[0]);
        MPI_Abort(MPI_COMM_WORLD, 1);
    }

    ARMCI_Init_args(&argc, &argv);

    bufsize = MAX_XDIM * MAX_YDIM * sizeof(double);
    nelems = (size_t) bufsize / sizeof(double);
    buffer = malloc(sizeof(*buffer) * nranks);
    if (buffer == NULL) {
        ARMCI_Error("Unable to allocate the pointer table", 1);
    }
    ARMCI_Malloc((void **) buffer, bufsize);

    puts_lat_fill(buffer[rank], nelems, 1.0 + rank);

    if(rank == 0) {
        printf("ARMCI_PutS Latency - local and remote completions - in usec \n");
        printf("%30s %22s %22s\n", "Dimensions(array of doubles)", "Latency-LocalCompeltion", "Latency-RemoteCompletion");
        fflush(stdout);
    }

    /* Both sides are laid out as rows of MAX_YDIM doubles. */
    src_stride = MAX_YDIM*sizeof(double);
    trg_stride = MAX_YDIM*sizeof(double);
    stride_level = 1;

    ARMCI_Barrier();

    for(xdim=1; xdim<=MAX_XDIM; xdim*=2) {

        count[1] = xdim;

        for(ydim=1; ydim<=MAX_YDIM; ydim*=2) {

            count[0] = ydim*sizeof(double);

            if(rank == 0)
            {
                /* Large enough for two decimal ints, the 'X' and the NUL. */
                char dims[32];

                peer = 1;

                /* Phase 1: stop the clock before the fence. */
                for(i=0; i<ITERATIONS+SKIP; i++) {

                    if(i == SKIP)
                        t_start = MPI_Wtime();

                    ARMCI_PutS((void *) buffer[rank], &src_stride, (void *) buffer[peer], &trg_stride, count, stride_level, peer);

                }
                t_stop = MPI_Wtime();
                ARMCI_Fence(peer);
                snprintf(dims, sizeof dims, "%dX%d", xdim, ydim);
                printf("%30s %20.2f", dims, ((t_stop-t_start)*1000000)/ITERATIONS);
                fflush(stdout);

                ARMCI_Barrier();

                ARMCI_Barrier();

                /* Phase 2: fence after every put, inside the timed loop. */
                for(i=0; i<ITERATIONS+SKIP; i++) {

                    if(i == SKIP)
                        t_start = MPI_Wtime();

                    ARMCI_PutS((void *) buffer[rank], &src_stride, (void *) buffer[peer], &trg_stride, count, stride_level, peer);
                    ARMCI_Fence(peer);

                }
                t_stop = MPI_Wtime();
                printf("%20.2f \n", ((t_stop-t_start)*1000000)/ITERATIONS);
                fflush(stdout);

                ARMCI_Barrier();

                ARMCI_Barrier();
            }
            else
            {
                peer = 0;

                /* Rank 0 fills its buffer with 1.0 + rank, i.e. 1.0 + peer. */
                expected = (1.0 + (double) peer);

                /* The four barriers below pair with the four on rank 0. */
                ARMCI_Barrier();
                if (rank == 1)
                {
                    puts_lat_validate(buffer[rank], xdim, ydim, expected);
                }
                puts_lat_fill(buffer[rank], nelems, 1.0 + rank);

                ARMCI_Barrier();

                ARMCI_Barrier();
                if (rank == 1)
                {
                    puts_lat_validate(buffer[rank], xdim, ydim, expected);
                    puts_lat_fill(buffer[rank], nelems, 1.0 + rank);
                }
                ARMCI_Barrier();

            }

        }

    }

    ARMCI_Barrier();

    ARMCI_Free((void *) buffer[rank]);
    free(buffer);

    ARMCI_Finalize();

    MPI_Finalize();

    return 0;
}