コード例 #1
0
ファイル: perf.c プロジェクト: arnolda/scafacos
/*
 * Time `loop` strided accumulate operations (ARMCI_AccS, double precision)
 * from src_buf to dst_buf on process `proc` and return the average
 * wall-clock time per operation as measured by TIMER().
 *
 *   chunk  - block edge length; count[0] is chunk doubles in bytes and
 *            count[1] is chunk
 *   loop   - number of accumulate calls that are timed
 *   proc   - target process of the accumulate
 *   levels - stride level passed through to the ARMCI strided calls
 *
 * When CHECK_RESULT is set, every accumulate is bracketed by strided gets
 * and compared against a locally computed reference via check_result();
 * that verification work is inside the timed region.
 */
double time_acc(double *src_buf, double *dst_buf, int chunk, int loop,
                int proc, int levels)
{
    int stride[2];
    int count[2];
    double *before_buf = NULL;
    double *after_buf = NULL;
    double t_begin;
    double elapsed;
    int iter;
    int shifted = 0;

    stride[0] = SIZE * sizeof(double);
    count[0] = chunk * sizeof(double);
    count[1] = chunk;

    if (CHECK_RESULT) {
        before_buf = (double *)malloc(SIZE * SIZE * sizeof(double));
        assert(before_buf != NULL);
        after_buf = (double *)malloc(SIZE * SIZE * sizeof(double));
        assert(after_buf != NULL);
    }

    t_begin = TIMER();
    for (iter = 0; iter < loop; ++iter) {
        /* The scale factor changes every iteration: 0, 1, 2, ... */
        double scale = (double)iter;
        int step;

        if (CHECK_RESULT) {
            /* Snapshot the target and apply the accumulate locally to
             * build the reference result. */
            ARMCI_GetS(dst_buf, stride, before_buf, stride, count,
                       levels, proc);
            acc_array(scale, before_buf, src_buf, stride, count, levels);
        }

        ARMCI_AccS(ARMCI_ACC_DBL, &scale, src_buf, stride, dst_buf, stride,
                   count, levels, proc);

        if (CHECK_RESULT) {
            /* Read the target back and compare with the reference. */
            ARMCI_GetS(dst_buf, stride, after_buf, stride, count,
                       levels, proc);
            sprintf(check_type, "ARMCI_AccS:");
            check_result(after_buf, before_buf, stride, count, levels);
        }

        /* Alternate both pointers between base and base+128 elements so
         * consecutive iterations do not hit the same cache lines. */
        step = shifted ? -128 : 128;
        src_buf += step;
        dst_buf += step;
        shifted = !shifted;
    }
    elapsed = TIMER() - t_begin;

    if (CHECK_RESULT) {
        free(before_buf);
        free(after_buf);
    }

    if (elapsed == 0.0) {
        /* Timer resolution too coarse: substitute a tiny non-zero value
         * and record that the measurement is unreliable. */
        elapsed = 0.000001;
        warn_accuracy++;
    }
    return elapsed / loop;
}
コード例 #2
0
ファイル: test2.c プロジェクト: arnolda/scafacos
/*
 * Exercise ARMCI_AccS for one accumulate datatype.
 *
 * Every process fills a local vector `a` with (i + me), zeroes its slice of
 * the distributed array `b`, and then accumulates `a` (times `scale`) into
 * b on every process, itself included.  After the fences and barriers each
 * element i of the local b must equal the sum over all ranks q of
 * scale * (i + q).
 *
 * datatype - one of ARMCI_ACC_INT/LNG/FLT/DBL/CPL/DCP; any other value makes
 *            the function return immediately without allocating anything.
 *
 * On a mismatch the offending element is printed and ARMCI_Error() is
 * called.
 */
void test_acc_type(const int datatype)
{
    int i = 0;
    int datatype_size = 0;
    void *scale;
    void *a;
    void *b[MAXPROC];
    int elems = ELEMS;
    int dim = 1;
    int count = 0;
    int strideA = 0;
    int strideB = 0;
    /* Sum of the ranks 0..nproc-1.  Multiply before dividing: the product
     * nproc*(nproc-1) is always even, whereas nproc/2 truncates for odd
     * nproc and yields a wrong expected value. */
    const int rank_sum = nproc * (nproc - 1) / 2;

    /* Element size per datatype; unknown datatypes are ignored. */
    switch(datatype)
    {
       case ARMCI_ACC_INT: datatype_size = sizeof(int);     break;
       case ARMCI_ACC_LNG: datatype_size = sizeof(long);    break;
       case ARMCI_ACC_FLT: datatype_size = sizeof(float);   break;
       case ARMCI_ACC_DBL: datatype_size = sizeof(double);  break;
       case ARMCI_ACC_CPL: datatype_size = sizeof(cmpl_t);  break;
       case ARMCI_ACC_DCP: datatype_size = sizeof(dcmpl_t); break;
       default:
          return;
    }

    scale = malloc(datatype_size);
    a = malloc(elems * datatype_size);
    if(scale == NULL || a == NULL)
       ARMCI_Error("test_acc_type: malloc failed\n",0);
    create_array((void**)b, datatype_size, dim, &elems);

    /* Scale factor is 1 for the real types and 2+1j for the complex ones;
     * the source holds (i + me), the local target slice starts at zero. */
    switch(datatype)
    {
       case ARMCI_ACC_INT:
          *((int *) scale) = 1;
          for(i = 0; i < elems; i++)
          {
             ((int *) a)[i] = i + me;
             ((int *) b[me])[i] = 0;
          }
          break;
       case ARMCI_ACC_LNG:
          *((long *) scale) = 1;
          for(i = 0; i < elems; i++)
          {
             ((long *) a)[i] = i + me;
             ((long *) b[me])[i] = 0;
          }
          break;
       case ARMCI_ACC_FLT:
          *((float *) scale) = 1.0;
          for(i = 0; i < elems; i++)
          {
             ((float *) a)[i] = (float) i + me;
             ((float *) b[me])[i] = 0.0;
          }
          break;
       case ARMCI_ACC_DBL:
          *((double *) scale) = 1.0;
          for(i = 0; i < elems; i++)
          {
             ((double *) a)[i] = (double) i + me;
             ((double *) b[me])[i] = 0.0;
          }
          break;
       case ARMCI_ACC_CPL:
          ((cmpl_t *) scale)->real = 2.0;
          ((cmpl_t *) scale)->imag = 1.0;
          for(i = 0; i < elems; i++)
          {
             ((cmpl_t *) a)[i].real = ((float) i + me);
             ((cmpl_t *) a)[i].imag = ((float) i + me);
             ((cmpl_t *) b[me])[i].real = 0.0;
             ((cmpl_t *) b[me])[i].imag = 0.0;
          }
          break;
       case ARMCI_ACC_DCP:
          ((dcmpl_t *) scale)->real = 2.0;
          ((dcmpl_t *) scale)->imag = 1.0;
          for(i = 0; i < elems; i++)
          {
             ((dcmpl_t *) a)[i].real = ((double) i + me);
             ((dcmpl_t *) a)[i].imag = ((double) i + me);
             ((dcmpl_t *) b[me])[i].real = 0.0;
             ((dcmpl_t *) b[me])[i].imag = 0.0;
          }
          break;
       default:
          break;
    }

    /* Stride level 0: one contiguous segment of `count` bytes. */
    count = elems * datatype_size;
    strideA = elems * datatype_size;
    strideB = elems * datatype_size;

    ARMCI_AllFence();
    MP_BARRIER();

    /* Accumulate into every process, starting with ourselves. */
    for(i = 0; i < nproc; i++)
       ARMCI_AccS(datatype, scale, a, &strideA, b[(me + i) % nproc], &strideB, &count, 0, (me + i) % nproc);

    ARMCI_AllFence();
    MP_BARRIER();

    /* Verify the local slice.  For the complex types (x + xj) * (2 + 1j)
     * = x + 3xj, so the imaginary part must be three times the real one;
     * an error in EITHER component is a failure. */
    switch(datatype)
    {
       case ARMCI_ACC_INT:
          for(i = 0; i < elems; i++)
          {
             int compare = (i * nproc) + rank_sum;
             if(((int *)b[me])[i] != compare) 
             {
                printf("ERROR accumulate ARMCI_ACC_INT [%d] = %d != %d\n", i, ((int *)b[me])[i], compare);
                ARMCI_Error("test_acc_type failed\n",0);
             }
          }
          break;
       case ARMCI_ACC_LNG:
          for(i = 0; i < elems; i++)
          {
             long compare = (i * nproc) + rank_sum;
             if(((long *)b[me])[i] != compare) 
             {
                printf("ERROR accumulate ARMCI_ACC_LNG [%d] = %ld != %ld\n", i, ((long *)b[me])[i], compare);
                ARMCI_Error("test_acc_type failed\n",0);
             }
          }
          break;
       case ARMCI_ACC_FLT:
          for(i = 0; i < elems; i++)
          {
             float compare = (float) ((i * nproc) + rank_sum);
             if(((float *)b[me])[i] != compare) 
             {
                printf("ERROR accumulate ARMCI_ACC_FLT [%d] = %f != %f\n", i, ((float *)b[me])[i], compare);
                ARMCI_Error("test_acc_type failed\n",0);
             }
          }
          break;
       case ARMCI_ACC_DBL:
          for(i = 0; i < elems; i++)
          {
             double compare = (double) ((i * nproc) + rank_sum);
             if(((double *)b[me])[i] != compare) 
             {
                printf("ERROR accumulate ARMCI_ACC_DBL [%d] = %f != %f \n", i, ((double *)b[me])[i], compare);
                ARMCI_Error("test_acc_type failed\n",0);
             }
          }
          break;
       case ARMCI_ACC_CPL:
          for(i = 0; i < elems; i++)
          {
             float compare = (float) ((i * nproc) + rank_sum);
             if(((cmpl_t *)b[me])[i].real != compare || ((cmpl_t *)b[me])[i].imag != 3 * compare) 
             {
                printf("ERROR accumulate ARMCI_ACC_CPL [%d] = %f + %fj != %f + %fj\n", i, ((cmpl_t *)b[me])[i].real, ((cmpl_t *)b[me])[i].imag, compare, 3 * compare);
                ARMCI_Error("test_acc_type failed\n",0);
             }
          }
          break;
       case ARMCI_ACC_DCP:
          for(i = 0; i < elems; i++)
          {
             double compare = (double) ((i * nproc) + rank_sum);
             if(((dcmpl_t *)b[me])[i].real != compare || ((dcmpl_t *)b[me])[i].imag != 3 * compare) 
             {
                printf("ERROR accumulate ARMCI_ACC_DCP [%d] = %f + %fj != %f + %fj\n", i, ((dcmpl_t *)b[me])[i].real, ((dcmpl_t *)b[me])[i].imag, compare, 3 * compare);
                ARMCI_Error("test_acc_type failed\n",0);
             }
          }
          break;
       default:
          break;
    }

    MP_BARRIER();
    ARMCI_AllFence();
    MP_BARRIER();
    
    if(me==0){printf("O.K.\n\n"); fflush(stdout);}    
    destroy_array((void**)b);
    free(a);
    free(scale);
}
コード例 #3
0
ファイル: test_mt.c プロジェクト: arnolda/scafacos
/* test Put/Get/Acc sequence regardless of communication pattern
 *  th_idx -- index of the calling thread (passed to init/print/check helpers)
 *  tgt -- remote target for put/get/acc (none if -1)
 *  rmt -- list of remote threads that put/acc to here (correctness is checked here)
 *  rmt_cnt -- # of threads in rmt
 *
 * The ARMCI calls are issued outside of assert() so that they are still
 * executed when the file is compiled with NDEBUG; only the check of the
 * return code is an assertion.
 */
void test_PutGetAcc(int th_idx, int tgt, int *rmt, int rmt_cnt)
{
    /* a - local thread, b - remote thread */
    int a, b, b_proc, stride[2], count[2];
    int i, j;
    int rc;
    void *src, *dst;
#ifdef DEBUG
    for (i = 0, cbufl = 0; i < rmt_cnt; i++)
        cbufl += sprintf(cbuf+cbufl, " %d", rmt[i]);
    prndbg(th_idx, "test_PutGetAcc: put/acc to %d, get from %d, check put/acc from %s\n",
           tgt, tgt, rmt_cnt ? cbuf : "none");
#endif
    a = TH_ME;
    /* one stride level with a single block of ASIZE_BYTES bytes */
    stride[0] = ASIZE_BYTES;
    count[0] = ASIZE_BYTES; count[1] = 1;

    /* init arrays */
    init_array(th_idx, ptrs1[TH_ME]);
    init_array(th_idx, ptrs2[TH_ME]);
    MT_BARRIER();

    /* put - put a.ptrs1[b] into b.ptrs2[a] */
    if (tgt != -1) {
        b = tgt;
        b_proc = TH2PROC(b);
        for (i = 0; i < iters; i++) {
            src = &AELEM(ptrs1[a], b, i, 0); /* a.ptrs1[b] */
            dst = &AELEM(ptrs2[b], a, i, 0); /* b.ptrs2[a] */
            rc = ARMCI_PutS(src, stride, dst, stride, count, 1, b_proc);
            assert(rc == 0);
            (void)rc; /* silence "set but unused" under NDEBUG */
        }
        ARMCI_Fence(b_proc);
    }
    MT_BARRIER();
    print_array(th_idx, "PUT:ptrs1[TH_ME]", ptrs1[TH_ME]);
    print_array(th_idx, "PUT:ptrs2[TH_ME]", ptrs2[TH_ME]);
    MT_BARRIER();

    /* chk put(s) from b(s): a.ptrs2[b] */
    for (j = 0; j < rmt_cnt; j++) {
        b = rmt[j];
        b_proc = TH2PROC(b);
        check_PutGetAcc(th_idx, b, PUT, &AELEM(ptrs2[a], b, 0, 0));
    }

    /* init arrays */
    init_array(th_idx, ptrs1[TH_ME]);
    init_array(th_idx, ptrs2[TH_ME]);
    MT_BARRIER();

    /* get - get b.ptrs1[a] into a.ptrs2[b] */
    if (tgt != -1) {
        b = tgt;
        b_proc = TH2PROC(b);
        for (i = 0; i < iters; i++) {
            src = &AELEM(ptrs1[b], a, i, 0); /* b.ptrs1[a] */
            dst = &AELEM(ptrs2[a], b, i, 0); /* a.ptrs2[b] */
            rc = ARMCI_GetS(src, stride, dst, stride, count, 1, b_proc);
            assert(rc == 0);
            (void)rc;
        }
    }
    print_array(th_idx, "GET:ptrs1[TH_ME]", ptrs1[TH_ME]);
    print_array(th_idx, "GET:ptrs2[TH_ME]", ptrs2[TH_ME]);
    MT_BARRIER();

    /* chk get from b: a.ptrs2[b] */
    if (tgt != -1) {
        b = tgt; /* the data was fetched from the target, not from rmt[] */
        check_PutGetAcc(th_idx, b, GET, &AELEM(ptrs2[a], b, 0, 0));
    }

#if 1
    /* init arrays */
    init_array(th_idx, ptrs1[TH_ME]);
    init_array(th_idx, ptrs2[TH_ME]);
    MT_BARRIER();

    /* acc - acc a.ptrs1[b] * scale + b.ptrs2[a] into b.ptrs2[a] */
    if (tgt != -1) {
        b = tgt;
        b_proc = TH2PROC(b);
        for (i = 0; i < iters; i++) {
            src = &AELEM(ptrs1[a], b, i, 0); /* a.ptrs1[b] */
            dst = &AELEM(ptrs2[b], a, i, 0); /* b.ptrs2[a] */
            rc = ARMCI_AccS(ARMCI_ACC_DBL,&scale,src,stride,dst,stride,count,1,b_proc);
            assert(rc == 0);
            (void)rc;
        }
        ARMCI_Fence(b_proc);
    }
    MT_BARRIER();
    print_array(th_idx, "ACC:ptrs1[TH_ME]", ptrs1[TH_ME]);
    print_array(th_idx, "ACC:ptrs2[TH_ME]", ptrs2[TH_ME]);
    MT_BARRIER();

    /* chk acc(s) from b(s): a.ptrs2[b] */
    for (j = 0; j < rmt_cnt; j++) {
        b = rmt[j];
        b_proc = TH2PROC(b);
        check_PutGetAcc(th_idx, b, ACC, &AELEM(ptrs2[a], b, 0, 0));
    }

#endif
    MT_BARRIER();
}
コード例 #4
0
/*
 * Target-side check used by the AccS latency benchmark below.
 *
 * Verifies that the leading xdim x ydim patch of `buf` holds the initial
 * value (1.0 + rank) plus scaling * (1.0 + peer) accumulated
 * ITERATIONS + SKIP times, then resets the whole buffer to its initial
 * value for the next measurement.  Rows are MAX_YDIM doubles apart, which
 * matches the stride the origin passes to ARMCI_AccS.  On a mismatch the
 * element is reported and ARMCI_Error() is called.
 */
static void validate_and_reset_target(double *buf, unsigned long bufsize,
                                      size_t xdim, size_t ydim,
                                      int rank, int peer, double scaling)
{
    const double expected = (1.0 + rank)
                            + scaling * (1.0 + peer) * (ITERATIONS + SKIP);
    size_t i, j;

    ARMCI_Access_begin(buf);

    for (i = 0; i < xdim; i++)
    {
        for (j = 0; j < ydim; j++)
        {
            const double actual = *(buf + i * MAX_YDIM + j);

            if (actual != expected)
            {
                printf("Data validation failed at X: %d Y: %d Expected : %f Actual : %f \n",
                       (int) i,
                       (int) j,
                       expected,
                       actual);
                fflush(stdout);
                ARMCI_Error("Bailing out", 1);
            }
        }
    }

    for (i = 0; i < bufsize / sizeof(double); i++)
    {
        *(buf + i) = 1.0 + rank;
    }

    ARMCI_Access_end(buf);
}

/*
 * ARMCI_AccS latency benchmark.
 *
 * Rank 0 accumulates xdim x ydim patches of doubles into rank 1's buffer
 * for power-of-two dimensions up to MAX_XDIM x MAX_YDIM and prints two
 * timings per size: one where the clock is stopped before ARMCI_Fence
 * ("local completion") and one where every accumulate is followed by a
 * fence ("remote completion").  Rank 1 validates and resets its buffer
 * after each of the two measurements; all other ranks only take part in
 * the barriers.  Requires at least two processes.
 */
int main(int argc, char **argv)
{

    int i, rank, nranks, peer;
    size_t xdim, ydim, k;
    unsigned long bufsize;
    double **buffer, *src_buf;
    double t_start=0.0, t_stop;
    int count[2], src_stride, trg_stride, stride_level;
    double scaling;
    int provided;

    MPI_Init_thread(&argc, &argv, MPI_THREAD_SINGLE, &provided);

    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nranks);

    if (nranks < 2) {
        printf("%s: Must be run with at least 2 processes\n", argv[0]);
        MPI_Abort(MPI_COMM_WORLD, 1);
    }

    ARMCI_Init_args(&argc, &argv);

    buffer = (double **) malloc(sizeof(double *) * nranks);
    if (buffer == NULL)
        ARMCI_Error("malloc of buffer pointer table failed", 1);

    bufsize = MAX_XDIM * MAX_YDIM * sizeof(double);
    ARMCI_Malloc((void **) buffer, bufsize);
    src_buf = ARMCI_Malloc_local(bufsize);
    if (src_buf == NULL)
        ARMCI_Error("ARMCI_Malloc_local of source buffer failed", 1);

    if (rank == 0)
    {
        printf("ARMCI_AccS Latency - local and remote completions - in usec \n");
        printf("%30s %22s %22s\n",
               "Dimensions(array of double)",
               "Local Completion",
               "Remote completion");
        fflush(stdout);
    }

    /* Every rank starts with 1.0 + rank in both its window and its
     * private source buffer. */
    ARMCI_Access_begin(buffer[rank]);
    for (k = 0; k < bufsize / sizeof(double); k++)
    {
      *(buffer[rank] + k) = 1.0 + rank;
      *(src_buf + k) = 1.0 + rank;
    }
    ARMCI_Access_end(buffer[rank]);

    scaling = 2.0;

    /* Rows are MAX_YDIM doubles apart on both sides. */
    src_stride = MAX_YDIM * sizeof(double);
    trg_stride = MAX_YDIM * sizeof(double);
    stride_level = 1;

    ARMCI_Barrier();

    for (xdim = 1; xdim <= MAX_XDIM; xdim *= 2)
    {

        count[1] = (int) xdim;

        for (ydim = 1; ydim <= MAX_YDIM; ydim *= 2)
        {

            count[0] = (int) (ydim * sizeof(double));

            if (rank == 0)
            {
                /* Large enough for any pair of int values, so the label
                 * cannot overflow whatever MAX_XDIM/MAX_YDIM are. */
                char dims[32];

                peer = 1;

                /* --- local completion: clock stops before the fence --- */
                for (i = 0; i < ITERATIONS + SKIP; i++)
                {

                    if (i == SKIP) t_start = MPI_Wtime();

                    ARMCI_AccS(ARMCI_ACC_DBL,
                               (void *) &scaling,
                               /* (void *) buffer[rank] */ src_buf,
                               &src_stride,
                               (void *) buffer[peer],
                               &trg_stride,
                               count,
                               stride_level,
                               peer);

                }
                t_stop = MPI_Wtime();
                ARMCI_Fence(peer);

                snprintf(dims, sizeof dims, "%dX%d", (int) xdim, (int) ydim);
                printf("%30s %20.2f ", dims, ((t_stop - t_start) * 1000000)
                        / ITERATIONS);
                fflush(stdout);

                /* Target validates and resets between these barriers. */
                ARMCI_Barrier();

                ARMCI_Barrier();

                /* --- remote completion: fence after every accumulate --- */
                for (i = 0; i < ITERATIONS + SKIP; i++)
                {

                    if (i == SKIP) t_start = MPI_Wtime();

                    ARMCI_AccS(ARMCI_ACC_DBL,
                               (void *) &scaling,
                               /* (void *) buffer[rank] */ src_buf,
                               &src_stride,
                               (void *) buffer[peer],
                               &trg_stride,
                               count,
                               stride_level,
                               peer);
                    ARMCI_Fence(peer);

                }
                t_stop = MPI_Wtime();
                printf("%20.2f \n", ((t_stop - t_start) * 1000000) / ITERATIONS);
                fflush(stdout);

                ARMCI_Barrier();

                ARMCI_Barrier();

            }
            else
            {

                peer = 0;

                /* Wait until the origin finished the first measurement. */
                ARMCI_Barrier();

                if (rank == 1) 
                {
                  validate_and_reset_target(buffer[rank], bufsize, xdim, ydim,
                                            rank, peer, scaling);
                }

                ARMCI_Barrier();

                /* Wait until the origin finished the second measurement. */
                ARMCI_Barrier();

                if (rank == 1) 
                {
                  validate_and_reset_target(buffer[rank], bufsize, xdim, ydim,
                                            rank, peer, scaling);
                }
                ARMCI_Barrier();

            }

        }

    }

    ARMCI_Barrier();

    ARMCI_Free((void *) buffer[rank]);
    ARMCI_Free_local(src_buf);
    free(buffer);

    ARMCI_Finalize();

    MPI_Finalize();

    return 0;
}
コード例 #5
0
ファイル: ddi_armci_acc.c プロジェクト: ryanolson/ddi
/*
 * Accumulate the local buffer `buf` (scaled by *scale, doubles) into the
 * part of distributed array patch->handle that is held by process `proc`.
 *
 * The destination address is obtained from DDI_ARMCI_Acquire() with write
 * access and advanced to the patch origin (column-major: column offset
 * times the array's row count, plus the first row).  If the patch spans
 * every row of the array the transfer is issued as one contiguous segment
 * (stride level 0); otherwise it is issued as ncols column segments of
 * nrows doubles each (stride level 1).
 *
 * With DDI_ARMCI_IMPLICIT_NBACC defined the non-blocking ARMCI_NbAccS is
 * used with a NULL handle, otherwise the blocking ARMCI_AccS.  The strided
 * call is used for the contiguous case too because, per the original
 * author's note, ARMCI_Acc is apparently not always available.
 *
 * Always returns 1, the number of operations issued.
 */
inline int DDI_ARMCI_Acc_proc(DDI_Patch *patch, void *scale, void *buf, int proc) {
    const int handle = patch->handle;
    const int nops = 1;

    DDA_ARMCI_Index *armci_index = gv(dda_armci_index)[handle];
    char *src = (char*)buf;
    char *dst;
    int src_strides[2], dst_strides[2], extents[2];
    int levels;
    int armci_proc;
    size_t offset;

    /* Geometry of the full array on `proc` and of the requested patch. */
    const int trows = gv(dda_index)[handle].nrows;
    const int tcols = gv(pcmap)[handle][proc+1] - gv(pcmap)[handle][proc];
    const int nrows = patch->ihi - patch->ilo + 1;
    const int ncols = patch->jhi - patch->jlo + 1;

    /* Byte offset of the patch origin inside proc's local storage. */
    offset = (patch->jlo - gv(pcmap)[handle][proc])*trows + patch->ilo;
    offset *= sizeof(double);

    DDI_ARMCI_Acquire(armci_index,handle,proc,DDI_WRITE_ACCESS,(void**)&dst,&armci_proc);
    dst += offset;

    /* Column-to-column distance in bytes: packed in the source buffer,
     * full array height at the destination. */
    src_strides[0] = sizeof(double)*nrows;
    dst_strides[0] = sizeof(double)*trows;

    if (nrows == trows) {
        /* Full-height patch: a single contiguous segment.
         * NOTE(review): patch->size is presumably the patch size in
         * bytes, as count[0] must be in bytes -- confirm. */
        extents[0] = patch->size;
        levels = 0;
    }
    else {
        /* Second-level strides (not consulted at stride level 1, but
         * filled in as in the original). */
        src_strides[1] = src_strides[0]*ncols;
        dst_strides[1] = dst_strides[0]*tcols;
        /* First extent is in bytes, second is the number of columns. */
        extents[0] = sizeof(double)*nrows;
        extents[1] = ncols;
        levels = 1;
    }

#if defined DDI_ARMCI_IMPLICIT_NBACC
    ARMCI_NbAccS(ARMCI_ACC_DBL, scale,
                 (void*)src, src_strides,
                 (void*)dst, dst_strides,
                 extents, levels, armci_proc, NULL);
#else
    ARMCI_AccS(ARMCI_ACC_DBL, scale,
               (void*)src, src_strides,
               (void*)dst, dst_strides,
               extents, levels, armci_proc);
#endif

    DDI_ARMCI_Release(armci_index,handle,proc,DDI_WRITE_ACCESS);

    return nops;
}
コード例 #6
0
ファイル: perf_nb.c プロジェクト: dmlb2000/nwchem-cml
/*
 * Compare blocking and non-blocking 1-D put/get/accumulate timings.
 *
 * For message sizes of 1, 2, 4, ... MAXELEMS doubles:
 *   - process 0 times ARMCI_Put / ARMCI_Get to and from every other process,
 *   - process 0 times ARMCI_NbPut / ARMCI_NbGet, with issue time and
 *     ARMCI_Wait time accumulated separately,
 *   - every process times ARMCI_AccS and ARMCI_NbAccS (+ ARMCI_Wait) into
 *     the start of process 0's destination array.
 * After each phase the destination is verified (when VERIFY is set) and
 * zeroed again.  Process 0 prints one line of per-call averages per size.
 *
 * dry_run - when non-zero everything is executed but nothing is printed.
 */
void test_perf_nb(int dry_run) {
  
    int i, j, loop, rc, bytes, elems[2] = {MAXPROC, MAXELEMS};
    int stride, ntimes;
    /* t1..t3: put, nbput issue, nbput wait; t4..t6: get, nbget issue,
     * nbget wait; t7..t9: acc, nbacc issue, nbacc wait. */
    double stime, t1, t2, t3, t4, t5, t6, t7, t8, t9;
    double *dsrc[MAXPROC], scale=1.0;
    armci_hdl_t hdl_get, hdl_put, hdl_acc;
        
    create_array((void**)ddst, sizeof(double),2, elems);
    create_array((void**)dsrc, sizeof(double),1, &elems[1]);

    if(!dry_run)if(me == 0) {
      printf("\n\t\t\tRemote 1-D Array Section\n");
      printf("section    get      nbget    wait     put     nbput  ");
      printf("   wait     acc     nbacc     wait\n");
      printf("-------  -------- -------- -------- -------- --------");
      printf(" -------- -------- -------- --------\n");
      fflush(stdout);
    }

    for(loop=1; loop<=MAXELEMS; loop*=2) {

      elems[1] = loop;
      /* fewer repetitions for larger messages */
      ntimes = (int)sqrt((double)(MAXELEMS/elems[1]));
      if(ntimes <1) ntimes=1;

      /* -------------------------- SETUP --------------------------- */
      /*initializing non-blocking handles,time,src & dst buffers*/
      ARMCI_INIT_HANDLE(&hdl_put);
      ARMCI_INIT_HANDLE(&hdl_get);
      ARMCI_INIT_HANDLE(&hdl_acc);
      t1 = t2 = t3 = t4 = t5 = t6 = t7 = t8 = t9 = 0.0;
      for(i=0; i<elems[1]; i++) dsrc[me][i]=i*1.001*(me+1);
      for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0;    
      MP_BARRIER();
      
      /* bytes transfered */
      bytes = sizeof(double)*elems[1]; 
      MP_BARRIER();
      
      /* -------------------------- PUT/GET -------------------------- */    
      if(me == 0) {
        for(i=1; i<nproc; i++) {
          stime=MP_TIMER();
          for(j=0; j<ntimes; j++)
            if((rc=ARMCI_Put(&dsrc[me][0], &ddst[i][me*elems[1]], bytes,i)))
              ARMCI_Error("armci_put failed\n",rc);
          t1 += MP_TIMER()-stime;
        }
      }
      MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER();
      if(VERIFY) verify_results(PUT, elems);
      for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0;
      MP_BARRIER();
      
      if(me == 0) { 
        for(i=1; i<nproc; i++) {
          stime=MP_TIMER();    
          for(j=0; j<ntimes; j++)
            if((rc=ARMCI_Get(&dsrc[i][0], &ddst[me][i*elems[1]], bytes,i)))
              ARMCI_Error("armci_get failed\n",rc);
          t4 += MP_TIMER()-stime;
        }
      }    
      MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER();
      if(VERIFY) verify_results(GET, elems);
      for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0;
      MP_BARRIER();

      /* ------------------------ nb PUT/GET ------------------------- */    
      if(me == 0) {
        for(i=1; i<nproc; i++) {
          for(j=0; j<ntimes; j++) {
            /* issue time and wait time are accumulated separately */
            stime=MP_TIMER();    
            if((rc=ARMCI_NbPut(&dsrc[me][0], &ddst[i][me*elems[1]], bytes,
                               i, &hdl_put)))
              ARMCI_Error("armci_nbput failed\n",rc);
            t2 += MP_TIMER()-stime;
            stime=MP_TIMER();
            ARMCI_Wait(&hdl_put);
            t3 += MP_TIMER()-stime;
          } 
        }
      }
      MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER();
      if(VERIFY) verify_results(PUT, elems);
      for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0;
      MP_BARRIER();

      if(me == 0) {
        for(i=1; i<nproc; i++) {
          for(j=0; j<ntimes; j++) {
            stime=MP_TIMER();    
            if((rc=ARMCI_NbGet(&dsrc[i][0], &ddst[me][i*elems[1]], bytes,
                               i, &hdl_get)))
              ARMCI_Error("armci_nbget failed\n",rc);
            t5 += MP_TIMER()-stime;
            stime=MP_TIMER();
            ARMCI_Wait(&hdl_get);
            t6 += MP_TIMER()-stime;
          }
        }
      }
      MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER();
      if(VERIFY) verify_results(GET, elems);
      for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0; 
      MP_BARRIER();


      /* ------------------------ Accumulate ------------------------- */    
      /* every process adds a vector of ones into process 0's array */
      for(i=0; i<elems[1]; i++) dsrc[me][i]=1.0;  MP_BARRIER();
      stride = elems[1]*sizeof(double); scale  = 1.0;
      for(j=0; j<ntimes; j++) {
        stime=MP_TIMER();
        if((rc=ARMCI_AccS(ARMCI_ACC_DBL, &scale, &dsrc[me][0], &stride, 
                          &ddst[0][0], &stride, &bytes, 0, 0)))
          ARMCI_Error("armci_acc failed\n",rc);
        t7 += MP_TIMER()-stime;
        
        /* verification and reset happen after every single accumulate */
        MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER();
        if(VERIFY) verify_results(ACC, elems);
        for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0;
        MP_BARRIER();
      }

#if 1
      /* Non-blocking accumulate; set the guard above to 0 to disable it */
      /* ---------------------- nb-Accumulate ------------------------ */    
      for(i=0; i<elems[1]; i++) dsrc[me][i]=1.0;  MP_BARRIER();
      stride = elems[1]*sizeof(double); scale  = 1.0;
      for(j=0; j<ntimes; j++) {
        stime=MP_TIMER();    
        if((rc=ARMCI_NbAccS(ARMCI_ACC_DBL, &scale, &dsrc[me][0], &stride, 
                            &ddst[0][0], &stride, &bytes, 0, 0, &hdl_acc)))
          ARMCI_Error("armci_nbacc failed\n",rc);
        t8 += MP_TIMER()-stime;
        stime=MP_TIMER();
        ARMCI_Wait(&hdl_acc);
        t9 += MP_TIMER()-stime;
      
        MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER();
        if(VERIFY) verify_results(ACC, elems);
        for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0;
        MP_BARRIER();
      }
#endif

      /* print timings (column order matches the header: get, put, acc) */
     if(!dry_run) if(me==0) printf("%d\t %.2e %.2e %.2e %.2e %.2e %.2e %.2e %.2e %.2e\n", 
                       bytes, t4/ntimes, t5/ntimes, t6/ntimes, t1/ntimes, 
                       t2/ntimes, t3/ntimes, t7/ntimes, t8/ntimes, t9/ntimes);
    }

    ARMCI_AllFence();
    MP_BARRIER();
    
    if(!dry_run)if(me==0){printf("O.K.\n"); fflush(stdout);}
    destroy_array((void **)ddst);
    destroy_array((void **)dsrc);
}
コード例 #7
0
/*
 * ARMCI strided accumulate test.
 *
 * Every rank initialises an XDIM x YDIM window and a source buffer to
 * 1.0 + rank, then accumulates its source (scaled by 2.0) ITERATIONS times
 * into the window of rank (rank + 1) % nranks.  Afterwards each rank checks
 * that its own window holds its initial value plus the scaled contribution
 * of its left neighbour.  The error counts are summed over all ranks; the
 * exit status is 0 on success and 1 if any rank saw a mismatch.
 */
int main(int argc, char **argv) {
    int i, j, rank, nranks, peer, bufsize, errors, total_errors;
    double **buf_bvec, **src_bvec, *src_buf;
    int count[2], src_stride, trg_stride, stride_level;
    double scaling, time, expected;

    MPI_Init(&argc, &argv);
    ARMCI_Init();

    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nranks);

    buf_bvec = (double **) malloc(sizeof(double *) * nranks);
    src_bvec = (double **) malloc(sizeof(double *) * nranks);

    bufsize = XDIM * YDIM * sizeof(double);
    ARMCI_Malloc((void **) buf_bvec, bufsize);
    ARMCI_Malloc((void **) src_bvec, bufsize);
    src_buf = src_bvec[rank];

    if (rank == 0)
        printf("ARMCI Strided DLA Accumulate Test:\n");

    ARMCI_Access_begin(buf_bvec[rank]);
    ARMCI_Access_begin(src_buf);

    for (i = 0; i < XDIM*YDIM; i++) {
        *(buf_bvec[rank] + i) = 1.0 + rank;
        *(src_buf + i) = 1.0 + rank;
    }

    ARMCI_Access_end(src_buf);
    ARMCI_Access_end(buf_bvec[rank]);

    scaling = 2.0;

    /* YDIM rows of XDIM doubles, rows XDIM doubles apart on both sides:
     * the strided transfer covers the whole buffer. */
    src_stride = XDIM * sizeof(double);
    trg_stride = XDIM * sizeof(double);
    stride_level = 1;

    count[1] = YDIM;
    count[0] = XDIM * sizeof(double);

    ARMCI_Barrier();
    time = MPI_Wtime();

    peer = (rank+1) % nranks;

    for (i = 0; i < ITERATIONS; i++) {

      ARMCI_AccS(ARMCI_ACC_DBL,
          (void *) &scaling,
          src_buf,
          &src_stride,
          (void *) buf_bvec[peer],
          &trg_stride,
          count,
          stride_level,
          peer);
    }

    /* NOTE(review): the check below relies on ARMCI_Barrier() making all
     * accumulates visible at their targets -- confirm for the ARMCI
     * implementation in use. */
    ARMCI_Barrier();
    time = MPI_Wtime() - time;

    if (rank == 0) printf("Time: %f sec\n", time);

    /* The writer into this rank's window is the left neighbour. */
    expected = (1.0 + rank) + scaling * (1.0 + ((rank+nranks-1)%nranks)) * (ITERATIONS);

    ARMCI_Access_begin(buf_bvec[rank]);
    for (i = errors = 0; i < XDIM; i++) {
      for (j = 0; j < YDIM; j++) {
        const double actual = *(buf_bvec[rank] + i + j*XDIM);
        const double diff   = actual - expected;
        /* Two-sided tolerance: a value that is too SMALL (e.g. lost
         * accumulates) is just as much a failure as one too large. */
        if (diff > 1e-10 || diff < -1e-10) {
          printf("%d: Data validation failed at [%d, %d] expected=%f actual=%f\n",
              rank, j, i, expected, actual);
          errors++;
          fflush(stdout);
        }
      }
    }
    ARMCI_Access_end(buf_bvec[rank]);

    MPI_Allreduce(&errors, &total_errors, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);

    ARMCI_Free((void *) buf_bvec[rank]);
    ARMCI_Free((void *) src_bvec[rank]);

    free(buf_bvec);
    free(src_bvec);

    ARMCI_Finalize();
    MPI_Finalize();

    if (total_errors == 0) {
      if (rank == 0) printf("Success.\n");
      return 0;
    } else {
      if (rank == 0) printf("Fail.\n");
      return 1;
    }
}