Пример #1
0
double time_get(double *src_buf, double *dst_buf, int chunk, int loop,
                int proc, int levels)
{
    int i, bal = 0;
    
    int stride[2];
    int count[2];
    int stride_levels = levels;
    double *tmp_buf, *tmp_buf_ptr;
    
    double start_time, stop_time, total_time = 0;

    stride[0] = SIZE * sizeof(double);
    count[0] = chunk * sizeof(double); count[1] = chunk;

    if(CHECK_RESULT) {
        tmp_buf = (double *)malloc(SIZE * SIZE * sizeof(double));
        assert(tmp_buf != NULL);

        fill_array(tmp_buf, SIZE*SIZE, proc);
        tmp_buf_ptr = tmp_buf;
    }
    
    start_time = TIMER();
    for(i=0; i<loop; i++) {
         
#ifdef FORCE_1D
        int j;
        if(levels>0)for(j=0; j< count[1]; j++){
           char *s = (char*) src_buf, *d= (char*)dst_buf;
           s += j*stride[0]; d += j*stride[0];
           ARMCI_Get(src_buf, dst_buf, count[0],proc);
        }
        else
#endif
        if(levels)
           ARMCI_GetS(src_buf, stride, dst_buf, stride, count, stride_levels,proc);
        else
           ARMCI_Get(src_buf, dst_buf,count[0], proc);

        if(CHECK_RESULT) {
            sprintf(check_type, "ARMCI_GetS:");
            check_result(tmp_buf_ptr, dst_buf, stride, count, stride_levels);
        }
        
        /* prepare next src and dst ptrs: avoid cache locality */
        if(bal == 0) {
            src_buf += 128;
            dst_buf += 128;
            if(CHECK_RESULT) tmp_buf_ptr += 128;
            bal = 1;
        } else {
            src_buf -= 128;
            dst_buf -= 128;
            if(CHECK_RESULT) tmp_buf_ptr -= 128;
            bal = 0;
        }
    }
    stop_time = TIMER();
    total_time = (stop_time - start_time);

    if(CHECK_RESULT) free(tmp_buf);

    if(total_time == 0.0){
       total_time=0.000001; /* workaround for inaccurate timers */
       warn_accuracy++;
    }
    return(total_time/loop);
}
Пример #2
0
/* test Put/Get/Acc sequence regardless of communication pattern
 *  tgt -- remote target for put/get/acc (none if -1)
 *  rmt -- list of remote thread that put/acc to here (correctness is cheked here)
 *  rmt_cnt -- # of threads in rmt
 */
void test_PutGetAcc(int th_idx, int tgt, int *rmt, int rmt_cnt)
{
    /* a - local thread, b - remote thread */
    int a, b, b_proc, stride[2], count[2];
    int i, j;
    void *src, *dst;
#ifdef DEBUG
    for (i = 0, cbufl = 0; i < rmt_cnt; i++)
        cbufl += sprintf(cbuf+cbufl, " %d", rmt[i]);
    prndbg(th_idx, "test_PutGetAcc: put/acc to %d, get from %d, check put/acc from %s\n",
           tgt, tgt, rmt_cnt ? cbuf : "none");
#endif
    a = TH_ME;
    stride[0] = ASIZE_BYTES;
    count[0] = ASIZE_BYTES; count[1] = 1;

    /* init arrays */
    init_array(th_idx, ptrs1[TH_ME]);
    init_array(th_idx, ptrs2[TH_ME]);
    MT_BARRIER();

    /* put - put a.ptrs1[b] into b.ptrs2[a] */
    if (tgt != -1) {
        b = tgt;
        b_proc = TH2PROC(b);
        for (i = 0; i < iters; i++) {
            src = &AELEM(ptrs1[a], b, i, 0); /* a.ptrs1[b] */
            dst = &AELEM(ptrs2[b], a, i, 0); /* b.ptrs2[a] */
//            assert(!ARMCI_Put(src, dst, ASIZE_BYTES, b_proc));
            assert(!ARMCI_PutS(src, stride, dst, stride, count, 1, b_proc));
        }
        ARMCI_Fence(b_proc);
    }
    MT_BARRIER();
    print_array(th_idx, "PUT:ptrs1[TH_ME]", ptrs1[TH_ME]);
    print_array(th_idx, "PUT:ptrs2[TH_ME]", ptrs2[TH_ME]);
    MT_BARRIER();

    /* chk put(s) from b(s): a.ptrs2[b] */
    for (j = 0; j < rmt_cnt; j++) {
        b = rmt[j];
        b_proc = TH2PROC(b);
        check_PutGetAcc(th_idx, b, PUT, &AELEM(ptrs2[a], b, 0, 0));
    }
//return; // REMOVE WHEN DONE

    /* init arrays */
    init_array(th_idx, ptrs1[TH_ME]);
    init_array(th_idx, ptrs2[TH_ME]);
    MT_BARRIER();

    /* get - get b.ptrs1[a] into a.ptrs2[b] */
    if (tgt != -1) {
        b = tgt;
        b_proc = TH2PROC(b);
        for (i = 0; i < iters; i++) {
            src = &AELEM(ptrs1[b], a, i, 0); /* b.ptrs1[a] */
            dst = &AELEM(ptrs2[a], b, i, 0); /* a.ptrs2[b] */
            assert(!ARMCI_GetS(src, stride, dst, stride, count, 1, b_proc));
        }
    }
    print_array(th_idx, "GET:ptrs1[TH_ME]", ptrs1[TH_ME]);
    print_array(th_idx, "GET:ptrs2[TH_ME]", ptrs2[TH_ME]);
    MT_BARRIER();

    /* chk get from b: a.ptrs2[b] */
    if (tgt != -1) {
        check_PutGetAcc(th_idx, b, GET, &AELEM(ptrs2[a], b, 0, 0));
    }

#if 1
    /* init arrays */
    init_array(th_idx, ptrs1[TH_ME]);
    init_array(th_idx, ptrs2[TH_ME]);
    MT_BARRIER();

    /* acc - acc a.ptrs1[b] * scale + b.ptrs2[a] into b.ptrs2[a] */
    if (tgt != -1) {
        b = tgt;
        b_proc = TH2PROC(b);
        for (i = 0; i < iters; i++) {
            src = &AELEM(ptrs1[a], b, i, 0); /* a.ptrs1[b] */
            dst = &AELEM(ptrs2[b], a, i, 0); /* b.ptrs2[a] */
            assert(!ARMCI_AccS(ARMCI_ACC_DBL,&scale,src,stride,dst,stride,count,1,b_proc));
        }
        ARMCI_Fence(b_proc);
    }
    MT_BARRIER();
    print_array(th_idx, "ACC:ptrs1[TH_ME]", ptrs1[TH_ME]);
    print_array(th_idx, "ACC:ptrs2[TH_ME]", ptrs2[TH_ME]);
    MT_BARRIER();

    /* chk acc(s) from b(s): a.ptrs2[b] */
    for (j = 0; j < rmt_cnt; j++) {
        b = rmt[j];
        b_proc = TH2PROC(b);
        check_PutGetAcc(th_idx, b, ACC, &AELEM(ptrs2[a], b, 0, 0));
    }

#endif
    MT_BARRIER();
}
Пример #3
0
double time_acc(double *src_buf, double *dst_buf, int chunk, int loop,
                int proc, int levels)
{
    int i, bal = 0;

    int stride[2];
    int count[2];
    int stride_levels = levels;
    double *before_buf, *after_buf;
    
    double start_time, stop_time, total_time = 0;

    stride[0] = SIZE * sizeof(double);
    count[0] = chunk * sizeof(double); count[1] = chunk;

    if(CHECK_RESULT) {
        before_buf = (double *)malloc(SIZE * SIZE * sizeof(double));
        assert(before_buf != NULL);
        after_buf = (double *)malloc(SIZE * SIZE * sizeof(double));
        assert(after_buf != NULL);
    }
    
    start_time = TIMER();
    for(i=0; i<loop; i++) {
        double scale = (double)i;

        if(CHECK_RESULT) {
            ARMCI_GetS(dst_buf, stride, before_buf, stride, count,
                       stride_levels, proc);

            acc_array(scale, before_buf, src_buf, stride, count,stride_levels);
        }

        ARMCI_AccS(ARMCI_ACC_DBL, &scale, src_buf, stride, dst_buf, stride,
                   count, stride_levels, proc);

        if(CHECK_RESULT) {
            ARMCI_GetS(dst_buf, stride, after_buf, stride, count,
                       stride_levels, proc);
            
            sprintf(check_type, "ARMCI_AccS:");
            check_result(after_buf, before_buf, stride, count, stride_levels);
        }
        
        /* prepare next src and dst ptrs: avoid cache locality */
        if(bal == 0) {
            src_buf += 128;
            dst_buf += 128;
            bal = 1;
        } else {
            src_buf -= 128;
            dst_buf -= 128;
            bal = 0;
        }
    }
    stop_time = TIMER();
    total_time = (stop_time - start_time);

    if(CHECK_RESULT) { free(before_buf); free(after_buf); }
    
    if(total_time == 0.0){ 
       total_time=0.000001; /* workaround for inaccurate timers */
       warn_accuracy++;
    }
    return(total_time/loop);
}
Пример #4
0
int main(int argc, char **argv) {
    int i, j, rank, nranks, peer, bufsize, errors;
    double **buffer, *src_buf, *dst_buf;
    int count[2], src_stride, trg_stride, stride_level;

    MPI_Init(&argc, &argv);
    ARMCI_Init();

    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nranks);

    buffer = (double **) malloc(sizeof(double *) * nranks);

    bufsize = XDIM * YDIM * sizeof(double);
    ARMCI_Malloc((void **) buffer, bufsize);
    src_buf = ARMCI_Malloc_local(bufsize);
    dst_buf = ARMCI_Malloc_local(bufsize);

    if (rank == 0)
        printf("ARMCI Strided Put Test:\n");

    src_stride = XDIM * sizeof(double);
    trg_stride = XDIM * sizeof(double);
    stride_level = 1;

    count[1] = YDIM;
    count[0] = XDIM * sizeof(double);

    ARMCI_Barrier();

    peer = (rank+1) % nranks;

    for (i = 0; i < ITERATIONS; i++) {

      for (j = 0; j < XDIM*YDIM; j++) {
        *(src_buf + j) = rank + i;
      }

      ARMCI_PutS(
          src_buf,
          &src_stride,
          (void *) buffer[peer],
          &trg_stride,
          count,
          stride_level,
          peer);

      ARMCI_GetS(
          (void *) buffer[peer],
          &trg_stride,
          dst_buf,
          &src_stride,
          count,
          stride_level,
          peer);
    }

    ARMCI_Barrier();

    ARMCI_Access_begin(buffer[rank]);
    for (i = errors = 0; i < XDIM; i++) {
      for (j = 0; j < YDIM; j++) {
        const double actual   = *(buffer[rank] + i + j*XDIM);
        const double expected = (1.0 + rank) + (1.0 + ((rank+nranks-1)%nranks)) + (ITERATIONS);
        if (actual - expected > 1e-10) {
          printf("%d: Data validation failed at [%d, %d] expected=%f actual=%f\n",
              rank, j, i, expected, actual);
          errors++;
          fflush(stdout);
        }
      }
    }
    ARMCI_Access_end(buffer[rank]);

    ARMCI_Free((void *) buffer[rank]);
    ARMCI_Free_local(src_buf);
    ARMCI_Free_local(dst_buf);
    free(buffer);

    ARMCI_Finalize();
    MPI_Finalize();

    if (errors == 0) {
      printf("%d: Success\n", rank);
      return 0;
    } else {
      printf("%d: Fail\n", rank);
      return 1;
    }
}