static void get_data(int n, int start, int end, double *vec_local,
                     double **vec) {
    int i, j, rc, bytes, offset;
    int proc_start, proc_end, idx_start, idx_end;

    proc_start = proc_end = -1;
    for(i=0; i<nproc; i++) {
        if(proc_start<0 && proc_row_list[i]>start) proc_start = i;
        if(proc_end<0 && proc_row_list[i]>end) proc_end = i;
    }
    if(proc_start<0 || proc_end<0) ARMCI_Error("Invalid Process Ids", -1);

    for(i=proc_start; i<=proc_end; i++) {
        if(i==proc_start) idx_start = start;
        else {
            if(i==0) idx_start=0;
            else idx_start = proc_row_list[i-1];
        }
        if(i==proc_end) idx_end = end;
        else idx_end = proc_row_list[i]-1;

        if(i!=prev_proc) {
            ++count;
            prev_proc = i;
            ARMCI_INIT_HANDLE(&gHandle[count]);
            ARMCI_SET_AGGREGATE_HANDLE(&gHandle[count]);
        }

        if(i==0) offset=0;
        else offset = proc_row_list[i-1];
        if(i==me) { /* local */
            for(j=idx_start; j<=idx_end; j++) vec_local[j] = vec[me][j-offset];
        }
        else {     /* remote */
            bytes = (idx_end-idx_start+1)*sizeof(double);
            vec_local[idx_start] = -1;
#if 0
            if((rc=ARMCI_Get(&vec[i][idx_start-offset], &vec_local[idx_start],
                             bytes, i)))
#else
            if((rc=ARMCI_NbGet(&vec[i][idx_start-offset], &vec_local[idx_start],
                               bytes, i, &gHandle[count])))
#endif
                ARMCI_Error("armci_nbget failed\n",rc);
        }
    }
}
Exemple #2
0
void test_aggregate(int dryrun) {
  
    int i, j, rc, bytes, elems[2] = {MAXPROC, MAXELEMS};
    double *ddst_put[MAXPROC];
    double *ddst_get[MAXPROC];
    double *dsrc[MAXPROC];
    armci_hdl_t aggr_hdl_put[MAXPROC];
    armci_hdl_t aggr_hdl_get[MAXPROC];
    armci_hdl_t hdl_put[MAXELEMS];
    armci_hdl_t hdl_get[MAXELEMS];
    armci_giov_t darr;
    void *src_ptr[MAX_REQUESTS], *dst_ptr[MAX_REQUESTS];
    int start = 0, end = 0;
    double start_time;
        
    create_array((void**)ddst_put, sizeof(double),2, elems);
    create_array((void**)ddst_get, sizeof(double),2, elems);
    create_array((void**)dsrc, sizeof(double),1, &elems[1]);
    
    for(i=0; i<elems[1]; i++) dsrc[me][i]=i*1.001*(me+1);
    for(i=0; i<elems[0]*elems[1]; i++) {
      ddst_put[me][i]=0.0;
      ddst_get[me][i]=0.0;
    }
    
    MP_BARRIER();

    /* only proc 0 does the work */
    if(me == 0) {
      if(!dryrun)printf("Transferring %d doubles (Not an array of %d doubles)\n", MAXELEMS, MAXELEMS);
      
      /* initializing non-blocking handles */
      for(i=0; i<elems[1]; i++) ARMCI_INIT_HANDLE(&hdl_put[i]);
      for(i=0; i<elems[1]; i++) ARMCI_INIT_HANDLE(&hdl_get[i]);
      
      /* aggregate handles */
      for(i=0; i<nproc; i++) ARMCI_INIT_HANDLE(&aggr_hdl_put[i]);
      for(i=0; i<nproc; i++) ARMCI_INIT_HANDLE(&aggr_hdl_get[i]);
      for(i=0; i<nproc; i++) ARMCI_SET_AGGREGATE_HANDLE(&aggr_hdl_put[i]);
      for(i=0; i<nproc; i++) ARMCI_SET_AGGREGATE_HANDLE(&aggr_hdl_get[i]);    
      
      bytes = sizeof(double);
      
      /* **************** PUT **************** */    
      /* register put */
      start_time=MP_TIMER();
      start = 0; end = elems[1]; 
      for(i=1; i<nproc; i++) {
	for(j=start; j<end; j++) {  
	  ARMCI_NbPutValueDouble(dsrc[me][j], &ddst_put[i][me*elems[1]+j], i, 
				 &hdl_put[j]);
	}
	for(j=start; j<end; j++) ARMCI_Wait(&hdl_put[j]);
      }
      if(!dryrun)printf("%d: Value Put time      = %.2es\n", me, MP_TIMER()-start_time);
 
      /* vector put */
      start_time=MP_TIMER();
      for(i=1; i<nproc; i++) {
	for(j=start; j<end; j++) {
	  src_ptr[j] = (void *)&dsrc[me][j];
	  dst_ptr[j] = (void *)&ddst_put[i][me*elems[1]+j];
	}
	darr.src_ptr_array = src_ptr;
	darr.dst_ptr_array = dst_ptr;
	darr.bytes = sizeof(double);
	darr.ptr_array_len = elems[1];
	if((rc=ARMCI_NbPutV(&darr, 1, i, &hdl_put[i])))
	  ARMCI_Error("armci_nbputv failed\n",rc);
      }
      for(i=1; i<nproc; i++) ARMCI_Wait(&hdl_put[i]);
      if(!dryrun)printf("%d: Vector Put time     = %.2es\n", me, MP_TIMER()-start_time);
      
      /* regular put */
      start_time=MP_TIMER();    
      for(i=1; i<nproc; i++) {
	for(j=start; j<end; j++) {  
	  if((rc=ARMCI_NbPut(&dsrc[me][j], &ddst_put[i][me*elems[1]+j], bytes,
			     i, &hdl_put[j])))
	    ARMCI_Error("armci_nbput failed\n",rc);
	}
	for(j=start; j<end; j++) ARMCI_Wait(&hdl_put[j]);
      }
      if(!dryrun)printf("%d: Regular Put time    = %.2es\n", me, MP_TIMER()-start_time);
      
      /* aggregate put */
      start_time=MP_TIMER();
      for(i=1; i<nproc; i++) {
	for(j=start; j<end; j++) {  
	  if((rc=ARMCI_NbPut(&dsrc[me][j], &ddst_put[i][me*elems[1]+j], bytes,
			     i,  &aggr_hdl_put[i])))
	    ARMCI_Error("armci_nbput failed\n",rc);
	}
      }
      for(i=1; i<nproc; i++) ARMCI_Wait(&aggr_hdl_put[i]);
      if(!dryrun)printf("%d: Aggregate Put time  = %.2es\n\n", me, MP_TIMER()-start_time);
      
      
      /* **************** GET **************** */    
      
      /* vector get */
      start_time=MP_TIMER();
      for(i=1; i<nproc; i++) {
	for(j=start; j<end; j++) {
	  src_ptr[j] = (void *)&dsrc[i][j];
	  dst_ptr[j] = (void *)&ddst_get[me][i*elems[1]+j];
	}
	darr.src_ptr_array = src_ptr;
	darr.dst_ptr_array = dst_ptr;
	darr.bytes = sizeof(double);
	darr.ptr_array_len = elems[1];
	if((rc=ARMCI_NbGetV(&darr, 1, i, &hdl_get[i])))
	  ARMCI_Error("armci_nbgetv failed\n",rc);
	ARMCI_Wait(&hdl_get[i]);
      }
      if(!dryrun)printf("%d: Vector Get time     = %.2es\n", me, MP_TIMER()-start_time);
      
      /* regular get */
      start_time=MP_TIMER();    
      for(i=1; i<nproc; i++) {
	for(j=start; j<end; j++) {  
	  if((rc=ARMCI_NbGet(&dsrc[i][j], &ddst_get[me][i*elems[1]+j], bytes,
			     i, &hdl_get[j])))
	    ARMCI_Error("armci_nbget failed\n",rc);
	}
	for(j=start; j<end; j++) ARMCI_Wait(&hdl_get[j]);
      }
      if(!dryrun)printf("%d: Regular Get time    = %.2es\n", me, MP_TIMER()-start_time);
      
      /* aggregate get */
      start_time=MP_TIMER();
      for(i=1; i<nproc; i++) {
	for(j=start; j<end; j++) {  
	  ARMCI_NbGet(&dsrc[i][j], &ddst_get[me][i*elems[1]+j], bytes,
		      i, &aggr_hdl_get[i]);
	}
      }
      for(i=1; i<nproc; i++) ARMCI_Wait(&aggr_hdl_get[i]);
      if(!dryrun)printf("%d: Aggregate Get time  = %.2es\n", me, MP_TIMER()-start_time);
    }

    MP_BARRIER();
    ARMCI_AllFence();
    MP_BARRIER();

    /* Verify */
    if(!(me==0))
      for(j=0; j<elems[1]; j++) {
	if( ARMCI_ABS(ddst_put[me][j]-j*1.001) > 0.1) {
	  ARMCI_Error("aggregate put failed...1", 0);
	}
      }
    MP_BARRIER();
    if(!dryrun)if(me==0) printf("\n  aggregate put ..O.K.\n"); fflush(stdout);

    if(me==0) {
      for(i=1; i<nproc; i++) {
	for(j=0; j<elems[1]; j++) {
	  if( ARMCI_ABS(ddst_get[me][i*elems[1]+j]-j*1.001*(i+1)) > 0.1) {
	    ARMCI_Error("aggregate get failed...1", 0);
	  }
	}
      }
    }
    MP_BARRIER();
    if(!dryrun)if(me==0) printf("  aggregate get ..O.K.\n"); fflush(stdout);


    ARMCI_AllFence();
    MP_BARRIER();
    
    if(!dryrun)if(me==0){printf("O.K.\n"); fflush(stdout);}
    destroy_array((void **)ddst_put);
    destroy_array((void **)ddst_get);
    destroy_array((void **)dsrc);
}
Exemple #3
0
int main(int argc, char *argv[])
{

    int rank, nranks;
    size_t i, msgsize, dest;
    size_t iterations, max_msgsize;
    int bufsize;
    double **buffer;
    double t_start, t_stop, t_total, d_total;
    double expected, bandwidth;
    int provided;
    armci_hdl_t handle;

    MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nranks);

    max_msgsize = MAX_MSGSIZE;
    ARMCI_Init_args(&argc, &argv);

    bufsize = max_msgsize * ITERATIONS;
    buffer = (double **) malloc(sizeof(double *) * nranks);
    ARMCI_Malloc((void **) buffer, bufsize);

    for (i = 0; i < bufsize / sizeof(double); i++)
    {
        *(buffer[rank] + i) = 1.0 + rank;
    }

    ARMCI_INIT_HANDLE(&handle);
    ARMCI_SET_AGGREGATE_HANDLE(&handle);

    ARMCI_Barrier();

    if (rank == 0)
    {

        printf("ARMCI_Get Bandwidth in MBPS \n");
        printf("%20s %22s \n", "Message Size", "Bandwidth");
        fflush(stdout);

        dest = 1;
        expected = 1 + dest;

        for (msgsize = sizeof(double); msgsize <= max_msgsize; msgsize *= 2)
        {

            iterations = bufsize/msgsize;

            t_start = MPI_Wtime();

            for (i = 0; i < iterations; i++)
            {

                ARMCI_NbGet((void *) ((size_t) buffer[dest] + (size_t)(i
                        * msgsize)), (void *) ((size_t) buffer[rank]
                        + (size_t)(i * msgsize)), msgsize, dest, &handle);
            }

            ARMCI_Wait(&handle);

            t_stop = MPI_Wtime();
            d_total = (iterations * msgsize) / (1024 * 1024);
            t_total = t_stop - t_start;
            bandwidth = d_total / t_total;
            printf("%20d %20.4lf \n", msgsize, bandwidth);
            fflush(stdout);

#ifdef DATA_VALIDATION 
            {
                for(j=0; j<((iterations*msgsize)/sizeof(double)); j++)
                {
                    if(*(buffer[rank] + j) != expected)
                    {
                        printf("Data validation failed At displacement : %d Expected : %lf Actual : %lf \n",
                                j, expected, *(buffer[rank] + j));
                        fflush(stdout);
                        return -1;
                    }
                }

                for(j=0; j<bufsize/sizeof(double); j++)
                {
                    *(buffer[rank] + j) = 1.0 + rank;
                }
            }
#endif

        }

    }

    ARMCI_Barrier();

    ARMCI_UNSET_AGGREGATE_HANDLE(&handle);

    ARMCI_Free((void *) buffer[rank]);

    ARMCI_Finalize();

    MPI_Finalize();

    return 0;
}
Exemple #4
0
int main(int argc, char *argv[])
{

    size_t i, rank, nranks, msgsize, dest;
    size_t iterations, max_msgsize;
    int bufsize;
    double **buffer;
    double t_start, t_stop, t_total, d_total;
    double expected, bandwidth;
    int provided;
    armci_hdl_t handle;

    max_msgsize = MAX_MSGSIZE;

    MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nranks);

    ARMCI_Init_args(&argc, &argv);

    bufsize = max_msgsize * ITERATIONS_LARGE;
    buffer = (double **) malloc(sizeof(double *) * nranks);
    ARMCI_Malloc((void **) buffer, bufsize);

    for (i = 0; i < bufsize / sizeof(double); i++)
    {
        *(buffer[rank] + i) = 1.0 + rank;
    }

    ARMCI_INIT_HANDLE(&handle);
    ARMCI_SET_AGGREGATE_HANDLE(&handle);

    ARMCI_Barrier();

    if (rank == 0)
    {

        printf("ARMCI_Put Bandwidth in MBPS \n");
        printf("%20s %22s \n", "Message Size", "Bandwidth");
        fflush(stdout);

        dest = 1;
        expected = 1 + dest;

        for (msgsize = sizeof(double); msgsize <= max_msgsize; msgsize *= 2)
        {

            if (msgsize <= 16 * 1024) iterations = ITERATIONS_VERYSMALL;
            else if (msgsize <= 64 * 1024) iterations = ITERATIONS_SMALL;
            else if (msgsize <= 512 * 1024) iterations = ITERATIONS_MEDIUM;
            else iterations = ITERATIONS_LARGE;

            t_start = MPI_Wtime();

            for (i = 0; i < iterations; i++)
            {

                ARMCI_NbPut((void *) ((size_t) buffer[dest] + (size_t)(i
                           * msgsize)), (void *) ((size_t) buffer[rank]
                           + (size_t)(i * msgsize)), msgsize, dest, &handle);

            }

            ARMCI_Wait(&handle);

            t_stop = MPI_Wtime();
            d_total = (iterations * msgsize) / (1024 * 1024);
            t_total = t_stop - t_start;
            bandwidth = d_total / t_total;
            printf("%20d %20.4lf \n", msgsize, bandwidth);
            fflush(stdout);
           
            ARMCI_Fence(dest);
        }

    }

    ARMCI_Barrier();

    ARMCI_UNSET_AGGREGATE_HANDLE(&handle);

    ARMCI_Free((void *) buffer[rank]);

    ARMCI_Finalize();

    MPI_Finalize(); 

    return 0;
}
Exemple #5
0
int main(int argc, char *argv[])
{

    int i, j, rank, nranks, msgsize, dest;
    int dim, iterations;
    long bufsize;
    double **buffer;
    double t_start, t_stop, t_total, d_total, bw;
    int count[2], src_stride, trg_stride, stride_level;
    int provided;
    armci_hdl_t handle;

    MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nranks);

    ARMCI_Init_args(&argc, &argv);

    bufsize = MAX_DIM * MAX_DIM * sizeof(double);
    buffer = (double **) malloc(sizeof(double *) * nranks);
    ARMCI_Malloc((void **) buffer, bufsize);

    for (i = 0; i < bufsize / sizeof(double); i++)
    {
        *(buffer[rank] + i) = 1.0 + rank;
    }

    ARMCI_INIT_HANDLE(&handle);
    ARMCI_SET_AGGREGATE_HANDLE(&handle);

    ARMCI_Barrier();

    if (rank == 0)
    {
        printf("ARMCI_PutS Bandwidth in MBPS \n");
        printf("%30s %22s \n", "Dimensions(array of doubles)", "Latency");
        fflush(stdout);

        dest = 1;

        src_stride = MAX_DIM * sizeof(double);
        trg_stride = MAX_DIM * sizeof(double);
        stride_level = 1;

        for (dim = 1; dim <= MAX_DIM; dim *= 2)
        {

            count[0] = dim*sizeof(double);
            count[1] = dim;
 
            iterations = 10*(MAX_DIM * MAX_DIM)/(dim * dim);

                t_start = MPI_Wtime();

                for (i = 0; i < iterations; i++)
                {

                    ARMCI_NbPutS((void *) buffer[rank],
                                  &src_stride,
                                  (void *) buffer[dest],
                                  &trg_stride,
                                  count,
                                  stride_level,
                                  dest,
                                  &handle);

                }
                ARMCI_Wait(&handle);
                t_stop = MPI_Wtime();
                ARMCI_Fence(1);

                char temp[10];
                sprintf(temp, "%dX%d", dim, dim);
                t_total = t_stop - t_start;
                d_total = (dim*dim*sizeof(double)*iterations)/(1024*1024);
                bw = d_total/t_total;
                printf("%30s %20.2f \n", temp, bw);
                fflush(stdout);

        }

    }

    ARMCI_Barrier();

    ARMCI_UNSET_AGGREGATE_HANDLE(&handle);

    ARMCI_Free((void *) buffer[rank]);

    ARMCI_Finalize();

    MPI_Finalize();

    return 0;

}