Beispiel #1
0
/*
 * Creates two ARMCI groups from fixed pid lists and runs test_one_group()
 * on each of them in turn, separated by a barrier.  Process 0 prints
 * "O.K." once both phases and the final fence/barrier have completed.
 */
void test_groups() {
    ARMCI_Group groupA, groupB;
    int pid_listA[MAXPROC] = {0,1,2};
    int pid_listB[MAXPROC] = {1,3};

    MP_BARRIER();

    /* group A is created first, then group B */
    ARMCI_Group_create(GNUM_A, pid_listA, &groupA);
    ARMCI_Group_create(GNUM_B, pid_listB, &groupB);

    /* phase 1: only processes that pass the membership check for A */
    if (chk_grp_membership(me, &groupA, pid_listA)) {
        test_one_group(&groupA, pid_listA);
    }

    MP_BARRIER();

    /* phase 2: only processes that pass the membership check for B */
    if (chk_grp_membership(me, &groupB, pid_listB)) {
        test_one_group(&groupB, pid_listB);
    }

    ARMCI_AllFence();
    MP_BARRIER();

    if (me == 0) {
        printf("O.K.\n");
        fflush(stdout);
    }
}
Beispiel #2
0
/*
 * Driver for the lock test: initializes the message-passing layer and
 * ARMCI, runs test_lock(), reports success from process 0 and shuts
 * everything down again.
 *
 * Returns 0.
 */
int main(int argc, char* argv[])
{
    MP_INIT(argc, argv);
    MP_PROCS(&nproc);
    MP_MYID(&me);

    if(me==0){
       printf("ARMCI test program for lock(%d processes)\n",nproc); 
       fflush(stdout);
       sleep(1);
    }
    
    ARMCI_Init();

    test_lock();

    MP_BARRIER();
    if(me==0){printf("test passed\n"); fflush(stdout);}
    sleep(2);

    /* synchronize once more before tearing ARMCI down */
    MP_BARRIER();
    ARMCI_Finalize();
    MP_FINALIZE();
    return(0);
}
Beispiel #3
0
/*
 * Runs the broadcast test and then every reduction operator for the given
 * datatype, first with mode 0 and then with mode 1 of
 * test_gop2_or_reduce().  For ARMCI_DOUBLE and ARMCI_FLOAT only the first
 * six operators are used, i.e. "or" is left out.
 */
void test_collective(const int datatype)
{
    char *op[7] = {"+", "*", "min", "max", "absmax", "absmin", "or"};
    const int floating = (datatype == ARMCI_DOUBLE || datatype == ARMCI_FLOAT);
    const int num_tests = floating ? 6 : 7;
    int mode;
    int k;

    /* test armci_msg_brdcst */
    test_brdcst(datatype);

    /* mode 0: armci_msg_gop2, mode 1: armci_msg_reduce */
    for (mode = 0; mode <= 1; mode++) {
        for (k = 0; k < num_tests; k++) {
            test_gop2_or_reduce(datatype, op[k], mode);
        }
    }

    MP_BARRIER();
    ARMCI_AllFence();
    MP_BARRIER();

    if (me == 0) {
        printf("O.K.\n\n");
        fflush(stdout);
    }
}
Beispiel #4
0
/*
 * Driver for the test of the ARMCI wrappers around basic message-passing
 * operations.  Requires at least two processes; otherwise process 0
 * prints a usage line and every process finalizes and exits.
 *
 * Returns 0.
 */
int main( int argc, char **argv)
{
    MP_INIT(argc,argv);
    MP_MYID(&me);
    MP_PROCS(&nproc);

    if(nproc < 2) {
        /* only a lower bound is enforced here, so report the count we got
           rather than presenting nproc as an upper limit */
        if(me == 0)
            fprintf(stderr,
                    "USAGE: 2 <= processes - got %d\n", nproc);
        MP_BARRIER();
        MP_FINALIZE();
        exit(0);
    }

    if(me == 0){
       printf("Test of ARMCI Wrappers to Basic Message Passing Operations\n");
       fflush(stdout);
    }

    /* initialize ARMCI */
    ARMCI_Init();

    MP_BARRIER();

    TestGlobals();

    /* done */
    ARMCI_Finalize();
    MP_FINALIZE();
    return(0);
}
Beispiel #5
0
/*
 * Releases the file-level MPI window `win`: ends the lock-all epoch and
 * frees the window, with a barrier before and after.
 */
void destroy_safe_array()
{
    MP_BARRIER();
    MPI_Win_unlock_all(win);
    MPI_Win_free(&win);
    MP_BARRIER();
}
Beispiel #6
0
/*
 * For each of 16 mutex indices and each process index below nproc, takes
 * and immediately releases the lock, then synchronizes.  Process 0 prints
 * one '.' per lock/unlock round as a progress indicator.
 */
void test_lock()
{
    int mtx;
    int p;

    if (me == 0) {
        printf("\n");
    }

    for (mtx = 0; mtx < 16; mtx++) {
        for (p = 0; p < nproc; p++) {
            armcill_lock(mtx,p);
            armcill_unlock(mtx,p);

            MP_BARRIER();
            if (me == 0) {
                printf(".");
                fflush(stdout);
            }
            MP_BARRIER();
        }
    }
}
Beispiel #7
0
/*
 * MPI driver for the strided get/put test: initializes MPI, prints the
 * banners from process 0 and calls test_dim() for every dimensionality
 * from 1 to MAXDIMS.
 *
 * Returns 0.
 */
int main(int argc, char* argv[])
{
    int dim_count;

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &nproc);
    MPI_Comm_rank(MPI_COMM_WORLD, &me);

    /* both banners are emitted by process 0 only, each followed by a pause */
    if (me == 0) {
        printf("MPI test program (%d processes)\n",nproc);
        fflush(stdout);
        sleep(1);

        printf("\nTesting strided gets and puts\n");
        printf("(Only std output for process 0 is printed)\n\n");
        fflush(stdout);
        sleep(1);
    }

    dim_count = 1;
    while (dim_count <= MAXDIMS) {
        test_dim(dim_count);
        dim_count++;
    }
    MP_BARRIER();

    MPI_Finalize();
    return(0);
}
Beispiel #8
0
/*
 * Splits a permutation of all process ids into consecutive chunks of
 * GROUP_SIZE, lets every process create the group formed by its own chunk,
 * and runs test_one_group() on that group.
 *
 * The last chunk also absorbs the nprocs % GROUP_SIZE leftover ids.  When
 * there are fewer than GROUP_SIZE processes, a single group containing all
 * of them is used.
 *
 * NOTE(review): this relies on random_permute() filling `pids` with the
 * same permutation on every process - confirm against its definition.
 */
void test_groups_noncollective() {
  int pids[MAXPROC];
  int i, nprocs, world_me;
  ARMCI_Group group;
  int *my_pid_list=NULL, my_grp_size=0;
  int ngrps;

  MP_BARRIER();
  MP_PROCS(&nprocs);
  MP_MYID(&world_me);

  random_permute(pids, nprocs);

  /* always form at least one group, even if nprocs < GROUP_SIZE */
  ngrps = nprocs/GROUP_SIZE;
  if(ngrps < 1) ngrps = 1;

  for(i=0; i<nprocs; i++) {
    if(pids[i] == world_me) {
      /* positions past the last full chunk belong to the last group */
      int grp_id = ARMCI_MIN(i/GROUP_SIZE, ngrps-1);
      my_pid_list = pids + (grp_id*GROUP_SIZE);
      if(grp_id == ngrps-1)
        my_grp_size = nprocs - (grp_id*GROUP_SIZE); /* includes leftovers */
      else
        my_grp_size = GROUP_SIZE;
    }
  }

  /* sort this process's own chunk before handing it to group creation */
  qsort(my_pid_list, my_grp_size, sizeof(int), int_compare);

  MP_BARRIER();
  /*now create all these disjoint groups and test them in parallel*/

  ARMCI_Group_create(my_grp_size, my_pid_list, &group);

  test_one_group(&group, my_pid_list);

  ARMCI_Group_free(&group);

  ARMCI_AllFence();
  MP_BARRIER();

  if(world_me==0){printf("O.K.\n"); fflush(stdout);}
}
Beispiel #9
0
void verify_results(int op, int *elems) {
    int i, j;
      
    switch(op) {

    case PUT:
      if(!(me==0))
	for(j=0; j<elems[1]; j++) {
	  if( ARMCI_ABS(ddst[me][j]-j*1.001) > 0.1) {
	    ARMCI_Error("put failed...Invalid Value Obtained..1", 0);
	  }
	}
      MP_BARRIER();
      if(DEBUG) if(me==0) printf("  verifying put ..O.K.\n");
      break;
      
    case GET:
      if(me==0) {
	for(i=1; i<nproc; i++) {
	  for(j=0; j<elems[1]; j++) {
	    if( ARMCI_ABS(ddst[me][i*elems[1]+j]-j*1.001*(i+1)) > 0.1) 
	      ARMCI_Error("get failed...Invalid Value Obtained..1", 0);
	  }
	}
      }
      MP_BARRIER();
      if(DEBUG) if(me==0) printf("  verifying get ..O.K.\n\n");
      break;
      
    case ACC: 
      if(me==0)
	for(j=0; j<elems[1]; j++) {
	  /*printf("ddst[%d][%d] = %lf\n", me, j, ddst[me][j]);
	    fflush(stdout); */
	  if( ARMCI_ABS(ddst[me][j]-(double)nproc) > 0.1) {
	    ARMCI_Error("accumulate failed...Invalid Value Obtained..1", 0);
	  }
	}
      MP_BARRIER();
      if(DEBUG)if(me==0) printf("  verifying accumulate ..O.K.\n"); 
      break;
      
    default:
      ARMCI_Error("Invalid Operation", 0);
    }
    fflush(stdout);
}
Beispiel #10
0
/*
 * Driver for the ARMCI group tests.  Runs the collective group test and,
 * when ARMCI_GROUP is defined, the non-collective one as well; process 0
 * prints a success line after each.
 *
 * Returns 0.
 */
int main(int argc, char* argv[])
{
    MP_INIT(argc, argv);
    MP_PROCS(&nproc);
    MP_MYID(&me);

    /* process 0 reports an unsupported process count */
    if (me == 0 && (nproc < MINPROC || nproc > MAXPROC)) {
        ARMCI_Error("Test works for up to %d processors\n",MAXPROC);
    }

    if (me == 0) {
        printf("ARMCI test program (%d processes)\n",nproc);
        fflush(stdout);
        sleep(1);
    }

    ARMCI_Init();

    if (me == 0) {
        printf("\n Testing ARMCI Groups!\n\n");
        fflush(stdout);
    }

    test_groups();

    ARMCI_AllFence();
    MP_BARRIER();
    if (me == 0) {
        printf("\n Collective groups: Success!!\n");
        fflush(stdout);
    }
    sleep(2);

#ifdef ARMCI_GROUP
    test_groups_noncollective();

    ARMCI_AllFence();
    MP_BARRIER();
    if (me == 0) {
        printf("\n Non-collective groups: Success!!\n");
        fflush(stdout);
    }
    sleep(2);
#endif

    MP_BARRIER();
    ARMCI_Finalize();
    MP_FINALIZE();
    return(0);
}
Beispiel #11
0
/*
 * Prints the command-line synopsis (only when `rank` is 0), then
 * synchronizes, finalizes the message-passing layer and exits with
 * status 0.
 */
void usage()
{
    if (rank == 0) {
        printf("Usage: test_mt, or \n");
        printf("       test_mt -tTHREADS_PER_PROC -sARRAY_SIZE -iITERATIONS_COUNT\n");
    }

    MP_BARRIER();
    MP_FINALIZE();
    exit(0);
}
Beispiel #12
0
/*
 * Driver for the non-blocking put/get/acc timing test.  Needs between 2
 * and MAXPROC processes; runs test_perf_nb() once with argument 1 and
 * once with argument 0.
 *
 * Returns 0.
 */
int main(int argc, char* argv[])
{
    ARMCI_NetInit();

    MP_INIT(argc, argv);
    MP_PROCS(&nproc);
    MP_MYID(&me);

    /* unsupported process count: report from process 0 and leave quietly */
    if (nproc < 2 || nproc > MAXPROC) {
        if (me == 0) {
            fprintf(stderr,
                    "USAGE: 2 <= processes < %d - got %d\n", MAXPROC, nproc);
        }
        MP_BARRIER();
        MP_FINALIZE();
        exit(0);
    }

    if (me == 0) {
        printf("ARMCI test program (%d processes)\n",nproc);
        fflush(stdout);
        sleep(1);
    }

    ARMCI_Init();

    if (me == 0) {
        printf("\n put/get/acc requests (Time in secs)\n\n");
        fflush(stdout);
    }

    test_perf_nb(1);
    test_perf_nb(0);

    ARMCI_AllFence();
    MP_BARRIER();
    if (me == 0) {
        printf("\nSuccess!!\n");
        fflush(stdout);
    }
    sleep(2);

    MP_BARRIER();
    ARMCI_Finalize();
    MP_FINALIZE();
    return(0);
}
Beispiel #13
0
/*
 * Driver for the blocking-communication performance test.  Sequence:
 *   1. one 1-D pass with CHECK_RESULT set,
 *   2. 1-D and 2-D passes with CHECK_RESULT cleared,
 *   3. 1-D and 2-D passes with CHECK_RESULT set again, each followed by
 *      an "OK" line from process 0.
 *
 * Returns 0.
 */
int main(int argc, char **argv)
{
    ARMCI_NetInit();
    MP_INIT(argc,argv);
    MP_MYID(&me);
    MP_PROCS(&nproc);

    if (nproc < 2 || nproc > MAXPROC) {
        if (me == 0) {
            fprintf(stderr,
                    "USAGE: 2 <= processes < %d - got %d\n", MAXPROC, nproc);
        }
        MP_BARRIER();
        MP_FINALIZE();
        exit(0);
    }

    /* initialize ARMCI */
    ARMCI_Init();

    if (me == 0) {
        printf("\n             Performance of Basic Blocking Communication Operations\n");
    }
    MP_BARRIER();

    /* first 1-D pass runs with result checking switched on */
    CHECK_RESULT=1;
    if (me == 0) {
        printf("\n\t\t\tContiguous Data Transfer\n");
    }
    test_1D();
    CHECK_RESULT=0;

    /* test 1 dimension array */
    if (me == 0) {
        printf("\n\t\t\tContiguous Data Transfer\n");
    }
    test_1D();

    /* test 2 dimension array */
    if (me == 0) {
        printf("\n\t\t\tStrided Data Transfer\n");
    }
    test_2D();

    MP_BARRIER();
    if (me == 0) {
        if (warn_accuracy) {
            printf("\nWARNING: Your timer does not have sufficient accuracy for this test (%d)\n",warn_accuracy);
        }
        printf("\n\n------------ Testing the same data transfer for correctness ----------\n");
        fflush(stdout);
    }

    /* repeat both transfers with result checking switched on */
    MP_BARRIER();
    CHECK_RESULT=1;
    if (me == 0) {
        printf("\n\t\t\tContiguous Data Transfer\n");
    }
    test_1D();
    if (me == 0) {
        printf("OK\n");
    }
    MP_BARRIER();
    if (me == 0) {
        printf("\n\t\t\tStrided Data Transfer\n");
    }
    test_2D();
    if (me == 0) {
        printf("OK\n\n\nTests Completed.\n");
    }
    MP_BARRIER();

    /* done */
    ARMCI_Finalize();
    MP_FINALIZE();
    return(0);
}
Beispiel #14
0
/*
 * Allocates an MPI window large enough for an ndim-dimensional array of
 * elem_size-byte elements, opens a lock-all epoch on it and stores the
 * local base address in *a.
 *
 *   a         - receives the local base pointer of the window memory
 *   elem_size - size of one element in bytes
 *   ndim      - number of dimensions, must not exceed MAXDIMS
 *   dims      - extent of each dimension
 *
 * The window handle is kept in the file-level variable `win`.
 */
void create_safe_array(void **a, int elem_size, int ndim, int dims[])
{
     void *win_base;
     int total_bytes = elem_size;
     int d;

     assert(ndim<=MAXDIMS);

     /* total size = element size times the product of all extents */
     for (d = 0; d < ndim; d++) {
         total_bytes *= dims[d];
     }

     MPI_Win_allocate(total_bytes, 1, MPI_INFO_NULL, MPI_COMM_WORLD, (void *)&win_base, &win);
     MPI_Win_lock_all(MPI_MODE_NOCHECK, win);
     assert(win_base);

     *a = win_base;
     MP_BARRIER();
}
Beispiel #15
0
/*
 * Driver for the aggregate put/get test: runs test_aggregate() once as a
 * cold start (argument 1) and once as a warm start (argument 0).
 *
 * Returns 0.
 */
int main(int argc, char* argv[])
{
    MP_INIT(argc, argv);
    MP_PROCS(&nproc);
    MP_MYID(&me);

    /* process 0 reports an unsupported process count */
    if (me == 0 && nproc > MAXPROC) {
        ARMCI_Error("Test works for up to %d processors\n",MAXPROC);
    }

    if (me == 0) {
        printf("ARMCI test program (%d processes)\n",nproc);
        fflush(stdout);
        sleep(1);
    }

    ARMCI_Init();

    if (me == 0) {
        printf("\nAggregate put/get requests\n\n");
        fflush(stdout);
    }
    test_aggregate(1); /* cold start */
    test_aggregate(0); /* warm start */

    ARMCI_AllFence();
    MP_BARRIER();
    if (me == 0) {
        printf("\nSuccess!!\n");
        fflush(stdout);
    }
    sleep(2);

    MP_BARRIER();
    ARMCI_Finalize();
    MP_FINALIZE();
    return(0);
}
Beispiel #16
0
/*
 * Driver that runs the accumulate-type tests and the collective tests,
 * timing each of the two suites with gettimeofday(), and finally prints a
 * small "Name,Time(seconds)" table from process 0.
 *
 * Returns 0.
 */
int main(int argc, char* argv[])
{
    /* one entry per tested datatype: the type code and the banner that
       process 0 prints before the test runs */
    struct typed_case { int type; const char *banner; };

    struct typed_case acc_cases[] = {
       { ARMCI_ACC_INT, "Test Accumulate ARMCI_ACC_INT\n" },
       { ARMCI_ACC_LNG, "Test Accumulate ARMCI_ACC_LNG\n" },
       { ARMCI_ACC_FLT, "Test Accumulate ARMCI_ACC_FLT\n" },
       { ARMCI_ACC_DBL, "Test Accumulate ARMCI_ACC_DBL\n" },
       { ARMCI_ACC_CPL, "Test Accumulate ARMCI_ACC_CPL\n" },
       { ARMCI_ACC_DCP, "Test Accumulate ARMCI_ACC_DCP\n" }
    };
    struct typed_case coll_cases[] = {
       { ARMCI_INT,    "Test Collective ARMCI_INT\n" },
       { ARMCI_LONG,   "Test Collective ARMCI_LONG\n" },
       { ARMCI_FLOAT,  "Test Collective ARMCI_FLOAT\n" },
       { ARMCI_DOUBLE, "Test Collective ARMCI_DOUBLE\n" }
    };
    const int n_acc  = (int) (sizeof(acc_cases) / sizeof(acc_cases[0]));
    const int n_coll = (int) (sizeof(coll_cases) / sizeof(coll_cases[0]));

    struct timeval start_time[2];
    struct timeval stop_time[2];
    char * test_name[2] = { "acc_type", "collective" };
    int test_flags[2]   = { 1, 1 };
    int c;
    int t;

#define TEST_ACC_TYPE   0
#define TEST_COLLECTIVE 1

    MP_INIT(argc, argv);
    ARMCI_Init();
    MP_PROCS(&nproc);
    MP_MYID(&me);

    if (me == 0 && nproc > MAXPROC)
    {
       ARMCI_Error("Test works for up to %d processors\n",MAXPROC);
    }

    if (me == 0)
    {
       printf("ARMCI test program (%d processes)\n",nproc);
       fflush(stdout);
       sleep(1);
    }

    /* ---- accumulate suite ---- */
    gettimeofday(&start_time[TEST_ACC_TYPE],NULL);
    if (test_flags[TEST_ACC_TYPE] == 1)
    {
       if (me == 0)
       {
          printf("\nTesting Accumulate Types\n");
          fflush(stdout);
       }

       MP_BARRIER();
       for (c = 0; c < n_acc; c++)
       {
          if (me == 0)
          {
             printf("%s", acc_cases[c].banner);
             fflush(stdout);
          }
          test_acc_type(acc_cases[c].type);
          ARMCI_AllFence();
          MP_BARRIER();
       }
    }
    gettimeofday(&stop_time[TEST_ACC_TYPE],NULL);

    /* ---- collective suite ---- */
    gettimeofday(&start_time[TEST_COLLECTIVE],NULL);
    if (test_flags[TEST_COLLECTIVE] == 1)
    {
       if (me == 0)
       {
          printf("\nTesting Collective Types\n");
          fflush(stdout);
       }
       for (c = 0; c < n_coll; c++)
       {
          if (me == 0)
          {
             printf("%s", coll_cases[c].banner);
             fflush(stdout);
          }
          MP_BARRIER();
          test_collective(coll_cases[c].type);
          MP_BARRIER();
       }
    }
    gettimeofday(&stop_time[TEST_COLLECTIVE],NULL);

    if (me == 0)
    {
       printf("Accumulate and Collective tests passed\n");
       fflush(stdout);
    }

    /* timing summary, one line per enabled suite */
    if (me == 0)
    {
       printf("Testcase runtime\n");
       printf("Name,Time(seconds)\n");
       for (t = 0; t < 2; t++)
       {
          if (test_flags[t] == 1)
          {
             double time_spent = (stop_time[t].tv_sec - start_time[t].tv_sec) + ((double) stop_time[t].tv_usec - start_time[t].tv_usec) / 1E6;
             printf("%s,%.6f\n", test_name[t], time_spent);
          }
       }
    }

    MP_BARRIER();
    ARMCI_Finalize();
    MP_FINALIZE();
    return(0);
}
Beispiel #17
0
/*
 * Accumulate test for one ARMCI accumulate datatype.
 *
 * Every process fills a local source vector with a[i] = i + me, zeroes its
 * part of the array created by create_array(), and then accumulates the
 * source vector into the array of every process (itself included) with
 * ARMCI_AccS().  Afterwards each process checks its own array against the
 * expected value
 *
 *     i * nproc + (0 + 1 + ... + nproc-1)
 *
 * For the two complex types the scale is (2 + 1j) and real and imaginary
 * source parts are equal, so the expected result is
 * (expected) + (3 * expected)j.
 *
 *   datatype - one of ARMCI_ACC_INT, _LNG, _FLT, _DBL, _CPL, _DCP; any
 *              other value makes the function return without doing
 *              anything.
 *
 * Any mismatch is reported through printf() followed by ARMCI_Error().
 */
void test_acc_type(const int datatype)
{
    int i = 0;
    int datatype_size = 0;
    void * scale = NULL;
    void * a = NULL;
    void *b[MAXPROC];
    int elems = ELEMS;
    int dim = 1;
    int count = 0;
    int strideA = 0;
    int strideB = 0;
    /* sum of all process ids; nproc * (nproc - 1) is always even, so this
       is exact for odd process counts too */
    const int rank_sum = nproc * (nproc - 1) / 2;

    switch(datatype)
    {
       case ARMCI_ACC_INT: datatype_size = sizeof(int);     break;
       case ARMCI_ACC_LNG: datatype_size = sizeof(long);    break;
       case ARMCI_ACC_FLT: datatype_size = sizeof(float);   break;
       case ARMCI_ACC_DBL: datatype_size = sizeof(double);  break;
       case ARMCI_ACC_CPL: datatype_size = sizeof(cmpl_t);  break;
       case ARMCI_ACC_DCP: datatype_size = sizeof(dcmpl_t); break;
       default:
          return;
    }

    scale = malloc(datatype_size);
    a = malloc(elems * datatype_size);
    if(scale == NULL || a == NULL)
       ARMCI_Error("test_acc_type: malloc failed\n",0);
    create_array((void**)b, datatype_size, dim, &elems);

    /* set the scale factor, the local source vector and the zeroed target */
    switch(datatype)
    {
       case ARMCI_ACC_INT:
          *((int *) scale) = 1;
          for(i = 0; i < elems; i++)
          {
             ((int *) a)[i] = i + me;
             ((int *) b[me])[i] = 0;
          }
          break;
       case ARMCI_ACC_LNG:
          *((long *) scale) = 1;
          for(i = 0; i < elems; i++)
          {
             ((long *) a)[i] = i + me;
             ((long *) b[me])[i] = 0;
          }
          break;
       case ARMCI_ACC_FLT:
          *((float *) scale) = 1.0;
          for(i = 0; i < elems; i++)
          {
             ((float *) a)[i] = (float) i + me;
             ((float *) b[me])[i] = 0.0;
          }
          break;
       case ARMCI_ACC_DBL:
          *((double *) scale) = 1.0;
          for(i = 0; i < elems; i++)
          {
             ((double *) a)[i] = (double) i + me;
             ((double *) b[me])[i] = 0.0;
          }
          break;
       case ARMCI_ACC_CPL:
          ((cmpl_t *) scale)->real = 2.0;
          ((cmpl_t *) scale)->imag = 1.0;
          for(i = 0; i < elems; i++)
          {
             ((cmpl_t *) a)[i].real = ((float) i + me);
             ((cmpl_t *) a)[i].imag = ((float) i + me);
             ((cmpl_t *) b[me])[i].real = 0.0;
             ((cmpl_t *) b[me])[i].imag = 0.0;
          }
          break;
       case ARMCI_ACC_DCP:
          ((dcmpl_t *) scale)->real = 2.0;
          ((dcmpl_t *) scale)->imag = 1.0;
          for(i = 0; i < elems; i++)
          {
             ((dcmpl_t *) a)[i].real = ((double) i + me);
             ((dcmpl_t *) a)[i].imag = ((double) i + me);
             ((dcmpl_t *) b[me])[i].real = 0.0;
             ((dcmpl_t *) b[me])[i].imag = 0.0;
          }
          break;
       default:
          break;
    }

    count = elems * datatype_size;
    strideA = elems * datatype_size;
    strideB = elems * datatype_size;

    ARMCI_AllFence();
    MP_BARRIER();

    /* accumulate into every process, starting with ourselves */
    for(i = 0; i < nproc; i++)
       ARMCI_AccS(datatype, scale, a, &strideA, b[(me + i) % nproc], &strideB, &count, 0, (me + i) % nproc);

    ARMCI_AllFence();
    MP_BARRIER();

    switch(datatype)
    {
       case ARMCI_ACC_INT:
          for(i = 0; i < elems; i++)
          {
             int compare = (i * nproc) + rank_sum;
             if(((int *)b[me])[i] != compare) 
             {
                printf("ERROR accumulate ARMCI_ACC_INT [%d] = %d != %d\n", i, ((int *)b[me])[i], compare);
                ARMCI_Error("test_acc_type failed\n",0);
             }
          }
          break;
       case ARMCI_ACC_LNG:
          for(i = 0; i < elems; i++)
          {
             long compare = (i * nproc) + rank_sum;
             if(((long *)b[me])[i] != compare) 
             {
                /* print the element through its real type */
                printf("ERROR accumulate ARMCI_ACC_LNG [%d] = %ld != %ld\n", i, ((long *)b[me])[i], compare);
                ARMCI_Error("test_acc_type failed\n",0);
             }
          }
          break;
       case ARMCI_ACC_FLT:
          for(i = 0; i < elems; i++)
          {
             float compare = (float) ((i * nproc) + rank_sum);
             if(((float *)b[me])[i] != compare) 
             {
                printf("ERROR accumulate ARMCI_ACC_FLT [%d] = %f != %f\n", i, ((float *)b[me])[i], compare);
                ARMCI_Error("test_acc_type failed\n",0);
             }
          }
          break;
       case ARMCI_ACC_DBL:
          for(i = 0; i < elems; i++)
          {
             double compare = (double) ((i * nproc) + rank_sum);
             if(((double *)b[me])[i] != compare) 
             {
                printf("ERROR accumulate ARMCI_ACC_DBL [%d] = %f != %f \n", i, ((double *)b[me])[i], compare);
                ARMCI_Error("test_acc_type failed\n",0);
             }
          }
          break;
       case ARMCI_ACC_CPL:
          for(i = 0; i < elems; i++)
          {
             float compare = (float) ((i * nproc) + rank_sum);
             /* fail if EITHER component is wrong */
             if(((cmpl_t *)b[me])[i].real != compare || ((cmpl_t *)b[me])[i].imag != 3 * compare) 
             {
                printf("ERROR accumulate ARMCI_ACC_CPL [%d] = %f + %fj != %f + %fj\n", i, ((cmpl_t *)b[me])[i].real, ((cmpl_t *)b[me])[i].imag, compare, 3 * compare);
                ARMCI_Error("test_acc_type failed\n",0);
             }
          }
          break;
       case ARMCI_ACC_DCP:
          for(i = 0; i < elems; i++)
          {
             double compare = (double) ((i * nproc) + rank_sum);
             /* fail if EITHER component is wrong */
             if(((dcmpl_t *)b[me])[i].real != compare || ((dcmpl_t *)b[me])[i].imag != 3 * compare) 
             {
                printf("ERROR accumulate ARMCI_ACC_DCP [%d] = %f + %fj != %f + %fj\n", i, ((dcmpl_t *)b[me])[i].real, ((dcmpl_t *)b[me])[i].imag, compare, 3 * compare);
                ARMCI_Error("test_acc_type failed\n",0);
             }
          }
          break;
       default:
          break;
    }

    MP_BARRIER();
    ARMCI_AllFence();
    MP_BARRIER();
    
    if(me==0){printf("O.K.\n\n"); fflush(stdout);}    
    destroy_array((void**)b);
    free(a);
    free(scale);
}
Beispiel #18
0
/*
 * Driver for the blocked dense LU factorization benchmark.
 *
 * Parses -n (matrix size), -b (block size), -p (process count) and -h,
 * derives the process grid and block counts, allocates the distributed
 * matrix and the row/column buffers through ARMCI, runs lu() and prints
 * the elapsed time from process 0.
 *
 * Returns 0.
 */
int main(int argc, char *argv[])
{
  int i, j;
  int ch;
  extern char *optarg;
  int edge;
  int size;
    
  /* ARMCI */
  void **ptr;
  double **ptr_loc;
  void **bufr_g, **bufc_g;

  MP_INIT(argc,argv);
  MP_PROCS(&nproc);
  MP_MYID(&me);
    
  while ((ch = getopt(argc, argv, "n:b:p:h")) != -1) {
    switch(ch) {
    case 'n': n = atoi(optarg); break;
    case 'b': block_size = atoi(optarg); break;
    /* NOTE(review): -p overwrites the process count obtained from
       MP_PROCS above - confirm this is intended */
    case 'p': nproc = atoi(optarg); break;
    case 'h': {
      printf("Usage: LU, or \n");
      printf("       LU -nMATRIXSIZE -bBLOCKSIZE -pNPROC\n");
      MP_BARRIER();
      MP_FINALIZE();
      exit(0);
    }            
    }
  }
    
  if(me == 0) {
    printf("\nUsing pre-PUTing\n");
    printf("\n Blocked Dense LU Factorization\n");
    printf("     %d by %d Matrix\n", n, n);
    printf("     %d Processors\n", nproc);
    printf("     %d by %d Element Blocks\n", block_size, block_size);
    printf("\n");
  }
    
  /* pick the largest num_rows <= sqrt(nproc) that divides nproc */
  num_rows = (int) sqrt((double) nproc);
  for (;;) {
    num_cols = nproc/num_rows;
    if (num_rows*num_cols == nproc)
      break;
    num_rows--;
  }
    
  /* number of blocks per dimension, rounded up */
  nblocks = n/block_size;
  if (block_size * nblocks != n) {
    nblocks++;
  }
    
  /* extent of the last (possibly partial) block in each dimension */
  edge = n%block_size;
  if (edge == 0) {
    edge = block_size;
  }
    
  #ifdef DEBUG
  if(me == 0)
    for (i=0;i<nblocks;i++) {
      for (j=0;j<nblocks;j++) 
        printf("%d ", block_owner(i, j));
      printf("\n");
    }
  MP_BARRIER();
  MP_FINALIZE();
  exit(0);
  #endif
    
  /* add up the bytes of all blocks owned by this process */
  for (i=0;i<nblocks;i++) {
    for (j=0;j<nblocks;j++) {
      if(block_owner(i,j) == me) {
        if ((i == nblocks-1) && (j == nblocks-1)) {
          size = edge*edge;
        }
        else if ((i == nblocks-1) || (j == nblocks-1)) {
          size = edge*block_size;
        }
        else {
          size = block_size*block_size;
        }
        proc_bytes += size*sizeof(double);
      }
    }
  }
    
  /* initialize ARMCI */
  ARMCI_Init();
  ptr = (void **)ARMCI_Malloc_local(nproc * sizeof(void *));
  ARMCI_Malloc(ptr, proc_bytes);
  
  a = (double **)ARMCI_Malloc_local(nblocks*nblocks*sizeof(double *));
  if (a == NULL) {
    fprintf(stderr, "Could not malloc memory for a\n");
    exit(-1);
  } 
  /* a[i+j*nblocks] gets the start address of block (i,j) inside its
     owner's segment; ptr_loc[] is a running cursor per owner */
  ptr_loc = (double **)ARMCI_Malloc_local(nproc*sizeof(double *));
  for(i=0; i<nproc; i++) ptr_loc[i] = (double *)ptr[i];
  for(i=0; i<nblocks;i ++) {
    for(j=0; j<nblocks; j++) {
      a[i+j*nblocks] = ptr_loc[block_owner(i, j)];
      if ((i == nblocks-1) && (j == nblocks-1)) {
        size = edge*edge;
      } else if ((i == nblocks-1) || (j == nblocks-1)) {
        size = edge*block_size;
      } else {
        size = block_size*block_size;
      }
      ptr_loc[block_owner(i, j)] += size;
    }
  }
    
  /* initialize the array */
  init_array();
  
  bufr = (double **)ARMCI_Malloc_local(nproc*nblocks * sizeof(double *));
  bufc = (double **)ARMCI_Malloc_local(nproc*nblocks * sizeof(double *));

  if (bufr == NULL || bufc == NULL) {
    /* both tables are written right below, so stop here instead of
       dereferencing a null pointer */
    printf("Could not ARMCI_Malloc_local() mem\n");
    exit(-1);
  }
  /* bufr points to all k-th row blocks */
  /* save all block address in row-major order */
  proc_bytes = nblocks*block_size*block_size * sizeof(double);
  bufr_g = (void **)ARMCI_Malloc_local(nproc * sizeof(void *));
  ARMCI_Malloc(bufr_g, proc_bytes);

  for (i = 0; i < nproc; i++) {
    bufr[i*nblocks] = (double *) bufr_g[i];
    for (j = 1; j < nblocks; j++) {
      bufr[i*nblocks + j]  = bufr[i*nblocks + j-1] + block_size * block_size;
    }
  }

  /* bufc points to all k-th column blocks */
  bufc_g = (void **)ARMCI_Malloc_local(nproc * sizeof(void *));
  ARMCI_Malloc(bufc_g, proc_bytes);

  for (i = 0; i < nproc; i++) {
    bufc[i*nblocks] = (double *) bufc_g[i];
    for (j = 1; j < nblocks; j++) {
      bufc[i*nblocks + j]  = bufc[i*nblocks + j-1] + block_size * block_size;
    }
  }

  /* barrier to ensure all initialization is done */
  MP_BARRIER();

  /* to remove cold-start misses, all processors touch their own data */
  touch_array(block_size, me);
  MP_BARRIER();

  if(doprint) {
    if(me == 0) {
      printf("Matrix before LU decomposition\n");
      print_array(me); 
    }
    MP_BARRIER();
  }  

  /* Starting the timer */
  if(me == 0) start_timer();

  lu(n, block_size, me);
  
  MP_BARRIER();

  /* Timer Stops here */
  if(me == 0) 
  printf("\nRunning time = %lf milliseconds.\n\n",  elapsed_time());

  if(doprint) {        
    if(me == 0) {
      printf("after LU\n");
      print_array(me);
    }
    MP_BARRIER();
  }
    
  /* done */
  ARMCI_Free(ptr[me]);
  ARMCI_Free(bufc_g[me]);
  ARMCI_Free(bufr_g[me]);
  ARMCI_Finalize();
  MP_FINALIZE();
  return 0;
}
Beispiel #19
0
/*
 * Frees this process's entry of an array of per-process pointers.
 *
 *   ptr - table of per-process pointers; ptr[me] is passed to ARMCI_Free()
 *
 * The free is performed outside the assert() so that it still happens
 * when the file is compiled with NDEBUG; only the check on its return
 * value is compiled out in that case.
 */
void destroy_array(void *ptr[])
{
    int rc;

    MP_BARRIER();

    rc = ARMCI_Free(ptr[me]);
    assert(!rc);
    (void)rc; /* silences the unused-variable warning under NDEBUG */
}
Beispiel #20
0
/*
 * Compares several ways of moving MAXELEMS individual doubles between
 * process 0 and every other process: value put, vector put/get, regular
 * non-blocking put/get and aggregate put/get.  Process 0 issues all
 * transfers and prints the timings; afterwards the put destinations are
 * checked on the other processes and the get destinations on process 0.
 *
 *   dryrun - when non-zero, the transfers and checks still run but the
 *            timing and "O.K." lines are not printed.
 *
 * NOTE(review): hdl_put/hdl_get have MAXELEMS entries but are also indexed
 * by process id in the vector put/get sections, and src_ptr/dst_ptr have
 * MAX_REQUESTS entries but are indexed up to MAXELEMS - confirm the
 * constants make this safe.
 */
void test_aggregate(int dryrun) {
  
    int i, j, rc, bytes, elems[2] = {MAXPROC, MAXELEMS};
    double *ddst_put[MAXPROC];
    double *ddst_get[MAXPROC];
    double *dsrc[MAXPROC];
    armci_hdl_t aggr_hdl_put[MAXPROC];
    armci_hdl_t aggr_hdl_get[MAXPROC];
    armci_hdl_t hdl_put[MAXELEMS];
    armci_hdl_t hdl_get[MAXELEMS];
    armci_giov_t darr;
    void *src_ptr[MAX_REQUESTS], *dst_ptr[MAX_REQUESTS];
    int start = 0, end = 0;
    double start_time;
        
    create_array((void**)ddst_put, sizeof(double),2, elems);
    create_array((void**)ddst_get, sizeof(double),2, elems);
    create_array((void**)dsrc, sizeof(double),1, &elems[1]);
    
    /* source value j on process p is j*1.001*(p+1); destinations start at 0 */
    for(i=0; i<elems[1]; i++) dsrc[me][i]=i*1.001*(me+1);
    for(i=0; i<elems[0]*elems[1]; i++) {
      ddst_put[me][i]=0.0;
      ddst_get[me][i]=0.0;
    }
    
    MP_BARRIER();

    /* only proc 0 does the work */
    if(me == 0) {
      if(!dryrun)printf("Transferring %d doubles (Not an array of %d doubles)\n", MAXELEMS, MAXELEMS);
      
      /* initializing non-blocking handles */
      for(i=0; i<elems[1]; i++) ARMCI_INIT_HANDLE(&hdl_put[i]);
      for(i=0; i<elems[1]; i++) ARMCI_INIT_HANDLE(&hdl_get[i]);
      
      /* aggregate handles */
      for(i=0; i<nproc; i++) ARMCI_INIT_HANDLE(&aggr_hdl_put[i]);
      for(i=0; i<nproc; i++) ARMCI_INIT_HANDLE(&aggr_hdl_get[i]);
      for(i=0; i<nproc; i++) ARMCI_SET_AGGREGATE_HANDLE(&aggr_hdl_put[i]);
      for(i=0; i<nproc; i++) ARMCI_SET_AGGREGATE_HANDLE(&aggr_hdl_get[i]);    
      
      bytes = sizeof(double);
      
      /* **************** PUT **************** */    
      /* register put */
      start_time=MP_TIMER();
      start = 0; end = elems[1]; 
      for(i=1; i<nproc; i++) {
        for(j=start; j<end; j++) {  
          ARMCI_NbPutValueDouble(dsrc[me][j], &ddst_put[i][me*elems[1]+j], i, 
                                 &hdl_put[j]);
        }
        for(j=start; j<end; j++) ARMCI_Wait(&hdl_put[j]);
      }
      if(!dryrun)printf("%d: Value Put time      = %.2es\n", me, MP_TIMER()-start_time);
 
      /* vector put */
      start_time=MP_TIMER();
      for(i=1; i<nproc; i++) {
        for(j=start; j<end; j++) {
          src_ptr[j] = (void *)&dsrc[me][j];
          dst_ptr[j] = (void *)&ddst_put[i][me*elems[1]+j];
        }
        darr.src_ptr_array = src_ptr;
        darr.dst_ptr_array = dst_ptr;
        darr.bytes = sizeof(double);
        darr.ptr_array_len = elems[1];
        if((rc=ARMCI_NbPutV(&darr, 1, i, &hdl_put[i])))
          ARMCI_Error("armci_nbputv failed\n",rc);
      }
      for(i=1; i<nproc; i++) ARMCI_Wait(&hdl_put[i]);
      if(!dryrun)printf("%d: Vector Put time     = %.2es\n", me, MP_TIMER()-start_time);
      
      /* regular put */
      start_time=MP_TIMER();    
      for(i=1; i<nproc; i++) {
        for(j=start; j<end; j++) {  
          if((rc=ARMCI_NbPut(&dsrc[me][j], &ddst_put[i][me*elems[1]+j], bytes,
                             i, &hdl_put[j])))
            ARMCI_Error("armci_nbput failed\n",rc);
        }
        for(j=start; j<end; j++) ARMCI_Wait(&hdl_put[j]);
      }
      if(!dryrun)printf("%d: Regular Put time    = %.2es\n", me, MP_TIMER()-start_time);
      
      /* aggregate put: one handle per destination process */
      start_time=MP_TIMER();
      for(i=1; i<nproc; i++) {
        for(j=start; j<end; j++) {  
          if((rc=ARMCI_NbPut(&dsrc[me][j], &ddst_put[i][me*elems[1]+j], bytes,
                             i,  &aggr_hdl_put[i])))
            ARMCI_Error("armci_nbput failed\n",rc);
        }
      }
      for(i=1; i<nproc; i++) ARMCI_Wait(&aggr_hdl_put[i]);
      if(!dryrun)printf("%d: Aggregate Put time  = %.2es\n\n", me, MP_TIMER()-start_time);
      
      
      /* **************** GET **************** */    
      
      /* vector get */
      start_time=MP_TIMER();
      for(i=1; i<nproc; i++) {
        for(j=start; j<end; j++) {
          src_ptr[j] = (void *)&dsrc[i][j];
          dst_ptr[j] = (void *)&ddst_get[me][i*elems[1]+j];
        }
        darr.src_ptr_array = src_ptr;
        darr.dst_ptr_array = dst_ptr;
        darr.bytes = sizeof(double);
        darr.ptr_array_len = elems[1];
        if((rc=ARMCI_NbGetV(&darr, 1, i, &hdl_get[i])))
          ARMCI_Error("armci_nbgetv failed\n",rc);
        ARMCI_Wait(&hdl_get[i]);
      }
      if(!dryrun)printf("%d: Vector Get time     = %.2es\n", me, MP_TIMER()-start_time);
      
      /* regular get */
      start_time=MP_TIMER();    
      for(i=1; i<nproc; i++) {
        for(j=start; j<end; j++) {  
          if((rc=ARMCI_NbGet(&dsrc[i][j], &ddst_get[me][i*elems[1]+j], bytes,
                             i, &hdl_get[j])))
            ARMCI_Error("armci_nbget failed\n",rc);
        }
        for(j=start; j<end; j++) ARMCI_Wait(&hdl_get[j]);
      }
      if(!dryrun)printf("%d: Regular Get time    = %.2es\n", me, MP_TIMER()-start_time);
      
      /* aggregate get: return code checked like the other Nb calls */
      start_time=MP_TIMER();
      for(i=1; i<nproc; i++) {
        for(j=start; j<end; j++) {  
          if((rc=ARMCI_NbGet(&dsrc[i][j], &ddst_get[me][i*elems[1]+j], bytes,
                             i, &aggr_hdl_get[i])))
            ARMCI_Error("armci_nbget failed\n",rc);
        }
      }
      for(i=1; i<nproc; i++) ARMCI_Wait(&aggr_hdl_get[i]);
      if(!dryrun)printf("%d: Aggregate Get time  = %.2es\n", me, MP_TIMER()-start_time);
    }

    MP_BARRIER();
    ARMCI_AllFence();
    MP_BARRIER();

    /* Verify: every process but 0 checks what process 0 put into it */
    if(!(me==0))
      for(j=0; j<elems[1]; j++) {
        if( ARMCI_ABS(ddst_put[me][j]-j*1.001) > 0.1) {
          ARMCI_Error("aggregate put failed...1", 0);
        }
      }
    MP_BARRIER();
    if(!dryrun && me==0) {
      printf("\n  aggregate put ..O.K.\n");
    }
    fflush(stdout); /* unconditional, as before */

    /* process 0 checks what it fetched from every other process */
    if(me==0) {
      for(i=1; i<nproc; i++) {
        for(j=0; j<elems[1]; j++) {
          if( ARMCI_ABS(ddst_get[me][i*elems[1]+j]-j*1.001*(i+1)) > 0.1) {
            ARMCI_Error("aggregate get failed...1", 0);
          }
        }
      }
    }
    MP_BARRIER();
    if(!dryrun && me==0) {
      printf("  aggregate get ..O.K.\n");
    }
    fflush(stdout); /* unconditional, as before */


    ARMCI_AllFence();
    MP_BARRIER();
    
    if(!dryrun)if(me==0){printf("O.K.\n"); fflush(stdout);}
    destroy_array((void **)ddst_put);
    destroy_array((void **)ddst_get);
    destroy_array((void **)dsrc);
}
Beispiel #21
0
/*
 * Latency micro-benchmark for one-element ARMCI gets.
 *
 * Every process allocates LOOP doubles through ARMCI_Malloc.  Process 0 then
 * fetches single doubles from process 1, first with blocking ARMCI_Get and
 * then with ARMCI_NbGet + ARMCI_Wait, and prints the average time per call
 * scaled by 1e6.  All other processes just sleep until the final barrier.
 *
 * Returns 0; fatal conditions are reported through ARMCI_Error.
 */
int main(int argc, char **argv) {
  int i;
  double **myptrs;
  double t0, t1, tnbget=0, tnbwait=0, t2=0;

  MP_INIT(argc,argv);
  ARMCI_Init();

  MP_PROCS(&nprocs);
  MP_MYID(&me);

  if (nprocs < 2)
    ARMCI_Error("This program requires at least two processes", 1);

  /* one slot per process; ARMCI_Malloc fills in every process's address */
  myptrs = (double **)malloc(sizeof(double *)*nprocs);
  if (myptrs == NULL)
    ARMCI_Error("malloc of pointer array failed", nprocs);
  if (ARMCI_Malloc((void **)myptrs, LOOP*sizeof(double)))
    ARMCI_Error("ARMCI_Malloc failed", LOOP);

  MP_BARRIER();

  if(me == 0) {
    /* Warm-up.  The argument order is (remote source, local destination);
     * the earlier version of this test had the two swapped.  The bound also
     * honours LOOP so the warm-up never leaves the allocated buffer. */
    for(i = 0; i < 10 && i < LOOP; i++) {
      ARMCI_Get(myptrs[me+1]+i, myptrs[me]+i, sizeof(double), me+1);
    }

    /* blocking get: time the whole loop */
    t0 = MP_TIMER();
    for(i = 0; i < LOOP; i++) {
      /* element i on both sides, matching the warm-up and the nb loop */
      ARMCI_Get(myptrs[me+1]+i, myptrs[me]+i, sizeof(double), me+1);
    }
    t1 = MP_TIMER();

    printf("\nGet Latency=%lf\n", 1e6*(t1-t0)/LOOP);
    fflush(stdout);

    t1 = t0 = 0;

    /* non-blocking get: issue time and wait time are accumulated separately */
    for(i = 0; i < LOOP; i++) {
      armci_hdl_t nbh;
      ARMCI_INIT_HANDLE(&nbh);

      t0 = MP_TIMER();
      ARMCI_NbGet(myptrs[me+1]+i, myptrs[me]+i, sizeof(double), me+1, &nbh);
      t1 = MP_TIMER();
      ARMCI_Wait(&nbh);
      t2 = MP_TIMER();

      tnbget  += (t1-t0);
      tnbwait += (t2-t1);
    }

    printf("\nNb Get Latency=%lf Nb Wait=%lf\n",1e6*tnbget/LOOP,1e6*tnbwait/LOOP);fflush(stdout);
  }

  else
    sleep(1);

  MP_BARRIER();

  /* the pointer table itself is plain heap memory owned by this function */
  free(myptrs);

  ARMCI_Finalize();
  MP_FINALIZE();

  return 0;
}
Beispiel #22
0
/*
 * Times blocking and non-blocking put, get and accumulate on contiguous
 * 1-D sections whose length doubles from 1 up to MAXELEMS doubles.
 *
 * For each section size:
 *   - process 0 puts to / gets from every other process (blocking, then
 *     non-blocking with the issue and wait times recorded separately);
 *   - every process accumulates its source buffer into &ddst[0][0] with
 *     process id 0 as the target (blocking, then non-blocking).
 * After each phase all processes synchronise, optionally verify the result
 * (VERIFY) and zero their destination buffer again.
 *
 * dry_run: when non-zero the banner, the timing rows and the final "O.K."
 *          are not printed; the communication itself is still performed.
 */
void test_perf_nb(int dry_run) {

    int i, j, loop, rc, bytes, elems[2] = {MAXPROC, MAXELEMS};
    int stride, ntimes;
    double stime, t1, t2, t3, t4, t5, t6, t7, t8, t9;
    double *dsrc[MAXPROC], scale=1.0;
    armci_hdl_t hdl_get, hdl_put, hdl_acc;

    create_array((void**)ddst, sizeof(double),2, elems);
    create_array((void**)dsrc, sizeof(double),1, &elems[1]);

    if(!dry_run)if(me == 0) {
      printf("\n\t\t\tRemote 1-D Array Section\n");
      printf("section    get      nbget    wait     put     nbput  ");
      printf("   wait     acc     nbacc     wait\n");
      printf("-------  -------- -------- -------- -------- --------");
      printf(" -------- -------- -------- --------\n");
      fflush(stdout);
    }

    for(loop=1; loop<=MAXELEMS; loop*=2) {

      elems[1] = loop;
      /* fewer repetitions for larger sections, never less than one */
      ntimes = (int)sqrt((double)(MAXELEMS/elems[1]));
      if(ntimes <1) ntimes=1;

      /* -------------------------- SETUP --------------------------- */
      /* initialize non-blocking handles, timers, src & dst buffers */
      ARMCI_INIT_HANDLE(&hdl_put);
      ARMCI_INIT_HANDLE(&hdl_get);
      ARMCI_INIT_HANDLE(&hdl_acc);
      t1 = t2 = t3 = t4 = t5 = t6 = t7 = t8 = t9 = 0.0;
      for(i=0; i<elems[1]; i++) dsrc[me][i]=i*1.001*(me+1);
      for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0;
      MP_BARRIER();

      /* bytes transferred per operation */
      bytes = sizeof(double)*elems[1];
      MP_BARRIER();

      /* -------------------------- PUT/GET -------------------------- */
      if(me == 0) {
        for(i=1; i<nproc; i++) {
          stime=MP_TIMER();
          for(j=0; j<ntimes; j++)
            if((rc=ARMCI_Put(&dsrc[me][0], &ddst[i][me*elems[1]], bytes,i)))
              ARMCI_Error("armci_put failed\n",rc);
          t1 += MP_TIMER()-stime;
        }
      }
      MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER();
      if(VERIFY) verify_results(PUT, elems);
      for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0;
      MP_BARRIER();

      if(me == 0) {
        for(i=1; i<nproc; i++) {
          stime=MP_TIMER();
          for(j=0; j<ntimes; j++)
            if((rc=ARMCI_Get(&dsrc[i][0], &ddst[me][i*elems[1]], bytes,i)))
              ARMCI_Error("armci_get failed\n",rc);
          t4 += MP_TIMER()-stime;
        }
      }
      MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER();
      if(VERIFY) verify_results(GET, elems);
      for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0;
      MP_BARRIER();

      /* ------------------------ nb PUT/GET ------------------------- */
      if(me == 0) {
        for(i=1; i<nproc; i++) {
          for(j=0; j<ntimes; j++) {
            stime=MP_TIMER();
            if((rc=ARMCI_NbPut(&dsrc[me][0], &ddst[i][me*elems[1]], bytes,
                               i, &hdl_put)))
              ARMCI_Error("armci_nbput failed\n",rc);
            t2 += MP_TIMER()-stime;     /* time to issue the request */
            stime=MP_TIMER();
            ARMCI_Wait(&hdl_put);
            t3 += MP_TIMER()-stime;     /* time spent waiting on it */
          }
        }
      }
      MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER();
      if(VERIFY) verify_results(PUT, elems);
      for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0;
      MP_BARRIER();

      if(me == 0) {
        for(i=1; i<nproc; i++) {
          for(j=0; j<ntimes; j++) {
            stime=MP_TIMER();
            if((rc=ARMCI_NbGet(&dsrc[i][0], &ddst[me][i*elems[1]], bytes,
                               i, &hdl_get)))
              ARMCI_Error("armci_nbget failed\n",rc);
            t5 += MP_TIMER()-stime;
            stime=MP_TIMER();
            ARMCI_Wait(&hdl_get);
            t6 += MP_TIMER()-stime;
          }
        }
      }
      MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER();
      if(VERIFY) verify_results(GET, elems);
      for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0;
      MP_BARRIER();


      /* ------------------------ Accumulate ------------------------- */
      /* executed by every process; the barriers inside the loop are
       * collective, so all processes must run the same ntimes */
      for(i=0; i<elems[1]; i++) dsrc[me][i]=1.0;
      MP_BARRIER();
      stride = elems[1]*sizeof(double); scale  = 1.0;
      for(j=0; j<ntimes; j++) {
        stime=MP_TIMER();
        if((rc=ARMCI_AccS(ARMCI_ACC_DBL, &scale, &dsrc[me][0], &stride,
                          &ddst[0][0], &stride, &bytes, 0, 0)))
          ARMCI_Error("armci_acc failed\n",rc);
        t7 += MP_TIMER()-stime;

        MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER();
        if(VERIFY) verify_results(ACC, elems);
        for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0;
        MP_BARRIER();
      }

#if 1
      /* NOTE(review): an older comment here said this section was disabled
       * and pointed to a note that is not present in this function; the
       * section is compiled in.  Set the guard to 0 to skip it. */
      /* ---------------------- nb-Accumulate ------------------------ */
      for(i=0; i<elems[1]; i++) dsrc[me][i]=1.0;
      MP_BARRIER();
      stride = elems[1]*sizeof(double); scale  = 1.0;
      for(j=0; j<ntimes; j++) {
        stime=MP_TIMER();
        if((rc=ARMCI_NbAccS(ARMCI_ACC_DBL, &scale, &dsrc[me][0], &stride,
                            &ddst[0][0], &stride, &bytes, 0, 0, &hdl_acc)))
          ARMCI_Error("armci_nbacc failed\n",rc);
        t8 += MP_TIMER()-stime;
        stime=MP_TIMER();
        ARMCI_Wait(&hdl_acc);
        t9 += MP_TIMER()-stime;

        MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER();
        if(VERIFY) verify_results(ACC, elems);
        for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0;
        MP_BARRIER();
      }
#endif

      /* print timings: column order follows the banner (get, nbget, wait,
       * put, nbput, wait, acc, nbacc, wait) */
      if(!dry_run) if(me==0) {
        printf("%d\t %.2e %.2e %.2e %.2e %.2e %.2e %.2e %.2e %.2e\n",
               bytes, t4/ntimes, t5/ntimes, t6/ntimes, t1/ntimes,
               t2/ntimes, t3/ntimes, t7/ntimes, t8/ntimes, t9/ntimes);
        fflush(stdout);
      }
    }

    ARMCI_AllFence();
    MP_BARRIER();

    if(!dry_run)if(me==0){printf("O.K.\n"); fflush(stdout);}
    destroy_array((void **)ddst);
    destroy_array((void **)dsrc);
}
Beispiel #23
0
void test_dim(int ndim)
{
	int dim,elems;
	int i,j, proc;
	/* double a[DIM4][DIM3][DIM2][DIM1], b[EDIM4][EDIM3][EDIM2][EDIM1];*/
        double *b;
        double *a, *a1, *a2, *c;
        int ridx;
        MPI_Datatype typeA, typeB;
        int rstrideB[MAXDIMS];
        int rcount[MAXDIMS];
        int pidx1 = -1, pidx2 = -1, pidx3 = -1;

	elems = 1;   
        strideA[0]=sizeof(double); 
        strideB[0]=sizeof(double);
	for(i=0;i<ndim;i++){
		strideA[i] *= dimsA[i];
		strideB[i] *= dimsB[i];
                if(i<ndim-1){
                     strideA[i+1] = strideA[i];
                     strideB[i+1] = strideB[i];
                }
		elems *= dimsA[i];
	}

        /* create shared and local arrays */
        create_safe_array((void**)&b, sizeof(double),ndim,dimsB);
        a1 = (double *)malloc(sizeof(double)*elems);
        assert(a1);
        a2 = (double *)malloc(sizeof(double)*elems);
        assert(a2);
        c = (double *)malloc(sizeof(double)*elems);
        assert(c);

	init(a1, ndim, elems, dimsA, me!=0, 0);
	init(a2, ndim, elems, dimsA, me!=0, 1);
	
	if(me==0){
            printf("--------array[%d",dimsA[0]);
	    for(dim=1;dim<ndim;dim++)printf(",%d",dimsA[dim]);
	    printf("]--------\n");
        }
        sleep(1);

        MP_BARRIER();
	for(i=0;i<LOOP;i++){
	    int idx1, idx2, idx3, ridx;
            MPI_Request request;
            if (i%2) {
               a = a2;
            } else {
               a = a1;
            }
	    get_range(ndim, dimsA, loA, hiA);
	    new_range(ndim, dimsB, loA, hiA, loB, hiB);
	    new_range(ndim, dimsA, loA, hiA, loC, hiC);

            proc=nproc-1-me;

            if(me==0){
	       print_range("local",ndim,loA, hiA,"-> ");
	       print_range("remote",ndim,loB, hiB,"-> ");
	       print_range("local",ndim,loC, hiC,"\n");
            }

	    idx1 = Index(ndim, loA, dimsA);
	    idx2 = Index(ndim, loB, dimsB);
	    idx3 = Index(ndim, loC, dimsA);

            MPI_Sendrecv(&idx2, 1, MPI_INT, proc, 666, &ridx, 1, MPI_INT, proc, 666, MPI_COMM_WORLD, MPI_STATUS_IGNORE);

	    for(j=0;j<ndim;j++)count[j]=hiA[j]-loA[j]+1;

	    count[0]   *= sizeof(double); /* convert range to bytes at stride level zero */

            Strided_to_dtype(strideA, count, ndim-1, MPI_BYTE, &typeA);
            MPI_Type_commit(&typeA);
            Strided_to_dtype(strideB, count, ndim-1, MPI_BYTE, &typeB);
            MPI_Type_commit(&typeB);

            MPI_Accumulate(a + idx1, 1, typeA, proc, (MPI_Aint)(idx2*sizeof(double)), 1, typeB, MPI_REPLACE, win);
            MP_FLUSH(proc);

            /* note that we do not need Fence here since
             * consectutive operations targeting the same process are ordered */

            MPI_Get_accumulate(NULL, 0, MPI_BYTE, c + idx3, 1, typeA, proc,
                         (MPI_Aint)(idx2*sizeof(double)), 1, typeB, MPI_NO_OP, win);
            MP_FLUSH(proc);

            
            compare_patches(0., ndim, a+idx1, loA, hiA, dimsA, c+idx3, loC, hiC, dimsA);

            pidx1 = idx1;
            pidx2 = idx2;
            pidx3 = idx3; 

            MPI_Type_free(&typeA);
            MPI_Type_free(&typeB);
        }

        free(c);
        destroy_safe_array();
        free(a);
}
Beispiel #24
0
/*
 * Reads (or generates) the sparse-matrix input of the CG test, allocates the
 * global arrays and computes the row distribution over processes.
 *
 * Command line, as consumed here:
 *   argv[1]  number of rows (na), 0 = take it from the input file
 *   argv[2]  number of non-zeros (nz), 0 = take it from the input file
 *   argv[3]  input file name, or a string starting with "random" to have
 *            process 0 generate "randominput.dat" and read that instead
 *
 * Process 0 owns the storage of every ARMCI_Malloc'ed array (all other
 * processes request 0 bytes) and performs all file I/O; na, nz and the
 * distribution tables are broadcast from it.
 *
 * Results are left in file-scope variables (amat, cidx, ridx, xvec, bvec,
 * rvec, dvec, svec, dmvec, qvec, axvec, myfirstrow, mylastrow).
 */
void read_and_create(int argc, char **argv)
{
int ri,i,nread;
int tmp1,idealelementsperproc;
void **amatptrs,**xvecptrs;

    /* argv[1..3] are dereferenced below */
    if(argc<4)
      ARMCI_Error("expected arguments: <na> <nz> <inputfile|random>",argc);

    na = atoi(argv[1]);
    nz = atoi(argv[2]);

    if(strncmp("random",argv[3],6)){
       /* a real input file was named */
       if(me==0){
         fd = fopen(argv[3], "r");
         if(fd==NULL)ARMCI_Error("unable to open given file",0);
       }
    }
    else{
       if(na==0 || nz==0){
         printf("\nERROR:exiting-no input file given and na or nz is 0");
         fflush(stdout);
         /* NOTE(review): the libraries are finalized here but control
          * returns to the caller - confirm the caller stops afterwards */
         ARMCI_Finalize();
         MP_FINALIZE();
         return;
       }
       if(me==0){
         generate_random_file(na,nz);
         fd = fopen("randominput.dat", "r");
         if(fd==NULL)ARMCI_Error("unable to open randominput.dat",0);
       }
    }
    if(me==0){
       /* sizes not given on the command line come from the file header */
       if(na==0){
         nread = fread(&na, sizeof(na), 1, fd);
         if(nread!=1)ARMCI_Error("unable to read number of rows",0);
       }
       if(nz==0){
         nread = fread(&nz, sizeof(nz), 1, fd);
         if(nread!=1)ARMCI_Error("unable to read number of non-zeros",0);
       }
       printf("\nReading CG input\n");
       printf("Number of rows: %d\n", na);
       printf("Number of non-zeros: %d\n", nz);
    }

    armci_msg_bcast(&nz,sizeof(int),0);
    armci_msg_bcast(&na,sizeof(int),0);
    MP_BARRIER();

    /* scratch tables that receive the per-process addresses */
    amatptrs = (void **)malloc(sizeof(void *)*nproc);
    xvecptrs = (void **)malloc(sizeof(void *)*nproc);
    if(xvecptrs==NULL || amatptrs==NULL)
      ARMCI_Error("xvecptrs amatptrs malloc failed",sizeof(void *)*nproc);

    if(ARMCI_Malloc(amatptrs,((me==0)?(sizeof(double)*nz):0)))
      ARMCI_Error("amat malloc failed",sizeof(double)*nz);
    amat = (double *)amatptrs[0];

    if(ARMCI_Malloc(amatptrs,((me==0)?(sizeof(int)*(nz+1)):0)))
      ARMCI_Error("icol malloc failed",sizeof(int)*(nz+1));
    cidx = (int *)amatptrs[0];

    ARMCI_Malloc(xvecptrs,((me==0)?(sizeof(int)*(na+1)):0)); /*+1 for end of last row*/
    ridx = (int *)xvecptrs[0];

    ARMCI_Malloc(xvecptrs,((me==0)?(sizeof(double)*(na+1)):0));
    xvec = (double *)xvecptrs[0];

    ARMCI_Malloc(xvecptrs,((me==0)?(sizeof(double)*(na+1)):0));
    bvec = (double *)xvecptrs[0];

    if(me==0){

      for (i = 0; i < na + 1; i++)
        xvec[i] = 0.0;

      /* NOTE(review): short reads are not treated as errors here because the
       * exact file layout cannot be confirmed from this function */
      nread = fread(amat, sizeof(double), nz, fd);
      nread = fread(ridx, sizeof(int), (na+1), fd);
      ridx[na]=nz;
      nread = fread(cidx, sizeof(int), (nz+1), fd);
      nread = fread(bvec, sizeof(double), (na+1), fd);
      (void)nread;

      /* the c adjustment: shift the stored indices down by one */
      for (i = 0; i < na; i++)
        ridx[i] -= 1;

      for (i = 0; i < nz; i++)
        cidx[i] -= 1;
    }

    MP_BARRIER();
    /*acg_matvecmul(amat,xvec,bvec,ridx,cidx);*/
    if(0){
    for(i=0;i<nz+1;i++)
      printf("\n%d:amat[%d]=%f icol[%d]=%d",me,i,amat[i],i,cidx[i]);
    for(i=0;i<na+1;i++)
      printf("\n%d:irow[%d]=%d bvec[%d]=%f",me,i,ridx[i],i,bvec[i]);
    }
    allfirstrow = (int *)malloc(sizeof(int)*nproc);
    alllastrow = (int *)malloc(sizeof(int)*nproc);
    columnmap = (int *)malloc(sizeof(int)*nproc);
    if(!allfirstrow || !alllastrow || !columnmap)
      ARMCI_Error("malloc failed allfirstrow ",0);
    MP_BARRIER();
    /*
     * next decide who works on which rows, this will decide the
     * distribution of a,d,r,q,x,and ax
     */
    /*create the mapping for all vectors, row matrix and column matrix*/
    if(me==0){
       idealelementsperproc = nz/nproc;
       tmp1=0;
       for(i=0;i<nproc;i++){
         int elementsperproc=0;
         allfirstrow[i]=tmp1;
         /* add rows until this process holds about nz/nproc non-zeros */
         for(ri=tmp1;ri<na;ri++,tmp1++){
           elementsperproc+=(ridx[ri+1]-ridx[ri]);
           if(elementsperproc>=idealelementsperproc){
             /* stop before or after row ri, whichever lands closer to the
              * ideal count */
             if((elementsperproc-idealelementsperproc) >
                idealelementsperproc-(elementsperproc-(ridx[ri+1]-ridx[ri]))){
               alllastrow[i] = ri-1;
               if((ri-1)<0)ARMCI_Error("run on a smaller processor count",0);
               /*tmp1--;*/
             }
             else{
               alllastrow[i] = ri;
               if(ri<0)ARMCI_Error("run on a smaller processor count",0);
               tmp1++;
             }
             elementsperproc=0;
             break;
           }
         }
       }
       alllastrow[nproc-1]=na-1;
       for(i=0;i<nproc;i++)columnmap[i]=ridx[allfirstrow[i]];
    }
    armci_msg_bcast(columnmap,nproc*sizeof(int),0);
    armci_msg_bcast(allfirstrow,nproc*sizeof(int),0);
    armci_msg_bcast(alllastrow,nproc*sizeof(int),0);
    myfirstrow = allfirstrow[me];
    mylastrow = alllastrow[me];
    if(me==0)for(i=0;i<nproc;i++){
      printf("\nDISTRIBUTION:first row of process\t%d is %d last row of process\t%d is %d",i,allfirstrow[i],i,alllastrow[i]);
    }
    /*
    for(i=myfirstrow;i<mylastrow;i++){
            xvec[i]=0.0;
    }
    */
    ARMCI_Malloc(xvecptrs,((me==0)?(sizeof(double)*na):0));
    rvec = (double *)xvecptrs[0];

    ARMCI_Malloc(xvecptrs,((me==0)?(sizeof(double)*na):0));
    dvec = (double *)xvecptrs[0];

    ARMCI_Malloc(xvecptrs,((me==0)?(sizeof(double)*na):0));
    svec = (double *)xvecptrs[0];

    ARMCI_Malloc(xvecptrs,((me==0)?(sizeof(double)*na):0));
    dmvec = (double *)xvecptrs[0];

    ARMCI_Malloc(xvecptrs,((me==0)?(sizeof(double)*na):0));
    qvec = (double *)xvecptrs[0];

    ARMCI_Malloc(xvecptrs,((me==0)?(sizeof(double)*na):0));
    axvec = (double *)xvecptrs[0];

    if(me==0)fclose(fd);

    /* the address tables are local scratch; entry 0 has been copied out */
    free(amatptrs);
    free(xvecptrs);

    /* NOTE(review): these three tables are not declared in this function;
     * they are released here, so later code must not read them - only
     * myfirstrow/mylastrow survive */
    free(allfirstrow);
    free(alllastrow);
    free(columnmap);
}
Beispiel #25
0
/*
 * Driver of the multi-threaded ARMCI test.
 *
 * Options:
 *   -t N   threads per process (1..MAX_TPP)
 *   -i N   number of iterations (>= 1)
 *   -s N   number of array elements (>= 1)
 *   -d N   seconds to sleep after printing the pid, before the test starts
 *   -h     print usage
 *
 * After option parsing the function seeds the random generator with a value
 * broadcast from process 0, builds the random pairing / target tables,
 * allocates the two pointer tables and runs thread_main in tpp threads (or
 * once, inline, when NOTHREADS is defined).
 *
 * Returns 0.
 */
int main(int argc, char *argv[])
{
    int ch;
    extern char *optarg;
    int i, r;
    thread_t threads[MAX_TPP];

    /* init MP */
    MP_INIT(argc,argv);
    MP_PROCS(&size);
    MP_MYID(&rank);

    while ((ch = getopt(argc, argv, "t:s:i:d:h")) != -1) {
        switch(ch) {
            case 't': /* # of threads */
                tpp = atoi(optarg);
                if (tpp < 1 || tpp > MAX_TPP) {
                    PRINTF0("\"%s\" is improper value for -t, should be a "
                            "number between 1 and %d(MAX_TPP)\n",
                            optarg, MAX_TPP);
                    usage();
                }
                break;
            case 'i': /* # of iterations */
                iters = atoi(optarg);
                if (iters < 1) {
                    PRINTF0("\"%s\" is improper value for -i, should be a "
                            "number equal or larger than 1\n", optarg);
                    usage();
                }
                break;
            case 's': /* # of elements in the array */
                asize = atoi(optarg);
                /* validate the value just parsed (not iters) */
                if (asize < 1) {
                    PRINTF0("\"%s\" is improper value for -s, should be a "
                            "number equal or larger than 1\n", optarg);
                    usage();
                }
                break;
            case 'd': delay = atoi(optarg); break; /* delay before start */
            case 'h': usage(); break; /* print usage info */
        }
    }
#ifdef NOTHREADS
    tpp = 1;
    PRINTF0("Warning: NOTHREADS debug symbol is set -- running w/o threads\n");
#endif
    th_size = size * tpp;
    /* sizeof yields size_t; cast so it matches the %d conversion */
    PRINTF0("\nTest of multi-threaded capabilities:\n"
            "%d threads per process (%d threads total),\n"
            "%d array elements of size %d,\n"
            "%d iteration(s)\n\n", tpp, th_size, asize, (int)sizeof(atype_t), iters);
    if (delay) {
        /* print the pid and pause before the test starts */
        printf("%d: %d\n", rank, getpid());
        fflush(stdout);
        sleep(delay);
        MP_BARRIER();
    }
    TH_INIT(size,tpp);
    for (i = 0; i < tpp; i++) th_rank[i] = rank * tpp + i;

#if defined(DEBUG) && defined(LOG2FILE)
    /* one log file per thread; three digits of the thread rank are patched
     * into the file name */
    for (i = 0; i < tpp; i++) {
        fname[10] = '0' + th_rank[i] / 100;
        fname[11] = '0' + th_rank[i] % 100 / 10;
        fname[12] = '0' + th_rank[i] % 10;
        dbg[i] = fopen(fname, "w");
    }
#endif
    for (i = 0; i < tpp; i++)
        prndbg(i, "proc %d, thread %d(%d):\n", rank, i, th_rank[i]);

    /* init ARMCI */
    ARMCI_Init();

    /* set global seed (to ensure same random sequence across procs) */
    time_seed = (unsigned)time(NULL);
    armci_msg_brdcst(&time_seed, sizeof(time_seed), 0);
    srand(time_seed); rand();
    prndbg(0, "seed = %u\n", time_seed);

    /* random pairs */
    pairs = calloc(th_size, sizeof(int));
    assert(pairs != NULL);
    for (i = 0; i < th_size; i++) pairs[i] = -1;
    /* NOTE(review): if th_size is odd the last unpaired thread has no free
     * partner and the while loop below cannot terminate - confirm that an
     * even total thread count is a documented requirement */
    for (i = 0; i < th_size; i++) {
        if (pairs[i] != -1) continue;
        r = RND(0, th_size);
        while (i == r || pairs[r] != -1 ) r = RND(0, th_size);
        pairs[i] = r; pairs[r] = i;
    }
    for (i = 0, cbufl = 0; i < th_size; i++)
        cbufl += sprintf(cbuf + cbufl, " %d->%d|%d->%d",
                         i, pairs[i], pairs[i], pairs[pairs[i]]);
    prndbg(0, "random pairs:%s\n", cbuf);

    /* random targets: redraw until a thread does not target itself */
    rnd_tgts = calloc(th_size, sizeof(int));
    assert(rnd_tgts != NULL);
    for (i = 0, cbufl = 0; i < th_size; i++) {
        rnd_tgts[i] = RND(0, th_size);
        if (rnd_tgts[i] == i) { i--; continue; }
        cbufl += sprintf(cbuf + cbufl, " %d", rnd_tgts[i]);
    }
    prndbg(0, "random targets:%s\n", cbuf);

    /* random one */
    rnd_one = RND(0, th_size);
    prndbg(0, "random one = %d\n", rnd_one);

    /* allocate outside assert() so the calls survive an NDEBUG build */
    ptrs1 = calloc(th_size, sizeof(void *));
    assert(ptrs1 != NULL);
    ptrs2 = calloc(th_size, sizeof(void *));
    assert(ptrs2 != NULL);
#ifdef NOTHREADS
    (void)threads;
    thread_main((void *)(long)0);
#else
    for (i = 0; i < tpp; i++) THREAD_CREATE(threads + i, thread_main, (void *)(long)i);
    for (i = 0; i < tpp; i++) THREAD_JOIN(threads[i], NULL);
#endif

    MP_BARRIER();
    PRINTF0("Tests Completed\n");

    /* clean up */
#if defined(DEBUG) && defined(LOG2FILE)
    for (i = 0; i < tpp; i++) fclose(dbg[i]);
#endif
    ARMCI_Finalize();
    TH_FINALIZE();
    MP_FINALIZE();

    return 0;
}
Beispiel #26
0
/*
 * Blocked LU factorization step loop, variant that pushes updated panel
 * blocks to their consumers with non-blocking puts.
 *
 * n  : matrix order
 * bs : block edge length
 * me : id of the calling process
 *
 * For every diagonal block K:
 *   1. the owner factors the diagonal block (lu0);
 *   2. owners of blocks in column K apply bdiv, owners of blocks in row K
 *      apply bmodd, each against the diagonal block, and put the result into
 *      the bufc / bufr slot of every other process owning a block in the same
 *      block row / column (one put per destination);
 *   3. after all puts completed, every owner updates its trailing blocks
 *      with bmod, reading remote operands from its own bufc / bufr slots.
 */
void lu(int n, int bs, int me)
{
  int k, K, kend, strK;       /* current diagonal block: offset, index, end, extent */
  int i, I, strI;             /* block row: offset, index, extent */
  int j, J, strJ;             /* block column: offset, index, extent */
  int m, destp;
  double *A, *B, *C, *D;
  double *dbuf;
  int saved[MAXPROC];         /* destinations already served for one block */

  /* NOTE(review): the result is not checked for NULL - confirm the allocator
   * aborts on failure */
  dbuf = (double *)ARMCI_Malloc_local((armci_size_t) block_size*block_size*sizeof(double));

  for (k = 0, K = 0; k < n; k += bs, K++) {
    /* the last block may be narrower than bs */
    kend = k + bs;
    if (kend > n)
      kend = n;
    strK = kend - k;

    /* factor diagonal block on its owner */
    if (block_owner(K, K) == me)
      lu0(a[K+K*nblocks], strK, strK);
    MP_BARRIER();

    /* every other process fetches the factored diagonal block */
    if (block_owner(K, K) == me) {
      D = a[K+K*nblocks];
    } else {
      D = dbuf;
      get_remote(D, K, K);
    }

    /* divide column K by diagonal block */
    for (i = kend, I = K+1; i < n; i += bs, I++) {
      if (block_owner(I, K) != me)
        continue;
      strI = (i + bs > n) ? n - i : bs;
      A = a[I+K*nblocks];
      bdiv(A, D, strI, strK, strI, strK);

      /* pre-put this block to the owners of the blocks on the I-th row */
      memset(saved, 0, sizeof(saved));
      for (m = K+1; m < nblocks; m++) {
        destp = block_owner(I, m);
        if (destp == me || saved[destp])
          continue;
        ARMCI_NbPut(A, bufc[destp*nblocks + I], strI*strK*sizeof(double), destp, NULL);
        saved[destp] = 1;
      }
    }

    /* modify row K by diagonal block */
    for (j = kend, J = K+1; j < n; j += bs, J++) {
      if (block_owner(K, J) != me)
        continue;
      strJ = (j + bs > n) ? n - j : bs;
      A = a[K+J*nblocks];
      bmodd(D, A, strK, strJ, strK, strK);

      /* pre-put this block to the owners of the blocks on the J-th column */
      memset(saved, 0, sizeof(saved));
      for (m = K+1; m < nblocks; m++) {
        destp = block_owner(m, J);
        if (destp == me || saved[destp])
          continue;
        ARMCI_NbPut(A, bufr[destp*nblocks + J], strK*strJ*sizeof(double), destp, NULL);
        saved[destp] = 1;
      }
    }

    /* all puts must have landed before anyone reads bufc / bufr */
    ARMCI_WaitAll();
    ARMCI_AllFence();
    MP_BARRIER();

    /* modify subsequent block columns */
    for (i = kend, I = K+1; i < n; i += bs, I++) {
      strI = (i + bs > n) ? n - i : bs;

      for (j = kend, J = K+1; j < n; j += bs, J++) {
        strJ = (j + bs > n) ? n - j : bs;
        if (block_owner(I, J) != me)
          continue;

        /* operands are either local blocks or the copies put to us */
        if (block_owner(I, K) == me)
          A = a[I+K*nblocks];
        else
          A = bufc[me*nblocks+I];

        if (block_owner(K, J) == me)
          B = a[K+J*nblocks];
        else
          B = bufr[me*nblocks + J];

        C = a[I+J*nblocks];
        bmod(A, B, C, strI, strJ, strK, strI, strK, strI);
      }
    }
  }
  ARMCI_Free_local(dbuf);
}
Beispiel #27
0
/*
 * Blocked LU factorization step loop, variant that fetches remote operand
 * blocks on demand with get_remote.
 *
 * n  : matrix order
 * bs : block edge length
 * me : id of the calling process
 *
 * For every diagonal block K: the owner factors it (lu0); owners of blocks
 * in column K apply bdiv and owners of blocks in row K apply bmodd against
 * the diagonal block; after a barrier every owner of a trailing block (I,J)
 * updates it with bmod using blocks (I,K) and (K,J), fetching whichever of
 * them is not local into a temporary buffer.
 *
 * Exits the process with status -1 if the temporary buffers cannot be
 * allocated.
 */
void lu(int n, int bs, int me)
{
    int i, il, j, jl, k, kl;
    int I, J, K;
    double *A, *B, *C, *D;
    int strI, strJ, strK;
    int diagowner;
    double *buf1, *buf2;

    /* temporary memories, each large enough for one full block */
    buf1 = (double *)malloc(block_size*block_size*sizeof(double));
    buf2 = (double *)malloc(block_size*block_size*sizeof(double));
    if (buf1 == NULL || buf2 == NULL) {
        /* these buffers are handed to get_remote; never pass it NULL */
        fprintf(stderr, "Could not malloc memory for lu buffers\n");
        free(buf1);
        free(buf2);
        exit(-1);
    }

    for (k=0, K=0; k<n; k+=bs, K++) {
        /* extent of block K; the last block may be narrower than bs */
        kl = k + bs;
        if (kl > n) {
            kl = n;
            strK = kl - k;
        } else {
            strK = bs;
        }

        /* factor diagonal block */
        diagowner = block_owner(K, K);
        if (diagowner == me) {
            A = a[K+K*nblocks];
            lu0(A, strK, strK);
        }
        MP_BARRIER();

        /* divide column k by diagonal block */
        if(block_owner(K, K) == me)
            D = a[K+K*nblocks];
        else {
            D = buf1;
            get_remote(D, K, K);
        }
        for (i=kl, I=K+1; i<n; i+=bs, I++) {
            if (block_owner(I, K) == me) {  /* parcel out blocks */
                il = i + bs;
                if (il > n) {
                    il = n;
                    strI = il - i;
                } else {
                    strI = bs;
                }
                A = a[I+K*nblocks];
                bdiv(A, D, strI, strK, strI, strK);
            }
        }

        /* modify row k by diagonal block */
        for (j=kl, J=K+1; j<n; j+=bs, J++) {
            if (block_owner(K, J) == me) {  /* parcel out blocks */
                jl = j+bs;
                if (jl > n) {
                    jl = n;
                    strJ = jl - j;
                } else {
                    strJ = bs;
                }
                A = a[K+J*nblocks];
                bmodd(D, A, strK, strJ, strK, strK);
            }
        }

        /* panel updates must be complete before they are fetched below */
        MP_BARRIER();

        /* modify subsequent block columns */
        for (i=kl, I=K+1; i<n; i+=bs, I++) {
            il = i+bs;
            if (il > n) {
                il = n;
                strI = il - i;
            } else {
                strI = bs;
            }

            /* D is no longer needed at this point, so buf1 is reused for
             * the column-panel operand */
            if(block_owner(I,K) == me)
                A = a[I+K*nblocks];
            else {
                A = buf1;
                get_remote(A, I, K);
            }
            for (j=kl, J=K+1; j<n; j+=bs, J++) {
                jl = j + bs;
                if (jl > n) {
                    jl = n;
                    strJ= jl - j;
                } else {
                    strJ = bs;
                }
                if (block_owner(I, J) == me) {  /* parcel out blocks */
                    if(block_owner(K,J) == me)
                        B = a[K+J*nblocks];
                    else {
                        B = buf2;
                        get_remote(B, K, J);
                    }
                    C = a[I+J*nblocks];
                    bmod(A, B, C, strI, strJ, strK, strI, strK, strI);
                }
            }
        }
    }

    free(buf1);
    free(buf2);
}
Beispiel #28
0
/*
 * Benchmarks strided (2-D) get, put and accumulate from process 0 to every
 * other process for each square chunk size in chunk[0..CHUNK_NUM-1].
 *
 * Every process allocates two SIZE x SIZE arrays of doubles through
 * ARMCI_Malloc; process 0 additionally owns a local SIZE x SIZE buffer
 * (ARMCI_Malloc_local when MALLOC_LOC is defined, malloc otherwise).
 * Process 0 accumulates the times returned by time_get / time_put /
 * time_acc over all targets and, unless CHECK_RESULT is set, prints one row
 * per chunk size.  The other processes sleep for three seconds meanwhile.
 */
void test_2D()
{
    int c;                      /* index into chunk[] */
    int target;                 /* remote process currently measured */
    int status;
    double *buf;
    void *ptr[MAXPROC], *get_ptr[MAXPROC];

    /* local work buffer, needed on process 0 only */
    if(me == 0) {
#ifdef MALLOC_LOC
        buf = (double *)ARMCI_Malloc_local(SIZE * SIZE * sizeof(double));
#else
        buf = (double *)malloc(SIZE * SIZE * sizeof(double));
#endif
        assert(buf != NULL);
    }

    status = ARMCI_Malloc(ptr, (SIZE * SIZE * sizeof(double)));
    assert(status == 0);
    assert(ptr[me]);
    status = ARMCI_Malloc(get_ptr, (SIZE * SIZE * sizeof(double)));
    assert(status == 0);
    assert(get_ptr[me]);

    /* ARMCI - initialize the data window */
    fill_array(ptr[me], SIZE*SIZE, me);
    fill_array(get_ptr[me], SIZE*SIZE, me);

    MP_BARRIER();

    if(me != 0) {
        /* only process 0 does the work */
        sleep(3);
    }
    else {
        /* print the title */
        if(!CHECK_RESULT){
           printf("  section               get                 put");
           printf("                 acc\n");
           printf("bytes   loop       sec      MB/s       sec      MB/s");
           printf("       sec      MB/s\n");
           printf("------- ------  --------  --------  --------  --------");
           printf("  --------  --------\n");
           fflush(stdout);
        }

        for(c = 0; c < CHUNK_NUM; c++) {
            int bytes = chunk[c] * chunk[c] * sizeof(double);
            int loop = SIZE / chunk[c];
            double t_get = 0, t_put = 0, t_acc = 0;
            double latency_get, latency_put, latency_acc;
            double bandwidth_get, bandwidth_put, bandwidth_acc;

            /* at least two repetitions per measurement */
            if(loop < 2) loop = 2;

            for(target = 1; target < nproc; target++) {
                /* strided get */
                fill_array(buf, SIZE*SIZE, me*10);
                t_get += time_get((double *)(get_ptr[target]), (double *)buf,
                                  chunk[c], loop, target, 1);

                /* strided put */
                fill_array(buf, SIZE*SIZE, me*10);
                t_put += time_put((double *)buf, (double *)(ptr[target]),
                                  chunk[c], loop, target, 1);

                /* strided acc */
                fill_array(buf, SIZE*SIZE, me*10);
                t_acc += time_acc((double *)buf, (double *)(ptr[target]),
                                  chunk[c], loop, target, 1);
            }

            /* average per target */
            latency_get = t_get/(nproc - 1);
            latency_put = t_put/(nproc - 1);
            latency_acc = t_acc/(nproc - 1);

            bandwidth_get = (bytes * (nproc - 1) * 1e-6)/t_get;
            bandwidth_put = (bytes * (nproc - 1) * 1e-6)/t_put;
            bandwidth_acc = (bytes * (nproc - 1) * 1e-6)/t_acc;

            /* print */
            if(!CHECK_RESULT)
                printf("%d\t%d\t%.2e  %.2e  %.2e  %.2e  %.2e  %.2e\n",
                       bytes, loop, latency_get, bandwidth_get,
                       latency_put, bandwidth_put, latency_acc, bandwidth_acc);
        }
    }

    ARMCI_AllFence();
    MP_BARRIER();

    /* cleanup */
    ARMCI_Free(get_ptr[me]);
    ARMCI_Free(ptr[me]);

    if(me == 0) {
#ifdef MALLOC_LOC
        ARMCI_Free_local(buf);
#else
        free(buf);
#endif
    }

}
Beispiel #29
0
/*
 * Driver of the blocked dense LU benchmark.
 *
 * Options: -n matrix order, -b block edge length, -p processor count,
 *          -h usage.
 *
 * The function derives the process grid (num_rows x num_cols) and the block
 * count, sizes and allocates each process's share of the matrix (MPI window
 * when MPI2_ONESIDED is defined, ARMCI_Malloc otherwise), builds the table
 * `a` of per-block pointers, runs lu() once as a cold start and then nloop
 * timed repetitions, and prints the average running and communication time.
 *
 * Returns 0 on normal completion.
 */
int main(int argc, char *argv[])
{
    int i, j;
    int ch;
    extern char *optarg;
    int edge;
    int size;
    int nloop=5;
    double **ptr_loc;

    MP_INIT(argc,argv);
    MP_PROCS(&nproc);
    MP_MYID(&me);

    while ((ch = getopt(argc, argv, "n:b:p:h")) != -1) {
        switch(ch) {
        case 'n':
            n = atoi(optarg);
            break;
        case 'b':
            block_size = atoi(optarg);
            break;
        case 'p':
            /* NOTE(review): this overrides the process count obtained from
             * MP_PROCS above - confirm a differing value is ever valid */
            nproc = atoi(optarg);
            break;
        case 'h': {
            printf("Usage: LU, or \n");
            printf("       LU -nMATRIXSIZE -bBLOCKSIZE -pNPROC\n");
            MP_BARRIER();
            MP_FINALIZE();
            exit(0);
        }
        default:
            break;
        }
    }

    if(me == 0) {
        printf("\n Blocked Dense LU Factorization\n");
        printf("     %d by %d Matrix\n", n, n);
        printf("     %d Processors\n", nproc);
        printf("     %d by %d Element Blocks\n", block_size, block_size);
        printf("\n");
    }

    /* largest num_rows <= sqrt(nproc) that divides nproc */
    num_rows = (int) sqrt((double) nproc);
    for (;;) {
        num_cols = nproc/num_rows;
        if (num_rows*num_cols == nproc)
            break;
        num_rows--;
    }

    /* number of blocks per dimension, rounding up */
    nblocks = n/block_size;
    if (block_size * nblocks != n) {
        nblocks++;
    }

    /* edge length of the last (possibly partial) block */
    edge = n%block_size;
    if (edge == 0) {
        edge = block_size;
    }

#ifdef DEBUG
    /* debug build: print the block-to-owner map and stop */
    if(me == 0)
        for (i=0; i<nblocks; i++) {
            for (j=0; j<nblocks; j++)
                printf("%d ", block_owner(i, j));
            printf("\n");
        }
    MP_BARRIER();
    MP_FINALIZE();
    exit(0);
#endif

    /* bytes of matrix storage owned by this process */
    for (i=0; i<nblocks; i++) {
        for (j=0; j<nblocks; j++) {
            if(block_owner(i,j) == me) {
                if ((i == nblocks-1) && (j == nblocks-1)) {
                    size = edge*edge;
                }
                else if ((i == nblocks-1) || (j == nblocks-1)) {
                    size = edge*block_size;
                }
                else {
                    size = block_size*block_size;
                }
                proc_bytes += size*sizeof(double);
            }
        }
    }

    ptr = (void **)malloc(nproc * sizeof(void *));
    if (ptr == NULL) {
        fprintf(stderr, "Could not malloc memory for ptr\n");
        exit(-1);
    }
#ifdef MPI2_ONESIDED
    MPI_Alloc_mem(proc_bytes, MPI_INFO_NULL, &ptr[me]);
    MPI_Win_create((void*)ptr[me], proc_bytes, 1, MPI_INFO_NULL,
                   MPI_COMM_WORLD, &win);
    for(i=0; i<nproc; i++) ptr[i] = (double *)ptr[me];
    MPI_Barrier(MPI_COMM_WORLD);

#else
    /* initialize ARMCI */
    ARMCI_Init();
    ARMCI_Malloc(ptr, proc_bytes);
#endif

    a = (double **)malloc(nblocks*nblocks*sizeof(double *));
    if (a == NULL) {
        fprintf(stderr, "Could not malloc memory for a\n");
        exit(-1);
    }
    ptr_loc = (double **)malloc(nproc*sizeof(double *));
    if (ptr_loc == NULL) {
        fprintf(stderr, "Could not malloc memory for ptr_loc\n");
        exit(-1);
    }

    /* hand out consecutive slices of each owner's storage to its blocks;
     * ptr_loc[p] is the next free position in process p's storage */
    for(i=0; i<nproc; i++) ptr_loc[i] = (double *)ptr[i];
    for(i=0; i<nblocks; i ++) {
        for(j=0; j<nblocks; j++) {
            a[i+j*nblocks] = ptr_loc[block_owner(i, j)];
            if ((i == nblocks-1) && (j == nblocks-1)) {
                size = edge*edge;
            } else if ((i == nblocks-1) || (j == nblocks-1)) {
                size = edge*block_size;
            } else {
                size = block_size*block_size;
            }
            ptr_loc[block_owner(i, j)] += size;
        }
    }
    /* the cursor table is only needed to build `a` */
    free(ptr_loc);

    /* initialize the array */
    init_array();

    /* barrier to ensure all initialization is done */
    MP_BARRIER();

    /* to remove cold-start misses, all processors touch their own data */
    touch_array(block_size, me);
    MP_BARRIER();

    if(doprint) {
        if(me == 0) {
            printf("Matrix before LU decomposition\n");
            print_array(me);
        }
        MP_BARRIER();
    }

    lu(n, block_size, me); /* cold start */

    /* Starting the timer */

    MP_BARRIER();
    if(me == 0) start_timer();
    for(i=0; i<nloop; i++) lu(n, block_size, me);
    MP_BARRIER();

    /* Timer Stops here */
    if(me == 0)
        printf("\nRunning time = %lf milliseconds.\n\n",  elapsed_time()/nloop);
    printf("%d: (ngets=%d) Communication (get) time = %e milliseconds\n", me, get_cntr, comm_time*1000/nloop);

    if(doprint) {
        if(me == 0) {
            printf("after LU\n");
            print_array(me);
        }
        MP_BARRIER();
    }

    /* done */
#ifdef MPI2_ONESIDED
    MPI_Win_free(&win);
    MPI_Free_mem(ptr[me]);
#else
    ARMCI_Free(ptr[me]);
    ARMCI_Finalize();
#endif
    MP_FINALIZE();

    return 0;
}
Beispiel #30
0
/*
 * Number of matrix elements held by block (i, j) of an nblk x nblk block
 * grid.  Blocks in the last block row/column use `edge_len` instead of
 * `bsize` for the corresponding dimension.
 */
static int lu_block_elems(int i, int j, int nblk, int bsize, int edge_len)
{
    int rows = (i == nblk-1) ? edge_len : bsize;
    int cols = (j == nblk-1) ? edge_len : bsize;

    return rows*cols;
}

/*
 * Driver for the blocked dense LU factorization (ARMCI version).
 *
 * Command line options:
 *   -n MATRIXSIZE   matrix dimension (stored in the global `n`)
 *   -b BLOCKSIZE    block dimension (stored in the global `block_size`)
 *   -p NPROC        overrides the process count reported by MP_PROCS
 *   -h              print usage and exit
 *
 * Steps: parse options, derive the block/process layout, compute how many
 * bytes this process owns, allocate that memory with ARMCI_Malloc, build
 * the table `a` of per-block start addresses, initialize and touch the
 * array, run and time lu(), then release everything.
 *
 * Returns 0 on success; exits with a non-zero status on invalid arguments
 * or allocation failure.
 */
int main(int argc, char *argv[])
{
    int i, j;
    int ch;
    extern char *optarg;
    int edge;
    int size;
    int owner;

    /* ARMCI */
    void **ptr;
    double **ptr_loc;

    MP_INIT(argc,argv);
    MP_PROCS(&nproc);
    MP_MYID(&me);

    while ((ch = getopt(argc, argv, "n:b:p:h")) != -1) {
        switch(ch) {
            case 'n': n = atoi(optarg); break;
            case 'b': block_size = atoi(optarg); break;
            /* NOTE(review): this replaces the value obtained from MP_PROCS;
             * the arrays below are sized with the overridden value --
             * confirm this is safe when it differs from the real count. */
            case 'p': nproc = atoi(optarg); break;
            case 'h': {
                printf("Usage: LU, or \n");
                printf("       LU -nMATRIXSIZE -bBLOCKSIZE -pNPROC\n");
                MP_BARRIER();
                MP_FINALIZE();
                exit(0);
            }
            default: break; /* unrecognized options are ignored */
        }
    }

    /* Reject values that would lead to a division by zero (block_size,
     * nproc -> nnodes) or to a meaningless layout below. */
    if (n < 1 || block_size < 1 || nproc < 1) {
        if(me == 0) {
            fprintf(stderr, "Invalid arguments: matrix size, block size and "
                            "process count must all be positive\n");
        }
        MP_BARRIER();
        MP_FINALIZE();
        exit(1);
    }

    if(me == 0) {
        printf("\n Blocked Dense LU Factorization\n");
        printf("     %d by %d Matrix\n", n, n);
        printf("     %d Processors\n", nproc);
        printf("     %d by %d Element Blocks\n", block_size, block_size);
        printf("\n");
    }

    /* number of blocks per dimension, rounded up */
    nblocks = n/block_size;
    if (block_size * nblocks != n) {
        nblocks++;
    }

    /* Processes are grouped by four: full groups form a 2x2 grid; when
     * nproc is not a multiple of four, one extra group is added and a
     * 1 x (nproc mod 4) grid is used instead. */
    nnodes = nproc / 4;
    if((nnodes * 4) != nproc) {
        num_cols = nproc - nnodes * 4;
        nnodes++;
        num_rows = 1;
    }
    else {
        num_cols = 2;
        num_rows = 2;
    }

    /* blocks per group, rounded up */
    num = (nblocks * nblocks)/nnodes;
    if((num * nnodes) != (nblocks * nblocks))
        num++;

#ifdef DEBUG
    if(me == 0)
        for (i=0;i<nblocks;i++) {
            for (j=0;j<nblocks;j++)
                printf("%d ", block_owner(i, j));
            printf("\n");
        }
    MP_BARRIER();
    MP_FINALIZE();
    exit(0);
#endif

    /* dimension of the blocks in the last block row/column */
    edge = n%block_size;
    if (edge == 0) {
        edge = block_size;
    }

    /* accumulate the number of bytes owned by this process */
    for (i=0;i<nblocks;i++) {
        for (j=0;j<nblocks;j++) {
            if(block_owner(i,j) == me) {
                size = lu_block_elems(i, j, nblocks, block_size, edge);
                proc_bytes += size*sizeof(double);
            }
        }
    }

    /* initialize ARMCI */
    ARMCI_Init();
    ptr = (void **)malloc(nproc * sizeof(void *));
    if (ptr == NULL) {
        fprintf(stderr, "Could not malloc memory for ptr\n");
        exit(-1);
    }
    if (ARMCI_Malloc(ptr, proc_bytes) != 0) {
        fprintf(stderr, "ARMCI_Malloc failed\n");
        exit(-1);
    }

    /* widen before multiplying so the block count is not squared in int */
    a = (double **)malloc((size_t)nblocks*nblocks*sizeof(double *));
    if (a == NULL) {
        fprintf(stderr, "Could not malloc memory for a\n");
        exit(-1);
    }
    ptr_loc = (double **)malloc(nproc*sizeof(double *));
    if (ptr_loc == NULL) {
        fprintf(stderr, "Could not malloc memory for ptr_loc\n");
        exit(-1);
    }

    /* ptr_loc[p] is a running cursor into process p's memory: each block
     * is assigned the current cursor of its owner, which is then advanced
     * past the block. */
    for(i=0; i<nproc; i++) ptr_loc[i] = (double *)ptr[i];
    for(i=0; i<nblocks;i ++) {
        for(j=0; j<nblocks; j++) {
            owner = block_owner(i, j);
            a[i+j*nblocks] = ptr_loc[owner];
            ptr_loc[owner] += lu_block_elems(i, j, nblocks, block_size, edge);
        }
    }
    /* the cursors are only needed while building `a` */
    free(ptr_loc);
    ptr_loc = NULL;

    /* initialize the array */
    init_array();

    /* barrier to ensure all initialization is done */
    MP_BARRIER();

    /* to remove cold-start misses, all processors touch their own data */
    touch_array(block_size, me);
    MP_BARRIER();

    if(doprint) {
        if(me == 0) {
            printf("Matrix before LU decomposition\n");
            print_array(me);
        }
        MP_BARRIER();
    }

    /* Starting the timer */
    if(me == 0) start_timer();

    lu(n, block_size, me);

    MP_BARRIER();

    /* Timer Stops here */
    if(me == 0)
        printf("\nRunning time = %lf milliseconds.\n\n",  elapsed_time());

    if(doprint) {
        if(me == 0) {
            printf("after LU\n");
            print_array(me);
        }
        MP_BARRIER();
    }

    /* done */
    ARMCI_Free(ptr[me]);
    free(ptr);
    ptr = NULL;
    ARMCI_Finalize();
    MP_FINALIZE();

    return 0;
}