Ejemplo n.º 1
0
void test_collective(const int datatype)
{
    char * op[7] = {"+", "*", "min", "max", "absmax", "absmin", "or"};
    int i = 0;
    int num_tests = 7;
    if(datatype == ARMCI_DOUBLE || datatype == ARMCI_FLOAT)
       num_tests = 6;
    
    /* test armci_msg_brdcst */
    test_brdcst(datatype);
    
    /* test armci_msg_gop2 */
    for(i = 0; i < num_tests; i++)
       test_gop2_or_reduce(datatype, op[i], 0);

    /* test armci_msg_reduce */
    for(i = 0; i < num_tests; i++)
       test_gop2_or_reduce(datatype, op[i], 1);

    MP_BARRIER();
    ARMCI_AllFence();
    MP_BARRIER();
    
    if(me==0){printf("O.K.\n\n"); fflush(stdout);}    
}
Ejemplo n.º 2
0
void test_groups() {
  
    int pid_listA[MAXPROC]  = {0,1,2};
    int pid_listB[MAXPROC] = {1,3};
    ARMCI_Group groupA, groupB;

    MP_BARRIER();

    ARMCI_Group_create(GNUM_A, pid_listA, &groupA); /* create group 1 */
    ARMCI_Group_create(GNUM_B, pid_listB, &groupB); /* create group 2 */


    /* ------------------------ GROUP A ------------------------- */ 
    if(chk_grp_membership(me, &groupA, pid_listA)) { /* group A */
      test_one_group(&groupA, pid_listA);
    }

    MP_BARRIER();
    
    /* ------------------------ GROUP B ------------------------- */ 
    if(chk_grp_membership(me, &groupB, pid_listB)) { /* group B */
      test_one_group(&groupB, pid_listB);
    }

    ARMCI_AllFence();
    MP_BARRIER();
    
    if(me==0){printf("O.K.\n"); fflush(stdout);}
}
Ejemplo n.º 3
0
int main(int argc, char* argv[])
{

    MP_INIT(argc, argv);
    MP_PROCS(&nproc);
    MP_MYID(&me);

/*    printf("nproc = %d, me = %d\n", nproc, me);*/
    
    if( (nproc<MINPROC || nproc>MAXPROC) && me==0)
       ARMCI_Error("Test works for up to %d processors\n",MAXPROC);

    if(me==0){
       printf("ARMCI test program (%d processes)\n",nproc); 
       fflush(stdout);
       sleep(1);
    }
    
    ARMCI_Init();

    if(me==0){
      printf("\n Testing ARMCI Groups!\n\n");
      fflush(stdout);
    }

    test_groups();
    
    ARMCI_AllFence();
    MP_BARRIER();
    if(me==0){printf("\n Collective groups: Success!!\n"); fflush(stdout);}
    sleep(2);

#ifdef ARMCI_GROUP
    test_groups_noncollective();

    ARMCI_AllFence();
    MP_BARRIER();
    if(me==0){printf("\n Non-collective groups: Success!!\n"); fflush(stdout);}
    sleep(2);
#endif
	
    MP_BARRIER();
    ARMCI_Finalize();
    MP_FINALIZE();
    return(0);
}
Ejemplo n.º 4
0
void DDI_ARMCI_Barrier(const DDI_Comm *comm) {
    if (comm == (const DDI_Comm *)Comm_find(DDI_COMM_WORLD)) {
	ARMCI_Barrier();
    }
    else {
	ARMCI_AllFence();
	MPI_Barrier(comm->compute_comm);
    }
}
Ejemplo n.º 5
0
void test_groups_noncollective() {
  int *pid_lists[MAX_GROUPS];
  int pids[MAXPROC];
  int i, nprocs, world_me;
  ARMCI_Group group;
  int *my_pid_list=NULL, my_grp_size=0;
  int ngrps;

  MP_BARRIER();
  MP_PROCS(&nprocs);
  MP_MYID(&world_me);

  random_permute(pids, nproc);

  ngrps = nprocs/GROUP_SIZE;
  
  for(i=0; i<nprocs/GROUP_SIZE; i++) {
    pid_lists[i] = pids + (i*GROUP_SIZE);
  }

  for(i=0; i<nprocs; i++) {
    if(pids[i] == world_me) {
      int grp_id = ARMCI_MIN(i/GROUP_SIZE, ngrps-1);
      my_pid_list = pid_lists[grp_id];
      if(grp_id == ngrps-1)
	my_grp_size =  GROUP_SIZE + (nprocs%GROUP_SIZE);
      else
	my_grp_size = GROUP_SIZE;
    }
  }

  qsort(my_pid_list, my_grp_size, sizeof(int), int_compare);
  
  MP_BARRIER();
  /*now create all these disjoint groups and test them in parallel*/
  
  ARMCI_Group_create(my_grp_size, my_pid_list, &group);

  test_one_group(&group, my_pid_list);

  ARMCI_Group_free(&group);

  ARMCI_AllFence();
  MP_BARRIER();
  
  if(world_me==0){printf("O.K.\n"); fflush(stdout);}
}
Ejemplo n.º 6
0
int main(int argc, char* argv[])
{
ARMCI_NetInit();

    MP_INIT(argc, argv);
    MP_PROCS(&nproc);
    MP_MYID(&me);

    if(nproc < 2 || nproc> MAXPROC) {
      if(me == 0)
	fprintf(stderr,
		"USAGE: 2 <= processes < %d - got %d\n", MAXPROC, nproc);
      MP_BARRIER();
      MP_FINALIZE();
      exit(0);
    }

    if(me==0){
       printf("ARMCI test program (%d processes)\n",nproc); 
       fflush(stdout);
       sleep(1);
    }
    
    ARMCI_Init();

    if(me==0){
      printf("\n put/get/acc requests (Time in secs)\n\n");
      fflush(stdout);
    }

    test_perf_nb(1);
    test_perf_nb(0);
    
    ARMCI_AllFence();
    MP_BARRIER();
    if(me==0){printf("\nSuccess!!\n"); fflush(stdout);}
    sleep(2);
	
    MP_BARRIER();
    ARMCI_Finalize();
    MP_FINALIZE();
    return(0);
}
int main(int argc, char* argv[])
{

    armci_msg_init(&argc, &argv);
    nproc = armci_msg_nproc();
    me = armci_msg_me();

    /*    printf("nproc = %d, me = %d\n", nproc, me);*/

    if(nproc>MAXPROC && me==0)
        ARMCI_Error("Test works for up to %d processors\n",MAXPROC);

    if(me==0) {
        printf("ARMCI test program (%d processes)\n",nproc);
        fflush(stdout);
        sleep(1);
    }

    ARMCI_Init();

    if(me==0) {
        printf("\n  Performing Sparse Matrix-Vector Multiplication ...\n\n");
        fflush(stdout);
    }
    test_sparse();

    ARMCI_AllFence();
    armci_msg_barrier();
    if(me==0) {
        printf("\nSuccess!!\n");
        fflush(stdout);
    }
    sleep(2);

    armci_msg_barrier();
    ARMCI_Finalize();
    armci_msg_finalize();
    return(0);
}
Ejemplo n.º 8
0
int main(int argc, char *argv[])
{
  ARMCI_Init_args(&argc, &argv);
  nproc = armci_msg_nproc();
  me = armci_msg_me();

  /*    printf("nproc = %d, me = %d\n", nproc, me);*/

  if (nproc > MAXPROC && me == 0) {
    ARMCI_Error("Test works for up to %d processors\n", MAXPROC);
  }

  if (me == 0) {
    printf("ARMCI test program (%d processes)\n", nproc);
    fflush(stdout);
    sleep(1);
  }

  if (me == 0) {
    printf("\nAggregate put/get requests\n\n");
    fflush(stdout);
  }
  test_aggregate(1); /* cold start */
  test_aggregate(0); /* warm start */

  ARMCI_AllFence();
  ARMCI_Barrier();
  if (me == 0) {
    printf("\nSuccess!!\n");
    fflush(stdout);
  }
  sleep(2);

  ARMCI_Barrier();
  ARMCI_Finalize();
  armci_msg_finalize();
  return(0);
}
Ejemplo n.º 9
0
int main(int argc, char* argv[])
{

    MP_INIT(argc, argv);
    MP_PROCS(&nproc);
    MP_MYID(&me);

/*    printf("nproc = %d, me = %d\n", nproc, me);*/
    
    if(nproc>MAXPROC && me==0)
       ARMCI_Error("Test works for up to %d processors\n",MAXPROC);

    if(me==0){
       printf("ARMCI test program (%d processes)\n",nproc); 
       fflush(stdout);
       sleep(1);
    }
    
    ARMCI_Init();

    if(me==0){
      printf("\nAggregate put/get requests\n\n");
      fflush(stdout);
    }
    test_aggregate(1); /* cold start */
    test_aggregate(0); /* warm start */
    
    ARMCI_AllFence();
    MP_BARRIER();
    if(me==0){printf("\nSuccess!!\n"); fflush(stdout);}
    sleep(2);
	
    MP_BARRIER();
    ARMCI_Finalize();
    MP_FINALIZE();
    return(0);
}
Ejemplo n.º 10
0
void TRANSPOSE1D() {
    
    int dims[1];
    int nelem, i, ierr, min, max, cmin, cmax, lmin, lmax, pmin, pmax;    
    int src_offset, dst_offset, length;
    int *buf, *map;
    void *src_ptr, *dst_ptr;
    void **a_ptr, **b_ptr;
    int *a, *b;

    /* Find local processor ID and number of processors */
    int me, nprocs;
    me     = armci_msg_me();
    nprocs = armci_msg_nproc();

    /* Allocate pointers to data on all processors */
    a_ptr = (void**)malloc(nprocs*sizeof(int*));
    b_ptr = (void**)malloc(nprocs*sizeof(int*));
    map = (int*)malloc(nprocs*sizeof(int));

    /* Configure array dimensions. Force an unequal data distribution */
    dims[0]  = nprocs*TOTALELEMS + nprocs/2;
    if (me == 0) printf("Size of array: %d\n\n",dims[0]);
    /* Find first (zero-based) index of chunk owned by each processor and
       store it in map array */
    for (i=0; i<nprocs; i++) {
      map[i] = (int)(((double)i)*(((double)dims[0])/((double)nprocs)));
    }

    /* Figure out what size my portion of array is */
    if (me<nprocs-1) {
      nelem = map[me+1]-map[me];
    } else {
      nelem = dims[0]-map[me];
    }

    /* Allocate memory for array A */
    ierr = ARMCI_Malloc(a_ptr, nelem*sizeof(int));
    assert(ierr == 0);
    assert(a_ptr[me]);

    /* Allocate memory for array B */
    ierr = ARMCI_Malloc(b_ptr, nelem*sizeof(int));
    assert(ierr == 0);
    assert(b_ptr[me]);
    
    /* initialize data in array A and zero data in array B */
    a = (int*)a_ptr[me];
    b = (int*)b_ptr[me];
    for (i=0; i<nelem; i++) {
      a[i] = i + map[me] + 1;
      b[i] = 0;
    }

    /* Synchronize all processors to guarantee that everyone has data
       before proceeding to the next step. */
    armci_msg_barrier();

    /* Create local buffer for performing inversion */
    buf = (int*)malloc(nelem*sizeof(int));

    /* Copy inverted data into local buffer */
    a = (int*)a_ptr[me];
    for (i=0; i<nelem; i++) {
      buf[i] = a[nelem-i-1]; 
    }

    /* Find out which blocks of array B inverted block should be copied to.
       Start by finding min and max indices of data in array B*/
    min = dims[0] - (map[me] + nelem);
    max = dims[0] - map[me] - 1;

    /* Locate processors containing the endpoints */
    pmin = 0;
    for (i=0; i<nprocs; i++) {
      if (min >= map[i]) {
        pmin = i;
      } else {
        break;
      }
    }
    pmax = nprocs-1;
    for (i=nprocs-2; i>=0; i--) {
      if (max < map[i+1]) {
        pmax = i;
      } else {
        break;
      }
    }

    /* Loop over processors that will receive data and copy inverted data to
       processors */
    for (i=pmin; i<=pmax; i++) {
      /* Find min and max indices owned by processor i */
      lmin = map[i];
      if (i<nprocs-1) {
        lmax = map[i+1]-1;
      } else {
        lmax = dims[0]-1;
      }

      /* Find min and max indices that should be sent to processor i */
      if (lmin > min) {
        cmin = lmin;
      } else {
        cmin = min;
      }
      if (lmax < max) {
        cmax = lmax;
      } else {
        cmax = max;
      }

      /* Find offsets on source and destination processors */
      src_offset = cmin - min;
      src_ptr = (void*)(buf + src_offset);
      dst_offset = cmin - lmin;
      dst_ptr = ((char*)b_ptr[i]) + sizeof(int)*dst_offset;
      
      /* Find length of data (in bytes) to be sent to processor i */
      length = sizeof(int)*(cmax-cmin+1);

      /* Send data to processor */
      ARMCI_Put(src_ptr, dst_ptr, length, i);
    }
    ARMCI_AllFence();
    armci_msg_barrier();
    
    free(buf);

    VERIFY(b_ptr, dims, map);

    free(map);
    armci_msg_barrier();
    ARMCI_Free(a_ptr[me]);
    ARMCI_Free(b_ptr[me]);
    free(a_ptr);
    free(b_ptr);
}
Ejemplo n.º 11
0
int main(int argc, char* argv[])
{
    int i;
    struct timeval start_time[14];
    struct timeval stop_time[14];
    /*
      char * test_name[14] = {
      "dim", "nbdim", "vec_small", "acc",
      "vector", "vector_acc", "fetch_add",
      "swap", "rput", "aggregate", "implicit",
      "memlock", "acc_type", "collective"
      };
      int test_flags[14] = {
      1, 1, 1, 1,
      1, 1, 1,
      1, 1, 0, 1,
      1, 1, 1
      };
    */
    char * test_name[2] = { "acc_type", "collective" };
    int test_flags[2]   = { 1, 1 };

#define TEST_ACC_TYPE   0
#define TEST_COLLECTIVE 1

    MP_INIT(argc, argv);
    ARMCI_Init();
    MP_PROCS(&nproc);
    MP_MYID(&me);

    if(nproc > MAXPROC && me == 0)
       ARMCI_Error("Test works for up to %d processors\n",MAXPROC);

    if(me == 0)
    {
       printf("ARMCI test program (%d processes)\n",nproc); 
       fflush(stdout);
       sleep(1);
    }    

    gettimeofday(&start_time[TEST_ACC_TYPE],NULL);
    if(test_flags[TEST_ACC_TYPE] == 1)
    {
       if(me == 0)
       {
          printf("\nTesting Accumulate Types\n");
          fflush(stdout);
       }
       
       MP_BARRIER();
       if(me == 0)
       {
          printf("Test Accumulate ARMCI_ACC_INT\n");
          fflush(stdout);
       }
       test_acc_type(ARMCI_ACC_INT);
       ARMCI_AllFence();
       MP_BARRIER();
       if(me == 0)
       {
          printf("Test Accumulate ARMCI_ACC_LNG\n");
          fflush(stdout);
       }
       test_acc_type(ARMCI_ACC_LNG);
       ARMCI_AllFence();
       MP_BARRIER();
       if(me == 0)
       {
          printf("Test Accumulate ARMCI_ACC_FLT\n");
          fflush(stdout);
       }
       test_acc_type(ARMCI_ACC_FLT);
       ARMCI_AllFence();
       MP_BARRIER();
       if(me == 0)
       {
          printf("Test Accumulate ARMCI_ACC_DBL\n");
          fflush(stdout);
       }
       test_acc_type(ARMCI_ACC_DBL);
       ARMCI_AllFence();
       MP_BARRIER();
       if(me == 0)
       {
          printf("Test Accumulate ARMCI_ACC_CPL\n");
          fflush(stdout);
       }
       test_acc_type(ARMCI_ACC_CPL);
       ARMCI_AllFence();
       MP_BARRIER();
       if(me == 0)
       {
          printf("Test Accumulate ARMCI_ACC_DCP\n");
          fflush(stdout);
       }
       test_acc_type(ARMCI_ACC_DCP);
       ARMCI_AllFence();
       MP_BARRIER();
    }
    gettimeofday(&stop_time[TEST_ACC_TYPE],NULL);

    gettimeofday(&start_time[TEST_COLLECTIVE],NULL);
    if(test_flags[TEST_COLLECTIVE] == 1)
    {
       if(me == 0)
       {
          printf("\nTesting Collective Types\n");
          fflush(stdout);
       }
       if(me == 0)
       {
          printf("Test Collective ARMCI_INT\n");
          fflush(stdout);
       }
       MP_BARRIER();
       test_collective(ARMCI_INT);
       MP_BARRIER();
       if(me == 0)
       {
          printf("Test Collective ARMCI_LONG\n");
          fflush(stdout);
       }
       MP_BARRIER();
       test_collective(ARMCI_LONG);
       MP_BARRIER();
       if(me == 0)
       {
          printf("Test Collective ARMCI_FLOAT\n");
          fflush(stdout);
       }
       MP_BARRIER();
       test_collective(ARMCI_FLOAT);
       MP_BARRIER();
       if(me == 0)
       {
          printf("Test Collective ARMCI_DOUBLE\n");
          fflush(stdout);
       }
       MP_BARRIER();
       test_collective(ARMCI_DOUBLE);
       MP_BARRIER();
    }
    gettimeofday(&stop_time[TEST_COLLECTIVE],NULL);
    
    if(me == 0)
    {
       printf("Accumulate and Collective tests passed\n");
       fflush(stdout);
    }

    if(me == 0)
    {
       printf("Testcase runtime\n");
       printf("Name,Time(seconds)\n");
       for(i = 0; i < 2; i++)
          if(test_flags[i] == 1)
          {
             double time_spent = (stop_time[i].tv_sec - start_time[i].tv_sec) + ((double) stop_time[i].tv_usec - start_time[i].tv_usec) / 1E6;
             printf("%s,%.6f\n", test_name[i], time_spent);
          }
    }

    MP_BARRIER();
    ARMCI_Finalize();
    MP_FINALIZE();
    return(0);
}
Ejemplo n.º 12
0
void test_acc_type(const int datatype)
{
    int i = 0;
    int datatype_size = 0;
    void * scale;
    void * a;
    void *b[MAXPROC];
    int elems = ELEMS;
    int dim = 1;
    int count = 0;
    int strideA = 0;
    int strideB = 0;

    switch(datatype)
    {
       case ARMCI_ACC_INT:
          datatype_size = sizeof(int);
          scale = malloc(datatype_size);
          *((int *) scale) = 1;
          a = malloc(elems * datatype_size);
          create_array((void**)b, datatype_size, dim, &elems);
          for(i = 0; i < elems; i++)
          {
             ((int *) a)[i] = i + me;
             ((int *) b[me])[i] = 0;
          }
          break;
       case ARMCI_ACC_LNG:
          datatype_size = sizeof(long);
          scale = malloc(datatype_size);
          *((long *) scale) = 1;
          a = malloc(elems * datatype_size);
          create_array((void**)b, datatype_size, dim, &elems);
          for(i = 0; i < elems; i++)
          {
             ((long *) a)[i] = i + me;
             ((long *) b[me])[i] = 0;
          }
          break;
       case ARMCI_ACC_FLT:
          datatype_size = sizeof(float);
          scale = malloc(datatype_size);
          *((float *) scale) = 1.0;
          a = malloc(elems * datatype_size);
          create_array((void**)b, datatype_size, dim, &elems);
          for(i = 0; i < elems; i++)
          {
             ((float *) a)[i] = (float) i + me;
             ((float *) b[me])[i] = 0.0;
          }
          break;
       case ARMCI_ACC_DBL:
          datatype_size = sizeof(double);
          scale = malloc(datatype_size);
          *((double *) scale) = 1.0;
          a = malloc(elems * datatype_size);
          create_array((void**)b, datatype_size, dim, &elems);
          for(i = 0; i < elems; i++)
          {
             ((double *) a)[i] = (double) i + me;
             ((double *) b[me])[i] = 0.0;
          }
          break;
       case ARMCI_ACC_CPL:
          datatype_size = sizeof(cmpl_t);
          scale = malloc(datatype_size);
          ((cmpl_t *) scale)->real = 2.0;
          ((cmpl_t *) scale)->imag = 1.0;
          a = malloc(elems * datatype_size);
          create_array((void**)b, datatype_size, dim, &elems);
          for(i = 0; i < elems; i++)
          {
             ((cmpl_t *) a)[i].real = ((float) i + me);
             ((cmpl_t *) a)[i].imag = ((float) i + me);
             ((cmpl_t *) b[me])[i].real = 0.0;
             ((cmpl_t *) b[me])[i].imag = 0.0;
          }
          break;
       case ARMCI_ACC_DCP:
          datatype_size = sizeof(dcmpl_t);
          scale = malloc(datatype_size);
          ((dcmpl_t *) scale)->real = 2.0;
          ((dcmpl_t *) scale)->imag = 1.0;
          a = malloc(elems * datatype_size);
          create_array((void**)b, datatype_size, dim, &elems);
          for(i = 0; i < elems; i++)
          {
             ((dcmpl_t *) a)[i].real = ((double) i + me);
             ((dcmpl_t *) a)[i].imag = ((double) i + me);
             ((dcmpl_t *) b[me])[i].real = 0.0;
             ((dcmpl_t *) b[me])[i].imag = 0.0;
          }
          break;
       default:
          return;
          break;
    }

    count = elems * datatype_size;
    strideA = elems * datatype_size;
    strideB = elems * datatype_size;

    ARMCI_AllFence();
    MP_BARRIER();

    for(i = 0; i < nproc; i++)
       ARMCI_AccS(datatype, scale, a, &strideA, b[(me + i) % nproc], &strideB, &count, 0, (me + i) % nproc);

    ARMCI_AllFence();
    MP_BARRIER();

    switch(datatype)
    {
       case ARMCI_ACC_INT:
          for(i = 0; i < elems; i++)
          {
             int compare = (i * nproc) + nproc / 2 * (nproc - 1);
             if(((int *)b[me])[i] != compare) 
             {
                printf("ERROR accumulate ARMCI_ACC_INT [%d] = %d != %d\n", i, ((int *)b[me])[i], compare);
                ARMCI_Error("test_acc_type failed\n",0);
             }
          }
          break;
       case ARMCI_ACC_LNG:
          for(i = 0; i < elems; i++)
          {
             long compare = (i * nproc) + nproc / 2 * (nproc - 1);
             if(((long *)b[me])[i] != compare) 
             {
                printf("ERROR accumulate ARMCI_ACC_LNG [%d] = %d != %ld\n", i, ((int *)b[me])[i], compare);
                ARMCI_Error("test_acc_type failed\n",0);
             }
          }
          break;
       case ARMCI_ACC_FLT:
          for(i = 0; i < elems; i++)
          {
             float compare = (float) ((i * nproc) + nproc / 2 * (nproc - 1));
             if(((float *)b[me])[i] != compare) 
             {
                printf("ERROR accumulate ARMCI_ACC_FLT [%d] = %f != %f\n", i, ((float *)b[me])[i], compare);
                ARMCI_Error("test_acc_type failed\n",0);
             }
          }
          break;
       case ARMCI_ACC_DBL:
          for(i = 0; i < elems; i++)
          {
             double compare = (double) ((i * nproc) + nproc / 2 * (nproc - 1));
             if(((double *)b[me])[i] != (double) ((i * nproc) + nproc / 2 * (nproc - 1))) 
             {
                printf("ERROR accumulate ARMCI_ACC_DBL [%d] = %f != %f \n", i, ((double *)b[me])[i], compare);
                ARMCI_Error("test_acc_type failed\n",0);
             }
          }
          break;
       case ARMCI_ACC_CPL:
          for(i = 0; i < elems; i++)
          {
             float compare = (float) ((i * nproc) + nproc / 2 * (nproc - 1));
             if(((cmpl_t *)b[me])[i].real != compare && ((cmpl_t *)b[me])[i].imag != 3 * compare) 
             {
                printf("ERROR accumulate ARMCI_ACC_CPL [%d] = %f + %fj != %f + %fj\n", i, ((cmpl_t *)b[me])[i].real, ((cmpl_t *)b[me])[i].imag, compare, 3 * compare);
                ARMCI_Error("test_acc_type failed\n",0);
             }
          }
          break;
       case ARMCI_ACC_DCP:
          for(i = 0; i < elems; i++)
          {
             double compare = (double) ((i * nproc) + nproc / 2 * (nproc - 1));
             if(((dcmpl_t *)b[me])[i].real != compare && ((dcmpl_t *)b[me])[i].imag != 3 * compare) 
             {
                printf("ERROR accumulate ARMCI_ACC_DCP [%d] = %f + %fj != %f + %fj\n", i, ((dcmpl_t *)b[me])[i].real, ((dcmpl_t *)b[me])[i].imag, compare, 3 * compare);
                ARMCI_Error("test_acc_type failed\n",0);
             }
          }
          break;
       default:
          break;
    }

    MP_BARRIER();
    ARMCI_AllFence();
    MP_BARRIER();
    
    if(me==0){printf("O.K.\n\n"); fflush(stdout);}    
    destroy_array((void**)b);
    free(a);
    free(scale);
}
Ejemplo n.º 13
0
void test_perf_nb(int dry_run) {
  
    int i, j, loop, rc, bytes, elems[2] = {MAXPROC, MAXELEMS};
    int stride, k=0, ntimes;
    double stime, t1, t2, t3, t4, t5, t6, t7, t8, t9;
    double *dsrc[MAXPROC], scale=1.0;
    armci_hdl_t hdl_get, hdl_put, hdl_acc;
        
    create_array((void**)ddst, sizeof(double),2, elems);
    create_array((void**)dsrc, sizeof(double),1, &elems[1]);

    if(!dry_run)if(me == 0) {
      printf("\n\t\t\tRemote 1-D Array Section\n");
      printf("section    get      nbget    wait     put     nbput  ");
      printf("   wait     acc     nbacc     wait\n");
      printf("-------  -------- -------- -------- -------- --------");
      printf(" -------- -------- -------- --------\n");
      fflush(stdout);
    }

    for(loop=1; loop<=MAXELEMS; loop*=2, k++) {

      elems[1] = loop;
      ntimes = (int)sqrt((double)(MAXELEMS/elems[1]));
      if(ntimes <1) ntimes=1;

      /* -------------------------- SETUP --------------------------- */
      /*initializing non-blocking handles,time,src & dst buffers*/
      ARMCI_INIT_HANDLE(&hdl_put);
      ARMCI_INIT_HANDLE(&hdl_get);
      ARMCI_INIT_HANDLE(&hdl_acc);
      t1 = t2 = t3 = t4 = t5 = t6 = t7 = t8 = t9 = 0.0;
      for(i=0; i<elems[1]; i++) dsrc[me][i]=i*1.001*(me+1);
      for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0;    
      MP_BARRIER();
      
      /* bytes transfered */
      bytes = sizeof(double)*elems[1]; 
      MP_BARRIER();
      
      /* -------------------------- PUT/GET -------------------------- */    
      if(me == 0) {
	for(i=1; i<nproc; i++) {
	  stime=MP_TIMER();
	  for(j=0; j<ntimes; j++)
	    if((rc=ARMCI_Put(&dsrc[me][0], &ddst[i][me*elems[1]], bytes,i)))
	      ARMCI_Error("armci_nbput failed\n",rc);
	  t1 += MP_TIMER()-stime;
	}
      }
      MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER();
      if(VERIFY) verify_results(PUT, elems);
      for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0;
      MP_BARRIER();
      
      if(me == 0) { 
	for(i=1; i<nproc; i++) {
	  stime=MP_TIMER();    
	  for(j=0; j<ntimes; j++)
	    if((rc=ARMCI_Get(&dsrc[i][0], &ddst[me][i*elems[1]], bytes,i)))
	      ARMCI_Error("armci_nbget failed\n",rc);
	  t4 += MP_TIMER()-stime;	
	}
      }    
      MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER();
      if(VERIFY) verify_results(GET, elems);
      for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0;
      MP_BARRIER();

      /* ------------------------ nb PUT/GET ------------------------- */    
      if(me == 0) {
	for(i=1; i<nproc; i++) {
	  for(j=0; j<ntimes; j++) {
	    stime=MP_TIMER();    
	    if((rc=ARMCI_NbPut(&dsrc[me][0], &ddst[i][me*elems[1]], bytes,
			       i, &hdl_put)))
	      ARMCI_Error("armci_nbput failed\n",rc);
	    t2 += MP_TIMER()-stime;	stime=MP_TIMER();
	    ARMCI_Wait(&hdl_put);
	    t3 += MP_TIMER()-stime;
	  } 
	}
      }
      MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER();
      if(VERIFY) verify_results(PUT, elems);
      for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0;
      MP_BARRIER();

      if(me == 0) {
	for(i=1; i<nproc; i++) {
	  for(j=0; j<ntimes; j++) {
	    stime=MP_TIMER();    
	    if((rc=ARMCI_NbGet(&dsrc[i][0], &ddst[me][i*elems[1]], bytes,
			       i, &hdl_get)))
	      ARMCI_Error("armci_nbget failed\n",rc);
	    t5 += MP_TIMER()-stime;	stime=MP_TIMER();
	    ARMCI_Wait(&hdl_get);
	    t6 += MP_TIMER()-stime;
	  }
	}
      }
      MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER();
      if(VERIFY) verify_results(GET, elems);
      for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0; 
      MP_BARRIER();


      /* ------------------------ Accumulate ------------------------- */    
      for(i=0; i<elems[1]; i++) dsrc[me][i]=1.0;  MP_BARRIER();
      stride = elems[1]*sizeof(double); scale  = 1.0;
      for(j=0; j<ntimes; j++) {
	stime=MP_TIMER();
	if((rc=ARMCI_AccS(ARMCI_ACC_DBL, &scale, &dsrc[me][0], &stride, 
			  &ddst[0][0], &stride, &bytes, 0, 0)))
	  ARMCI_Error("armci_acc failed\n",rc);
	t7 += MP_TIMER()-stime;
	
	MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER();
	if(VERIFY) verify_results(ACC, elems);
	for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0;
	MP_BARRIER();
      }

#if 1
      /* See the note below why this part is disabled */
      /* ---------------------- nb-Accumulate ------------------------ */    
      for(i=0; i<elems[1]; i++) dsrc[me][i]=1.0;  MP_BARRIER();
      stride = elems[1]*sizeof(double); scale  = 1.0;
      for(j=0; j<ntimes; j++) {
	stime=MP_TIMER();    
	if((rc=ARMCI_NbAccS(ARMCI_ACC_DBL, &scale, &dsrc[me][0], &stride, 
			    &ddst[0][0], &stride, &bytes, 0, 0, &hdl_acc)))
	  ARMCI_Error("armci_nbacc failed\n",rc);
	t8 += MP_TIMER()-stime; stime=MP_TIMER();
	ARMCI_Wait(&hdl_acc);
	t9 += MP_TIMER()-stime;
      
	MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER();
	if(VERIFY) verify_results(ACC, elems);
	for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0;
	MP_BARRIER();
      }
#endif

      /* print timings */
     if(!dry_run) if(me==0) printf("%d\t %.2e %.2e %.2e %.2e %.2e %.2e %.2e %.2e %.2e\n", 
		       bytes, t4/ntimes, t5/ntimes, t6/ntimes, t1/ntimes, 
		       t2/ntimes, t3/ntimes, t7/ntimes, t8/ntimes, t9/ntimes);
    }

    ARMCI_AllFence();
    MP_BARRIER();
    
    if(!dry_run)if(me==0){printf("O.K.\n"); fflush(stdout);}
    destroy_array((void **)ddst);
    destroy_array((void **)dsrc);
}
Ejemplo n.º 14
0
void test_aggregate(int dryrun) {
  
    int i, j, rc, bytes, elems[2] = {MAXPROC, MAXELEMS};
    double *ddst_put[MAXPROC];
    double *ddst_get[MAXPROC];
    double *dsrc[MAXPROC];
    armci_hdl_t aggr_hdl_put[MAXPROC];
    armci_hdl_t aggr_hdl_get[MAXPROC];
    armci_hdl_t hdl_put[MAXELEMS];
    armci_hdl_t hdl_get[MAXELEMS];
    armci_giov_t darr;
    void *src_ptr[MAX_REQUESTS], *dst_ptr[MAX_REQUESTS];
    int start = 0, end = 0;
    double start_time;
        
    create_array((void**)ddst_put, sizeof(double),2, elems);
    create_array((void**)ddst_get, sizeof(double),2, elems);
    create_array((void**)dsrc, sizeof(double),1, &elems[1]);
    
    for(i=0; i<elems[1]; i++) dsrc[me][i]=i*1.001*(me+1);
    for(i=0; i<elems[0]*elems[1]; i++) {
      ddst_put[me][i]=0.0;
      ddst_get[me][i]=0.0;
    }
    
    MP_BARRIER();

    /* only proc 0 does the work */
    if(me == 0) {
      if(!dryrun)printf("Transferring %d doubles (Not an array of %d doubles)\n", MAXELEMS, MAXELEMS);
      
      /* initializing non-blocking handles */
      for(i=0; i<elems[1]; i++) ARMCI_INIT_HANDLE(&hdl_put[i]);
      for(i=0; i<elems[1]; i++) ARMCI_INIT_HANDLE(&hdl_get[i]);
      
      /* aggregate handles */
      for(i=0; i<nproc; i++) ARMCI_INIT_HANDLE(&aggr_hdl_put[i]);
      for(i=0; i<nproc; i++) ARMCI_INIT_HANDLE(&aggr_hdl_get[i]);
      for(i=0; i<nproc; i++) ARMCI_SET_AGGREGATE_HANDLE(&aggr_hdl_put[i]);
      for(i=0; i<nproc; i++) ARMCI_SET_AGGREGATE_HANDLE(&aggr_hdl_get[i]);    
      
      bytes = sizeof(double);
      
      /* **************** PUT **************** */    
      /* register put */
      start_time=MP_TIMER();
      start = 0; end = elems[1]; 
      for(i=1; i<nproc; i++) {
	for(j=start; j<end; j++) {  
	  ARMCI_NbPutValueDouble(dsrc[me][j], &ddst_put[i][me*elems[1]+j], i, 
				 &hdl_put[j]);
	}
	for(j=start; j<end; j++) ARMCI_Wait(&hdl_put[j]);
      }
      if(!dryrun)printf("%d: Value Put time      = %.2es\n", me, MP_TIMER()-start_time);
 
      /* vector put */
      start_time=MP_TIMER();
      for(i=1; i<nproc; i++) {
	for(j=start; j<end; j++) {
	  src_ptr[j] = (void *)&dsrc[me][j];
	  dst_ptr[j] = (void *)&ddst_put[i][me*elems[1]+j];
	}
	darr.src_ptr_array = src_ptr;
	darr.dst_ptr_array = dst_ptr;
	darr.bytes = sizeof(double);
	darr.ptr_array_len = elems[1];
	if((rc=ARMCI_NbPutV(&darr, 1, i, &hdl_put[i])))
	  ARMCI_Error("armci_nbputv failed\n",rc);
      }
      for(i=1; i<nproc; i++) ARMCI_Wait(&hdl_put[i]);
      if(!dryrun)printf("%d: Vector Put time     = %.2es\n", me, MP_TIMER()-start_time);
      
      /* regular put */
      start_time=MP_TIMER();    
      for(i=1; i<nproc; i++) {
	for(j=start; j<end; j++) {  
	  if((rc=ARMCI_NbPut(&dsrc[me][j], &ddst_put[i][me*elems[1]+j], bytes,
			     i, &hdl_put[j])))
	    ARMCI_Error("armci_nbput failed\n",rc);
	}
	for(j=start; j<end; j++) ARMCI_Wait(&hdl_put[j]);
      }
      if(!dryrun)printf("%d: Regular Put time    = %.2es\n", me, MP_TIMER()-start_time);
      
      /* aggregate put */
      start_time=MP_TIMER();
      for(i=1; i<nproc; i++) {
	for(j=start; j<end; j++) {  
	  if((rc=ARMCI_NbPut(&dsrc[me][j], &ddst_put[i][me*elems[1]+j], bytes,
			     i,  &aggr_hdl_put[i])))
	    ARMCI_Error("armci_nbput failed\n",rc);
	}
      }
      for(i=1; i<nproc; i++) ARMCI_Wait(&aggr_hdl_put[i]);
      if(!dryrun)printf("%d: Aggregate Put time  = %.2es\n\n", me, MP_TIMER()-start_time);
      
      
      /* **************** GET **************** */    
      
      /* vector get */
      start_time=MP_TIMER();
      for(i=1; i<nproc; i++) {
	for(j=start; j<end; j++) {
	  src_ptr[j] = (void *)&dsrc[i][j];
	  dst_ptr[j] = (void *)&ddst_get[me][i*elems[1]+j];
	}
	darr.src_ptr_array = src_ptr;
	darr.dst_ptr_array = dst_ptr;
	darr.bytes = sizeof(double);
	darr.ptr_array_len = elems[1];
	if((rc=ARMCI_NbGetV(&darr, 1, i, &hdl_get[i])))
	  ARMCI_Error("armci_nbgetv failed\n",rc);
	ARMCI_Wait(&hdl_get[i]);
      }
      if(!dryrun)printf("%d: Vector Get time     = %.2es\n", me, MP_TIMER()-start_time);
      
      /* regular get */
      start_time=MP_TIMER();    
      for(i=1; i<nproc; i++) {
	for(j=start; j<end; j++) {  
	  if((rc=ARMCI_NbGet(&dsrc[i][j], &ddst_get[me][i*elems[1]+j], bytes,
			     i, &hdl_get[j])))
	    ARMCI_Error("armci_nbget failed\n",rc);
	}
	for(j=start; j<end; j++) ARMCI_Wait(&hdl_get[j]);
      }
      if(!dryrun)printf("%d: Regular Get time    = %.2es\n", me, MP_TIMER()-start_time);
      
      /* aggregate get */
      start_time=MP_TIMER();
      for(i=1; i<nproc; i++) {
	for(j=start; j<end; j++) {  
	  ARMCI_NbGet(&dsrc[i][j], &ddst_get[me][i*elems[1]+j], bytes,
		      i, &aggr_hdl_get[i]);
	}
      }
      for(i=1; i<nproc; i++) ARMCI_Wait(&aggr_hdl_get[i]);
      if(!dryrun)printf("%d: Aggregate Get time  = %.2es\n", me, MP_TIMER()-start_time);
    }

    MP_BARRIER();
    ARMCI_AllFence();
    MP_BARRIER();

    /* Verify */
    if(!(me==0))
      for(j=0; j<elems[1]; j++) {
	if( ARMCI_ABS(ddst_put[me][j]-j*1.001) > 0.1) {
	  ARMCI_Error("aggregate put failed...1", 0);
	}
      }
    MP_BARRIER();
    if(!dryrun)if(me==0) printf("\n  aggregate put ..O.K.\n"); fflush(stdout);

    if(me==0) {
      for(i=1; i<nproc; i++) {
	for(j=0; j<elems[1]; j++) {
	  if( ARMCI_ABS(ddst_get[me][i*elems[1]+j]-j*1.001*(i+1)) > 0.1) {
	    ARMCI_Error("aggregate get failed...1", 0);
	  }
	}
      }
    }
    MP_BARRIER();
    if(!dryrun)if(me==0) printf("  aggregate get ..O.K.\n"); fflush(stdout);


    ARMCI_AllFence();
    MP_BARRIER();
    
    if(!dryrun)if(me==0){printf("O.K.\n"); fflush(stdout);}
    destroy_array((void **)ddst_put);
    destroy_array((void **)ddst_get);
    destroy_array((void **)dsrc);
}
Ejemplo n.º 15
0
void test_2D()
{
    int i;
    int src, dst;
    int ierr;
    double *buf;
    void *ptr[MAXPROC], *get_ptr[MAXPROC];

    /* find who I am and the dst process */
    src = me;
    
#ifdef MALLOC_LOC
    if(me == 0) {
        buf = (double *)ARMCI_Malloc_local(SIZE * SIZE * sizeof(double));
        assert(buf != NULL);
    }
#else
    if(me == 0) {
        buf = (double *)malloc(SIZE * SIZE * sizeof(double));
        assert(buf != NULL);
    }
#endif

    ierr = ARMCI_Malloc(ptr, (SIZE * SIZE * sizeof(double)));
    assert(ierr == 0); assert(ptr[me]);
    ierr = ARMCI_Malloc(get_ptr, (SIZE * SIZE * sizeof(double)));
    assert(ierr == 0); assert(get_ptr[me]);
    
    /* ARMCI - initialize the data window */
    fill_array(ptr[me], SIZE*SIZE, me);
    fill_array(get_ptr[me], SIZE*SIZE, me);

    MP_BARRIER();
    
    /* only the proc 0 doest the work */
    /* print the title */
    if(me == 0) {
        if(!CHECK_RESULT){
           printf("  section               get                 put");
           printf("                 acc\n");
           printf("bytes   loop       sec      MB/s       sec      MB/s");
           printf("       sec      MB/s\n");
           printf("------- ------  --------  --------  --------  --------");
           printf("  --------  --------\n");
           fflush(stdout);
        }
        
        for(i=0; i<CHUNK_NUM; i++) {
            int loop;
            int bytes = chunk[i] * chunk[i] * sizeof(double);

            double t_get = 0, t_put = 0, t_acc = 0;
            double latency_get, latency_put, latency_acc;
            double bandwidth_get, bandwidth_put, bandwidth_acc;
            
            loop = SIZE / chunk[i];
            if(loop<2)loop=2;

            for(dst=1; dst<nproc; dst++) {
                /* strided get */
                fill_array(buf, SIZE*SIZE, me*10);
                t_get += time_get((double *)(get_ptr[dst]), (double *)buf,
                                 chunk[i], loop, dst, 1);
 
                /* strided put */
                fill_array(buf, SIZE*SIZE, me*10);
                t_put += time_put((double *)buf, (double *)(ptr[dst]),
                                 chunk[i], loop, dst, 1);
                
                /* strided acc */
                fill_array(buf, SIZE*SIZE, me*10);
                t_acc += time_acc((double *)buf, (double *)(ptr[dst]),
                                 chunk[i], loop, dst, 1);
            }
            
            latency_get = t_get/(nproc - 1);
            latency_put = t_put/(nproc - 1);
            latency_acc = t_acc/(nproc - 1);
            
            bandwidth_get = (bytes * (nproc - 1) * 1e-6)/t_get;
            bandwidth_put = (bytes * (nproc - 1) * 1e-6)/t_put;
            bandwidth_acc = (bytes * (nproc - 1) * 1e-6)/t_acc;

            /* print */
            if(!CHECK_RESULT)printf("%d\t%d\t%.2e  %.2e  %.2e  %.2e  %.2e  %.2e\n",
                       bytes, loop, latency_get, bandwidth_get,
                       latency_put, bandwidth_put, latency_acc, bandwidth_acc);
        }
    }
    else sleep(3);
    
    ARMCI_AllFence();
    MP_BARRIER();

    /* cleanup */
    ARMCI_Free(get_ptr[me]);
    ARMCI_Free(ptr[me]);

#ifdef MALLOC_LOC
    if(me == 0) ARMCI_Free_local(buf);
#else
    if(me == 0) free(buf);
#endif

}
Ejemplo n.º 16
0
void lu(int n, int bs, int me)
{
  int i, il, j, jl, k, kl;
  int I, J, K;
  double *A, *B, *C, *D;
  int dimI, dimJ, dimK;
  int strI, strJ, strK;
  unsigned int t1, t2, t3, t4, t11, t22;
  int diagowner, destp, hc, m;
  double *dbuf;
  armci_hdl_t handle[2*MAXPROC];
  int saved[MAXPROC];  
  
  dbuf = (double *)ARMCI_Malloc_local((armci_size_t) block_size*block_size*sizeof(double));

  for (k=0, K=0; k<n; k+=bs, K++) {
    kl = k + bs; 
    if (kl > n) {
      kl = n;
      strK = kl - k;
    } else {
      strK = bs;
    }
    
    /* factor diagonal block */
    diagowner = block_owner(K, K);
    if (diagowner == me) {
      A = a[K+K*nblocks]; 
      lu0(A, strK, strK); /* impl algo on this diag block */
    }
    MP_BARRIER(); 
    
    /* divide column k by diagonal block */
    if(block_owner(K, K) == me)
      D = a[K+K*nblocks];
    else {
      D = dbuf;
      get_remote(D, K, K);
    }
    
    for (i=kl, I=K+1; i<n; i+=bs, I++) {
      if (block_owner(I, K) == me) {  /* parcel out blocks */
	il = i + bs; 
	if (il > n) {
	  il = n;
	  strI = il - i;
	} else {
	  strI = bs;
	}
	A = a[I+K*nblocks]; 
	bdiv(A, D, strI, strK, strI, strK);
	
	/* Pre-put this block to the block-owners of all blocks on the I-th row with a non-blocking put*/
	memset (saved, 0, sizeof(saved));
	for (m = K+1; m < nblocks; m++) {
	    destp = block_owner (I, m);
	    if (destp != me && !saved[destp]) {
	      ARMCI_NbPut(A, bufc[destp*nblocks + I], strI*strK*sizeof(double), destp, NULL);
	      saved[destp] = 1;
	    }
	}
      }
    } /* end of for (i=k1, I=K+1...) */
    
    /* modify row k by diagonal block */
    for (j=kl, J=K+1; j<n; j+=bs, J++) {
      if (block_owner(K, J) == me) {  /* parcel out blocks */
	jl = j+bs; 
	if (jl > n) {
	  jl = n;
	  strJ = jl - j;
	} else {
	  strJ = bs;
	}
	A = a[K+J*nblocks];
	bmodd(D, A, strK, strJ, strK, strK);
     
	/* Pre-put this block to the block-owners of all blocks on the J-th column with a non-blocking put*/
        memset (saved, 0, sizeof(saved));
        for (m = K+1; m < nblocks; m++) {
	  destp = block_owner (m, J);
	  if (destp != me  && !saved[destp]) {
	    ARMCI_NbPut(A, bufr[destp*nblocks + J], strK*strJ*sizeof(double), destp, NULL);
	    saved[destp] = 1;
	  }
	}
      }      
    }
        
    ARMCI_WaitAll();
    ARMCI_AllFence();
    MP_BARRIER();
    /* modify subsequent block columns */
    
    for (i=kl, I=K+1; i<n; i+=bs, I++) {
      il = i+bs; 
      if (il > n) {
	il = n;
	strI = il - i;
      } else {
	strI = bs;
      }

      for (j=kl, J=K+1; j<n; j+=bs, J++) {
	jl = j + bs; 
	if (jl > n) {
	  jl = n;
	  strJ= jl - j;
	} else {
	  strJ = bs;
	  }
	if (block_owner(I, J) == me) {  /* parcel out blocks */
	  if(block_owner(I,K) == me)
	    A = a[I+K*nblocks];
	  else {
	    A = bufc[me*nblocks+I];
          }
	  
	  if(block_owner(K,J) == me)
	    B = a[K+J*nblocks];
	  else
	    B = bufr[me*nblocks + J];
	    
	  C = a[I+J*nblocks];
	  bmod(A, B, C, strI, strJ, strK, strI, strK, strI);
	}
      }
    }
  }
  ARMCI_Free_local(dbuf);
}
static int sparse_initialize(int *n, int *non_zero, int **row_ind,
                             int **col_ind, double **values, double **vec,
                             double **svec) {

    int i, j, rc, max, *row_ind_tmp=NULL, *tmp_indices=NULL;
    double *tmp_values=NULL;
    unsigned long len;
    FILE *fp=NULL;

    /* Broadcast order of matrix */
    if(me==0) {
        if((fp=fopen("Sparse-MPI/av41092.rua.data", "r")) == NULL)
            ARMCI_Error("Error: Input file not found", me);
        fortran_indexing = 1; /* This is 1 for Harwell-Boeing format matrices */
        fscanf(fp, "%d", n);
        if(*n%nproc)
            ARMCI_Error("# of rows is not divisible by # of processors", nproc);
        if(*n > ROW)
            ARMCI_Error("order is greater than defined variable ROW", ROW);
    }
    len = sizeof(int);
    armci_msg_brdcst(n, len, 0);

    /* Broad cast number of non_zeros */
    if(me==0) fscanf(fp, "%d", non_zero);
    armci_msg_brdcst(non_zero, len, 0);

    /* Broadcast row indices */
    len = (*n+1)*sizeof(int);
    row_ind_tmp = (int *)malloc(len);
    if(me==0)for(i=0; i<*n+1; i++) {
            fscanf(fp, "%d", &row_ind_tmp[i]);
            if(fortran_indexing) --row_ind_tmp[i];
        }
    armci_msg_brdcst(row_ind_tmp, len, 0);

    load_balance(*n, *non_zero, row_ind_tmp);

    /* find how much temporary storage is needed at the maximum */
    if(me==0) {
        for(max=-1,j=0; j<nproc; j++) if(max<proc_nz_list[j]) max=proc_nz_list[j];
        if(max<0) ARMCI_Error(" max cannot be negative", max);
    }

    /* Broadcast the maximum number of elements */
    len = sizeof(int);
    armci_msg_brdcst(&max, len, 0);

    /* create the Sparse MAtrix Array */
    if(me==0) printf("  Creating ValueArray (CompressedSparseMatrix) ...\n\n");
    create_array((void**)col_ind, sizeof(int), 1, &max);

    /* create the column subscript array */
    if(me==0) printf("  Creating Column Subscript Array ... \n\n");
    create_array((void**)values, sizeof(double), 1, &max);

    /* create the x-vector and the solution vector */
    if(me==0) printf("  Creating Vectors ... \n\n");
    create_array((void**)vec,  sizeof(double),1, &max);
    create_array((void**)svec, sizeof(double),1, &max);
    armci_msg_barrier();


    /* Process 0 distributes the column indices and non_zero values to
       respective processors*/
    if(me == 0) {
        tmp_indices = (int *)malloc(max*sizeof(int));
        tmp_values  = (double *)malloc(max*sizeof(double));

        for(j=0; j<nproc; j++) {
            for(i=0; i<proc_nz_list[j]; i++) {
                fscanf(fp, "%d", &tmp_indices[i]);
                if(fortran_indexing) --tmp_indices[i];
            }
            /* rc = fread(tmp_indices, sizeof(int), proc_nz_list[j], fp); */
            if((rc=ARMCI_Put(tmp_indices, col_ind[j], proc_nz_list[j]*sizeof(int), j)))
                ARMCI_Error("armci_nbput failed\n",rc);
        }
        for(j=0; j<nproc; j++) {
            for(i=0; i<proc_nz_list[j]; i++) fscanf(fp, "%lf", &tmp_values[i]);
            if((rc=ARMCI_Put(tmp_values, values[j], proc_nz_list[j]*sizeof(double), j)))
                ARMCI_Error("armci_nbput failed\n",rc);
        }
    }
    ARMCI_AllFence();
    armci_msg_barrier();
    ARMCI_AllFence();

    /* initializing x-vector */
    if(me==0) for(i=0; i<proc_nz_list[me]; i++) vec[me][i] = (i+1);
    else for(i=0; i<proc_nz_list[me]; i++) vec[me][i]=me*proc_nz_list[me-1]+(i+1);

#if 0
    if(me==0) {
        printf("max = %d\n", max);
        for(i=0; i<max; i++)  printf("%.1f ", values[me][i]);
        printf("\n");
    }
#endif

    *row_ind = row_ind_tmp;
    if(me==0) {
        free(tmp_indices);
        free(tmp_values);
        fclose(fp);
    }
    return 0;
}
Ejemplo n.º 18
0
int main(int argc, char **argv)
{
int i,peer,j;
cpu_set_t mycpuid,new_mask;
char str[CPU_SETSIZE];
int rrr;
char cid[8];
extern char * cpuset_to_cstr(cpu_set_t *mask, char *str);
extern int cstr_to_cpuset(cpu_set_t *mask, const char* str);
gpc_hdl_t nbh;
char rheader[100];
int hlen, rhlen, rhsize;
int rdsize;
int rem;
void *header=&rem;
int locval=0;
void *loc=&locval;
int right;

    MPI_Init(&argc,&argv);
    MPI_Comm_rank(MPI_COMM_WORLD,&me);
    MPI_Comm_size(MPI_COMM_WORLD,&nprocs);
    if(nprocs<2){
      printf("\ncan run only on >=2 procs\n");
      MPI_Finalize();
      exit(1);
    }     
    right = (me+1)%nprocs; 
    hlen=sizeof(header);
    bzero(rheader,100);
    rhlen = hlen;

    ARMCI_Init();
    accloop=atoi(argv[1]);
    rem=accloop;
    myptrs = (char **)malloc(sizeof(char *)*nprocs);
    ARMCI_Malloc((void **)myptrs,size);

    MPI_Barrier(MPI_COMM_WORLD);

    gpcwork_memcpy = ARMCI_Gpc_register(gpc_work_handler_memcpy);
    gpcwork_ddot =ARMCI_Gpc_register(gpc_work_handler_ddot);
    gpcwork_daxpy = ARMCI_Gpc_register(gpc_work_handler_daxpy);
    gpcwork_dgemm = ARMCI_Gpc_register(gpc_work_handler_dgemm);
    MPI_Barrier(MPI_COMM_WORLD);
    MPI_Barrier(MPI_COMM_WORLD);

    ARMCI_Gpc_init_handle(&nbh);
    if(ARMCI_Gpc_exec(gpcwork_memcpy, right, &header, hlen, loc, sizeof(int), 
                            rheader, rhlen,loc, sizeof(int), &nbh))
       fprintf(stderr,"ARMCI_Gpc_exec failed\n");
    {
      int m,n,k;
      char notr='n';
      DoubleComplex ZERO;
      usleep(100);
      ZERO.real=0.;ZERO.imag=0.;
      m=n=k=DGS;
      t0=MPI_Wtime();
#ifdef DGEMM_WORK
      for(j=0;j<4*15;j++){
         c_alpha=c_alpha+j*rand();
         dgemm_(&notr,&notr,&m,&n,&k,&c_alpha,c_dga,&m,c_dgb,&n,&ZERO,c_dgc,&k,1,1);
      }
#elif IUNIT_WORK 
      for(j=0;j<2*LOOP*100;j++){
        for(i=0;i<LOOP*100;i++){
          tmpbuf1[i]=tmpbuf1[i]*1.1214+i/tmpbuf1[j/2];
        }
      }
#elif DAXPY_WORK
      for(j=0;j<tmp_loop*80;j++){
        alpha=alpha+j*rand();
        daxpy_(&N,&alpha,tmpbuf1,&ONE,tmpbuf2,&ONE);
      }
#endif

      t1=MPI_Wtime()-t0;
      printf("\n%d:Compute_During_memcpy %d %f\n",me,accloop,t1);
    }

    ARMCI_Gpc_wait(&nbh);
    
    MPI_Barrier(MPI_COMM_WORLD);
    MPI_Barrier(MPI_COMM_WORLD);

    ARMCI_Gpc_init_handle(&nbh);
    if(ARMCI_Gpc_exec(gpcwork_ddot, right, &header, hlen, loc, sizeof(int), 
                            rheader, rhlen,loc, sizeof(int), &nbh))
       fprintf(stderr,"ARMCI_Gpc_exec failed\n");
    {
      int m,n,k;
      char notr='n';
      DoubleComplex ZERO;
      usleep(100);
      ZERO.real=0.;ZERO.imag=0.;
      m=n=k=DGS;
      t0=MPI_Wtime();
#ifdef DGEMM_WORK
      for(j=0;j<4*15;j++){
         c_alpha=c_alpha+j*rand();
         dgemm_(&notr,&notr,&m,&n,&k,&c_alpha,c_dga,&m,c_dgb,&n,&ZERO,c_dgc,&k,1,1);
      }
#elif IUNIT_WORK 
      for(j=0;j<2*LOOP*100;j++){
        for(i=0;i<LOOP*100;i++){
          tmpbuf1[i]=tmpbuf1[i]*1.1214+i/tmpbuf1[j/2];
        }
      }
#elif DAXPY_WORK
      for(j=0;j<tmp_loop*80;j++){
        alpha=alpha+j*rand();
        daxpy_(&N,&alpha,tmpbuf1,&ONE,tmpbuf2,&ONE);
      }
#endif
      t1=MPI_Wtime()-t0;
      printf("\n%d:Compute_During_Ddot %d %f\n",me,accloop,t1);
    }
    ARMCI_Gpc_wait(&nbh);

    MPI_Barrier(MPI_COMM_WORLD);
    MPI_Barrier(MPI_COMM_WORLD);

    ARMCI_Gpc_init_handle(&nbh);
    if(ARMCI_Gpc_exec(gpcwork_daxpy, right, &header, hlen, loc, sizeof(int), 
                            rheader, rhlen,loc, sizeof(int), &nbh))
       fprintf(stderr,"ARMCI_Gpc_exec failed\n");
    {
      int m,n,k;
      char notr='n';
      DoubleComplex ZERO;
      usleep(100);
      ZERO.real=0.;ZERO.imag=0.;
      m=n=k=DGS;
      t0=MPI_Wtime();
#ifdef DGEMM_WORK
      for(j=0;j<4*15;j++){
         c_alpha=c_alpha+j*rand();
         dgemm_(&notr,&notr,&m,&n,&k,&c_alpha,c_dga,&m,c_dgb,&n,&ZERO,c_dgc,&k,1,1);
      }
#elif IUNIT_WORK 
      for(j=0;j<2*LOOP*100;j++){
        for(i=0;i<LOOP*100;i++){
          tmpbuf1[i]=tmpbuf1[i]*1.1214+i/tmpbuf1[j/2];
        }
      }
#elif DAXPY_WORK
      for(j=0;j<tmp_loop*80;j++){
        alpha=alpha+j*rand();
        daxpy_(&N,&alpha,tmpbuf1,&ONE,tmpbuf2,&ONE);
      }
#endif
      t1=MPI_Wtime()-t0;
      printf("\n%d:Compute_During_Daxpy %d %f\n",me,accloop,t1);
    }
    ARMCI_Gpc_wait(&nbh);

    MPI_Barrier(MPI_COMM_WORLD);

    ARMCI_Gpc_init_handle(&nbh);
    if(ARMCI_Gpc_exec(gpcwork_dgemm, right, &header, hlen, loc, sizeof(int), 
                            rheader, rhlen,loc, sizeof(int), &nbh))
       fprintf(stderr,"ARMCI_Gpc_exec failed\n");
    {
      int m,n,k;
      char notr='n';
      DoubleComplex ZERO;
      usleep(100);
      ZERO.real=0.;ZERO.imag=0.;
      m=n=k=DGS;
      t0=MPI_Wtime();
#ifdef DGEMM_WORK
      for(j=0;j<4*15;j++){
         c_alpha=c_alpha+j*rand();
         dgemm_(&notr,&notr,&m,&n,&k,&c_alpha,c_dga,&m,c_dgb,&n,&ZERO,c_dgc,&k,1,1);
      }
#elif IUNIT_WORK 
      for(j=0;j<2*LOOP*100;j++){
        for(i=0;i<LOOP*100;i++){
          tmpbuf1[i]=tmpbuf1[i]*1.1214+i/tmpbuf1[j/2];
        }
      }
#elif DAXPY_WORK
      for(j=0;j<tmp_loop*80;j++){
        alpha=alpha+j*rand();
        daxpy_(&N,&alpha,tmpbuf1,&ONE,tmpbuf2,&ONE);
      }
#endif
      t1=MPI_Wtime()-t0;
      printf("\n%d:Compute_During_Dgemm %d %f\n",me,accloop,t1);
    }
    ARMCI_Gpc_wait(&nbh);

    MPI_Barrier(MPI_COMM_WORLD);

    ARMCI_AllFence();

    ARMCI_Finalize();
    MPI_Finalize();
}