示例#1
0
文件: simple.c 项目: arnolda/scafacos
int main(int argc, char **argv)
{
int k,i;
double **myptrs[10];
double t0,t1,tget=0,tnbget=0,tput=0,tnbput=0,tnbwait=0,t2=0;
#if PORTALS
    ARMCI_NetInit();
#endif
    MPI_Init(&argc,&argv);
    MPI_Comm_rank(MPI_COMM_WORLD,&me);
    MPI_Comm_size(MPI_COMM_WORLD,&nprocs);
    ARMCI_Init();
    ARMCI_Init();
    for(k=0;k<10;k++){
      myptrs[k] = (double **)malloc(sizeof(double *)*nprocs);
      ARMCI_Malloc((void **)myptrs[k],400000*LOOP*sizeof(double)); 
      for(i=0;i<LOOP;i++)myptrs[k][me][i]=me+0.414;
      MPI_Barrier(MPI_COMM_WORLD);
      for(i=0;i<LOOP;i++){
        ARMCI_Get(myptrs[k][(me+1)%nprocs]+i,myptrs[k][me]+i,sizeof(double),(me+1)%nprocs);
        /*if(myptrs[k][me][i]!=0.414+(me+1)%nprocs)ARMCI_Error("errr",myptrs[k][me][i]);*/
      }
      t0=t1=tget=tnbget=tput=tnbput=tnbwait=t2=0;
      t0 = MPI_Wtime(); 
      for(i=0;i<LOOP;i++){
        ARMCI_Get(myptrs[k][(me+1)%nprocs]+i,myptrs[k][me]+i,sizeof(double),(me+1)%nprocs);
      }
      t1 = MPI_Wtime(); 
      printf("\nGet Latency=%lf\n",1e6*(t1-t0)/LOOP);fflush(stdout);
      t1=t0=0;
      for(i=0;i<LOOP;i++){
        armci_hdl_t nbh;
        ARMCI_INIT_HANDLE(&nbh);
        t0 = MPI_Wtime(); 
        ARMCI_NbGet(myptrs[k][(me+1)%nprocs]+i,myptrs[k][me]+i,sizeof(double),(me+1)%nprocs,&nbh);
        t1 = MPI_Wtime(); 
        ARMCI_Wait(&nbh);
        t2 = MPI_Wtime();
        tnbget+=(t1-t0);
        tnbwait+=(t2-t1);
      }
      printf("\nNb Get Latency=%lf Nb Wait=%lf\n",1e6*tnbget/LOOP,1e6*tnbwait/LOOP);fflush(stdout);
      MPI_Barrier(MPI_COMM_WORLD);
    }
    for(k=0;k<10;k++)ARMCI_Free(myptrs[k][me]);
    MPI_Barrier(MPI_COMM_WORLD);
    ARMCI_Finalize();
    ARMCI_Finalize();
    MPI_Finalize();
    
}
示例#2
0
int main(int argc, char* argv[])
{
    int ndim;

    MP_INIT(argc, argv);
    MP_PROCS(&nproc);
    MP_MYID(&me);

    if(me==0){
       printf("ARMCI test program for lock(%d processes)\n",nproc); 
       fflush(stdout);
       sleep(1);
    }
    
    ARMCI_Init();

    test_lock();

    MP_BARRIER();
    if(me==0){printf("test passed\n"); fflush(stdout);}
    sleep(2);

    MP_BARRIER();
    ARMCI_Finalize();
    MP_FINALIZE();
    return(0);
}
示例#3
0
int main( int argc, char **argv)
{   
    
  MP_INIT(argc,argv);
  MP_MYID(&me);
  MP_PROCS(&nproc);
    
    if(nproc < 2) {
        if(me == 0)
            fprintf(stderr,
                    "USAGE: 2 <= processes < %d\n", nproc);
        MP_BARRIER();
        MP_FINALIZE();
        exit(0);
    }

    if(me == 0){
       printf("Test of ARMCI Wrappers to Basic Message Passing Operations\n");
       fflush(stdout);
    }
   
    /* initialize ARMCI */
    ARMCI_Init();

    MP_BARRIER();
   
    TestGlobals();
   
    /* done */
    ARMCI_Finalize();
    MP_FINALIZE();
    return(0);
}   
示例#4
0
int main(int argc, char **argv) {
    /* int heap=300000, stack=300000; */
    int me, nprocs;
    
    /* Step1: Initialize Message Passing library */
    armci_msg_init(&argc, &argv);

    /* Step2: Initialize ARMCI */
    ARMCI_Init();
    
    /* Step3: Initialize Memory Allocator (MA) */
    /*bjp
    if(! MA_init(C_DBL, stack, heap) ) ARMCI_Error("MA_init failed",stack+heap);
    */

    me     = armci_msg_me();
    nprocs = armci_msg_nproc();
    if(me==0) {
       printf("\nUsing %d processes\n\n", nprocs); fflush(stdout);
    }
    
       
    TRANSPOSE1D();
    
    if(me==0)printf("\nTerminating ..\n");
    ARMCI_Finalize();
    
    armci_msg_finalize();    
    return(0);
}
示例#5
0
int main(int argc, char *argv[])
{
  int ndim;

  armci_msg_init(&argc, &argv);
  ARMCI_Init_args(&argc, &argv);
  nproc = armci_msg_nproc();
  me = armci_msg_me();

  ARMCI_Barrier();
  if (me == 0) {
    printf("\nTesting armci_notify\n");
    fflush(stdout);
    sleep(1);
  }
  ARMCI_Barrier();

  for (ndim = 1; ndim <= MAXDIMS; ndim++) {
    test_notify(ndim);
  }
  ARMCI_Barrier();

  ARMCI_Finalize();
  armci_msg_finalize();
  return(0);
}
示例#6
0
文件: perf2.c 项目: jeffhammond/ga
int main(int argc, char **argv)
{
    armci_msg_init(&argc,&argv);
    ARMCI_Init_args(&argc, &argv);
    me = armci_msg_me();
    nproc = armci_msg_nproc();

    /* This test only works for two processes */

    assert(nproc == 2);

    if (0 == me) {
        printf("msg size (bytes)     avg time (us)    avg b/w (MB/sec)\n");
    }

    if (0 == me) {
        printf("#PNNL comex Put Test\n");
    }
    contig_test(MAX_MESSAGE_SIZE, PUT);

    if (0 == me) {
        printf("#PNNL comex Get Test\n");
    }
    contig_test(MAX_MESSAGE_SIZE, GET);
   
    if (0 == me) {
        printf("#PNNL comex Accumulate Test\n");
    }
    contig_test(MAX_MESSAGE_SIZE, ACC);
    
    ARMCI_Finalize();
    armci_msg_finalize();

    return 0;
}
示例#7
0
文件: perf.c 项目: arnolda/scafacos
int main(int argc, char **argv)
{

    ARMCI_NetInit();
  MP_INIT(argc,argv);
  MP_MYID(&me);
  MP_PROCS(&nproc);

    if(nproc < 2 || nproc> MAXPROC) {
        if(me == 0)
            fprintf(stderr,
                    "USAGE: 2 <= processes < %d - got %d\n", MAXPROC, nproc);
        MP_BARRIER();
        MP_FINALIZE();
        exit(0);
    }
    
    /* initialize ARMCI */
    ARMCI_Init();

    if(!me)printf("\n             Performance of Basic Blocking Communication Operations\n");
    MP_BARRIER();
    
    CHECK_RESULT=1;
    if(!me)printf("\n\t\t\tContiguous Data Transfer\n");
    test_1D();
    CHECK_RESULT=0;

    /* test 1 dimension array */
    if(!me)printf("\n\t\t\tContiguous Data Transfer\n");
    test_1D();
    
    /* test 2 dimension array */
    if(!me)printf("\n\t\t\tStrided Data Transfer\n");
    test_2D();

    MP_BARRIER();
    if(me == 0){
       if(warn_accuracy) 
          printf("\nWARNING: Your timer does not have sufficient accuracy for this test (%d)\n",warn_accuracy);
       printf("\n\n------------ Testing the same data transfer for correctness ----------\n");
       fflush(stdout);
    }

    MP_BARRIER();
    CHECK_RESULT=1;
    if(!me)printf("\n\t\t\tContiguous Data Transfer\n");
    test_1D();
    if(me == 0) printf("OK\n");
    MP_BARRIER();
    if(!me)printf("\n\t\t\tStrided Data Transfer\n");
    test_2D();
    if(me == 0) printf("OK\n\n\nTests Completed.\n");
    MP_BARRIER();

    /* done */
    ARMCI_Finalize();
    MP_FINALIZE();
    return(0);
}    
示例#8
0
int main(int argc, char ** argv) {
  int    rank, nproc, val, i;
  void **base_ptrs;

  MPI_Init(&argc, &argv);
  ARMCI_Init();

  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &nproc);

  if (rank == 0) printf("Starting ARMCI mutex read-modify-write test with %d processes\n", nproc);

  base_ptrs = malloc(nproc*sizeof(void*));

  ARMCI_Create_mutexes(rank == 0 ? 1 : 0);
  ARMCI_Malloc(base_ptrs, (rank == 0) ? sizeof(int) : 0); // Proc 0 has a shared int

  if (rank == 0) {
    val = 0;
    ARMCI_Put(&val, base_ptrs[0], sizeof(int), 0);
  }

  ARMCI_Barrier();

  for (i = 0; i < NITER; i++) {
    ARMCI_Lock(0, 0);

    ARMCI_Get(base_ptrs[0], &val, sizeof(int), 0);
    val += ADDIN;
    ARMCI_Put(&val, base_ptrs[0], sizeof(int), 0);

    ARMCI_Unlock(0, 0);
  }

  printf(" + %3d done\n", rank);
  fflush(NULL);

  ARMCI_Barrier();

  if (rank == 0) {
    ARMCI_Get(base_ptrs[0], &val, sizeof(int), 0);

    if (val == ADDIN*nproc*NITER)
      printf("Test complete: PASS.\n");
    else
      printf("Test complete: FAIL.  Got %d, expected %d.\n", val, ADDIN*nproc*NITER);
  }

  ARMCI_Free(base_ptrs[rank]);
  ARMCI_Destroy_mutexes();
  free(base_ptrs);

  ARMCI_Finalize();
  MPI_Finalize();

  return 0;
}
示例#9
0
int main(int argc, char ** argv) {
  MPI_Init(&argc, &argv);
  ARMCI_Init();

  ARMCI_Get(NULL, NULL, 1, 0);

  ARMCI_Finalize();
  MPI_Finalize();

  return 0;
}
示例#10
0
int main(int argc, char** argv)
{
      MPI_Init(&argc, &argv);
      MPI_Comm_size(MPI_COMM_WORLD, &nproc);
      MPI_Comm_rank(MPI_COMM_WORLD, &me);
      if(me==0)printf("Testing IPCs (%d MPI processes)\n\n",nproc);
      ARMCI_Init();
      test();
      ARMCI_Finalize();
      MPI_Finalize();
      return 0;

}
示例#11
0
int main(int argc, char ** argv) {
  int    rank, nproc, i;
  int   *buf;

  MPI_Init(&argc, &argv);
  ARMCI_Init();

  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &nproc);

  if (rank == 0) printf("Starting ARMCI GOP test with %d processes\n", nproc);

  buf = malloc(DATA_SZ*sizeof(int));

  if (rank == 0) printf(" - Testing ABSMIN\n");

  for (i = 0; i < DATA_SZ; i++)
    buf[i] = (rank+1) * ((i % 2) ? -1 : 1);

  armci_msg_igop(buf, DATA_SZ, "absmin");

  for (i = 0; i < DATA_SZ; i++)
    if (buf[i] != 1) {
      printf("Err: buf[%d] = %d expected 1\n", i, buf[i]);
      ARMCI_Error("Fail", 1);
    }

  if (rank == 0) printf(" - Testing ABSMAX\n");

  for (i = 0; i < DATA_SZ; i++)
    buf[i] = (rank+1) * ((i % 2) ? -1 : 1);

  armci_msg_igop(buf, DATA_SZ, "absmax");

  for (i = 0; i < DATA_SZ; i++)
    if (buf[i] != nproc) {
      printf("Err: buf[%d] = %d expected %d\n", i, buf[i], nproc);
      ARMCI_Error("Fail", 1);
    }

  free(buf);

  if (rank == 0) printf("Pass.\n");

  ARMCI_Finalize();
  MPI_Finalize();

  return 0;
}
示例#12
0
文件: ddi_armci.c 项目: ryanolson/ddi
void DDI_ARMCI_Finalize() {
  int code;
  const DDI_Comm *comm = (const DDI_Comm *) Comm_find(DDI_COMM_WORLD);
  
#if defined DDI_ARMCI_FREE
  code = ARMCI_Free((void*)(gv(armci_mem_addr)[comm->me]));
  if (code > 0) fprintf(stderr,"ARMCI_Free(%p) failed: %i",gv(armci_mem_addr)[comm->me]);

  code = ARMCI_Free((void*)(gv(armci_cnt_addr)[comm->me]));
  if (code > 0) fprintf(stderr,"ARMCI_Free(%p) failed: %i",gv(armci_cnt_addr)[comm->me]);
#endif

  code = ARMCI_Destroy_mutexes();
  if (code > 0) fprintf(stderr,"ARMCI_Destory_mutexes failed: %i",code);

  ARMCI_Finalize();
}
示例#13
0
int main(int argc, char* argv[])
{

    MP_INIT(argc, argv);
    MP_PROCS(&nproc);
    MP_MYID(&me);

/*    printf("nproc = %d, me = %d\n", nproc, me);*/
    
    if( (nproc<MINPROC || nproc>MAXPROC) && me==0)
       ARMCI_Error("Test works for up to %d processors\n",MAXPROC);

    if(me==0){
       printf("ARMCI test program (%d processes)\n",nproc); 
       fflush(stdout);
       sleep(1);
    }
    
    ARMCI_Init();

    if(me==0){
      printf("\n Testing ARMCI Groups!\n\n");
      fflush(stdout);
    }

    test_groups();
    
    ARMCI_AllFence();
    MP_BARRIER();
    if(me==0){printf("\n Collective groups: Success!!\n"); fflush(stdout);}
    sleep(2);

#ifdef ARMCI_GROUP
    test_groups_noncollective();

    ARMCI_AllFence();
    MP_BARRIER();
    if(me==0){printf("\n Non-collective groups: Success!!\n"); fflush(stdout);}
    sleep(2);
#endif
	
    MP_BARRIER();
    ARMCI_Finalize();
    MP_FINALIZE();
    return(0);
}
示例#14
0
int main(int argc, char **argv)
{
    int i, j, rank, nranks, msgsize, dest;
    int xdim, ydim;
    long bufsize;
    double **buffer;
    double t_start, t_stop, t_latency;
    int count[2], src_stride, trg_stride, stride_level;
    int provided;

    MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nranks);

    ARMCI_Init_args(&argc, &argv);

    ARMCI_Barrier();
 
    int me = armci_msg_me();
    int node = armci_domain_my_id(ARMCI_DOMAIN_SMP);

    printf("MPI_Rank: %d, \
            armci_msg_nproc: %d \
            armci_msg_me: %d, \
            armci_domain_id: %d, \
            armci_domain_same_id: %d,\ 
            armci_domain_my_id: %d, \ 
            armci_domain_count: %d, \
            armci_domain_nprocs: %d, \
            armci_domain_glob_proc_id: %d \n",
              rank, armci_msg_nproc(), me, armci_domain_id(ARMCI_DOMAIN_SMP, me),
              armci_domain_same_id(ARMCI_DOMAIN_SMP, me), armci_domain_my_id(ARMCI_DOMAIN_SMP),
              armci_domain_count(ARMCI_DOMAIN_SMP), armci_domain_nprocs(ARMCI_DOMAIN_SMP, node),
              armci_domain_glob_proc_id(ARMCI_DOMAIN_SMP, node, 0));
    fflush(stdout);

    ARMCI_Free((void *) buffer[rank]);

    ARMCI_Finalize();

    MPI_Finalize();

    return 0;
}
示例#15
0
int main(int argc, char* argv[])
{
ARMCI_NetInit();

    MP_INIT(argc, argv);
    MP_PROCS(&nproc);
    MP_MYID(&me);

    if(nproc < 2 || nproc> MAXPROC) {
      if(me == 0)
	fprintf(stderr,
		"USAGE: 2 <= processes < %d - got %d\n", MAXPROC, nproc);
      MP_BARRIER();
      MP_FINALIZE();
      exit(0);
    }

    if(me==0){
       printf("ARMCI test program (%d processes)\n",nproc); 
       fflush(stdout);
       sleep(1);
    }
    
    ARMCI_Init();

    if(me==0){
      printf("\n put/get/acc requests (Time in secs)\n\n");
      fflush(stdout);
    }

    test_perf_nb(1);
    test_perf_nb(0);
    
    ARMCI_AllFence();
    MP_BARRIER();
    if(me==0){printf("\nSuccess!!\n"); fflush(stdout);}
    sleep(2);
	
    MP_BARRIER();
    ARMCI_Finalize();
    MP_FINALIZE();
    return(0);
}
示例#16
0
文件: put.c 项目: brog2610/quinoa
int main(int argc, char * argv[]) {
  void *baseAddress[MAX_PROCESSORS];
  char *local;
  int thisImage;

  int iter = 100, size;
  double startTime, endTime;
  int i;

  // initialize
  ARMCI_Init();
  ARMCI_Myid(&thisImage);

  // allocate data (collective operation)
  ARMCI_Malloc(baseAddress, MAX_BUF_SIZE*sizeof(char));
  local = (char *)ARMCI_Malloc_local(MAX_BUF_SIZE*sizeof(char));

  ARMCI_Barrier();
  ARMCI_Migrate();

  if (thisImage == 0) {
    for(size = 1; size <= MAX_BUF_SIZE; size = size<<1){
      startTime = CkWallTimer();
      for(i = 0; i < iter; i++){
        ARMCI_Put(local, baseAddress[1], size, 1);
      }
      ARMCI_Fence(1);
      endTime = CkWallTimer();
      printf("%d: %f us\n", size, (endTime-startTime)*1000);
    }
    ARMCI_Barrier();
  } else if (thisImage == 1) {
    ARMCI_Barrier();
  }

  
  ARMCI_Free(baseAddress[thisImage]);
  ARMCI_Free_local(local);
  // finalize
  ARMCI_Finalize();
  return 0;
}
示例#17
0
int main(int argc, char ** argv) {
  int     rank, nproc, test_iter;
  void ***base_ptrs;

  MPI_Init(&argc, &argv);
  ARMCI_Init();

  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &nproc);

  if (rank == 0) printf("Starting ARMCI memory allocation test with %d processes\n", nproc);

  base_ptrs = malloc(sizeof(void**)*NUM_ITERATIONS);

  // Perform a pile of allocations
  for (test_iter = 0; test_iter < NUM_ITERATIONS; test_iter++) {
    if (rank == 0) printf(" + allocation %d\n", test_iter);

    base_ptrs[test_iter] = malloc(sizeof(void*)*nproc);
    ARMCI_Malloc((void**)base_ptrs[test_iter], (test_iter % 4 == 0) ? 0 : DATA_SZ);
  }

  ARMCI_Barrier();

  // Free all allocations
  for (test_iter = 0; test_iter < NUM_ITERATIONS; test_iter++) {
    if (rank == 0) printf(" + free %d\n", test_iter);

    ARMCI_Free(((void**)base_ptrs[test_iter])[rank]);
    free(base_ptrs[test_iter]);
  }

  free(base_ptrs);

  if (rank == 0) printf("Test complete: PASS.\n");

  ARMCI_Finalize();
  MPI_Finalize();

  return 0;
}
int main(int argc, char* argv[])
{

    armci_msg_init(&argc, &argv);
    nproc = armci_msg_nproc();
    me = armci_msg_me();

    /*    printf("nproc = %d, me = %d\n", nproc, me);*/

    if(nproc>MAXPROC && me==0)
        ARMCI_Error("Test works for up to %d processors\n",MAXPROC);

    if(me==0) {
        printf("ARMCI test program (%d processes)\n",nproc);
        fflush(stdout);
        sleep(1);
    }

    ARMCI_Init();

    if(me==0) {
        printf("\n  Performing Sparse Matrix-Vector Multiplication ...\n\n");
        fflush(stdout);
    }
    test_sparse();

    ARMCI_AllFence();
    armci_msg_barrier();
    if(me==0) {
        printf("\nSuccess!!\n");
        fflush(stdout);
    }
    sleep(2);

    armci_msg_barrier();
    ARMCI_Finalize();
    armci_msg_finalize();
    return(0);
}
示例#19
0
int main(int argc, char *argv[])
{
  ARMCI_Init_args(&argc, &argv);
  nproc = armci_msg_nproc();
  me = armci_msg_me();

  /*    printf("nproc = %d, me = %d\n", nproc, me);*/

  if (nproc > MAXPROC && me == 0) {
    ARMCI_Error("Test works for up to %d processors\n", MAXPROC);
  }

  if (me == 0) {
    printf("ARMCI test program (%d processes)\n", nproc);
    fflush(stdout);
    sleep(1);
  }

  if (me == 0) {
    printf("\nAggregate put/get requests\n\n");
    fflush(stdout);
  }
  test_aggregate(1); /* cold start */
  test_aggregate(0); /* warm start */

  ARMCI_AllFence();
  ARMCI_Barrier();
  if (me == 0) {
    printf("\nSuccess!!\n");
    fflush(stdout);
  }
  sleep(2);

  ARMCI_Barrier();
  ARMCI_Finalize();
  armci_msg_finalize();
  return(0);
}
示例#20
0
int main(int argc, char* argv[])
{

    MP_INIT(argc, argv);
    MP_PROCS(&nproc);
    MP_MYID(&me);

/*    printf("nproc = %d, me = %d\n", nproc, me);*/
    
    if(nproc>MAXPROC && me==0)
       ARMCI_Error("Test works for up to %d processors\n",MAXPROC);

    if(me==0){
       printf("ARMCI test program (%d processes)\n",nproc); 
       fflush(stdout);
       sleep(1);
    }
    
    ARMCI_Init();

    if(me==0){
      printf("\nAggregate put/get requests\n\n");
      fflush(stdout);
    }
    test_aggregate(1); /* cold start */
    test_aggregate(0); /* warm start */
    
    ARMCI_AllFence();
    MP_BARRIER();
    if(me==0){printf("\nSuccess!!\n"); fflush(stdout);}
    sleep(2);
	
    MP_BARRIER();
    ARMCI_Finalize();
    MP_FINALIZE();
    return(0);
}
示例#21
0
文件: test2.c 项目: arnolda/scafacos
int main(int argc, char* argv[])
{
    int i;
    struct timeval start_time[14];
    struct timeval stop_time[14];
    /*
      char * test_name[14] = {
      "dim", "nbdim", "vec_small", "acc",
      "vector", "vector_acc", "fetch_add",
      "swap", "rput", "aggregate", "implicit",
      "memlock", "acc_type", "collective"
      };
      int test_flags[14] = {
      1, 1, 1, 1,
      1, 1, 1,
      1, 1, 0, 1,
      1, 1, 1
      };
    */
    char * test_name[2] = { "acc_type", "collective" };
    int test_flags[2]   = { 1, 1 };

#define TEST_ACC_TYPE   0
#define TEST_COLLECTIVE 1

    MP_INIT(argc, argv);
    ARMCI_Init();
    MP_PROCS(&nproc);
    MP_MYID(&me);

    if(nproc > MAXPROC && me == 0)
       ARMCI_Error("Test works for up to %d processors\n",MAXPROC);

    if(me == 0)
    {
       printf("ARMCI test program (%d processes)\n",nproc); 
       fflush(stdout);
       sleep(1);
    }    

    gettimeofday(&start_time[TEST_ACC_TYPE],NULL);
    if(test_flags[TEST_ACC_TYPE] == 1)
    {
       if(me == 0)
       {
          printf("\nTesting Accumulate Types\n");
          fflush(stdout);
       }
       
       MP_BARRIER();
       if(me == 0)
       {
          printf("Test Accumulate ARMCI_ACC_INT\n");
          fflush(stdout);
       }
       test_acc_type(ARMCI_ACC_INT);
       ARMCI_AllFence();
       MP_BARRIER();
       if(me == 0)
       {
          printf("Test Accumulate ARMCI_ACC_LNG\n");
          fflush(stdout);
       }
       test_acc_type(ARMCI_ACC_LNG);
       ARMCI_AllFence();
       MP_BARRIER();
       if(me == 0)
       {
          printf("Test Accumulate ARMCI_ACC_FLT\n");
          fflush(stdout);
       }
       test_acc_type(ARMCI_ACC_FLT);
       ARMCI_AllFence();
       MP_BARRIER();
       if(me == 0)
       {
          printf("Test Accumulate ARMCI_ACC_DBL\n");
          fflush(stdout);
       }
       test_acc_type(ARMCI_ACC_DBL);
       ARMCI_AllFence();
       MP_BARRIER();
       if(me == 0)
       {
          printf("Test Accumulate ARMCI_ACC_CPL\n");
          fflush(stdout);
       }
       test_acc_type(ARMCI_ACC_CPL);
       ARMCI_AllFence();
       MP_BARRIER();
       if(me == 0)
       {
          printf("Test Accumulate ARMCI_ACC_DCP\n");
          fflush(stdout);
       }
       test_acc_type(ARMCI_ACC_DCP);
       ARMCI_AllFence();
       MP_BARRIER();
    }
    gettimeofday(&stop_time[TEST_ACC_TYPE],NULL);

    gettimeofday(&start_time[TEST_COLLECTIVE],NULL);
    if(test_flags[TEST_COLLECTIVE] == 1)
    {
       if(me == 0)
       {
          printf("\nTesting Collective Types\n");
          fflush(stdout);
       }
       if(me == 0)
       {
          printf("Test Collective ARMCI_INT\n");
          fflush(stdout);
       }
       MP_BARRIER();
       test_collective(ARMCI_INT);
       MP_BARRIER();
       if(me == 0)
       {
          printf("Test Collective ARMCI_LONG\n");
          fflush(stdout);
       }
       MP_BARRIER();
       test_collective(ARMCI_LONG);
       MP_BARRIER();
       if(me == 0)
       {
          printf("Test Collective ARMCI_FLOAT\n");
          fflush(stdout);
       }
       MP_BARRIER();
       test_collective(ARMCI_FLOAT);
       MP_BARRIER();
       if(me == 0)
       {
          printf("Test Collective ARMCI_DOUBLE\n");
          fflush(stdout);
       }
       MP_BARRIER();
       test_collective(ARMCI_DOUBLE);
       MP_BARRIER();
    }
    gettimeofday(&stop_time[TEST_COLLECTIVE],NULL);
    
    if(me == 0)
    {
       printf("Accumulate and Collective tests passed\n");
       fflush(stdout);
    }

    if(me == 0)
    {
       printf("Testcase runtime\n");
       printf("Name,Time(seconds)\n");
       for(i = 0; i < 2; i++)
          if(test_flags[i] == 1)
          {
             double time_spent = (stop_time[i].tv_sec - start_time[i].tv_sec) + ((double) stop_time[i].tv_usec - start_time[i].tv_usec) / 1E6;
             printf("%s,%.6f\n", test_name[i], time_spent);
          }
    }

    MP_BARRIER();
    ARMCI_Finalize();
    MP_FINALIZE();
    return(0);
}
示例#22
0
int main(int argc, char **argv) {
    int i, j, rank, nranks, peer, bufsize, errors;
    double **buffer, *src_buf;
    int count[2], src_stride, trg_stride, stride_level;

    MPI_Init(&argc, &argv);
    ARMCI_Init();

    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nranks);

    buffer = (double **) malloc(sizeof(double *) * nranks);

    bufsize = XDIM * YDIM * sizeof(double);
    ARMCI_Malloc((void **) buffer, bufsize);
    src_buf = ARMCI_Malloc_local(bufsize);

    if (rank == 0)
        printf("ARMCI Strided Put Test:\n");

    src_stride = XDIM * sizeof(double);
    trg_stride = XDIM * sizeof(double);
    stride_level = 1;

    count[1] = YDIM;
    count[0] = XDIM * sizeof(double);

    ARMCI_Barrier();

    peer = (rank+1) % nranks;

    for (i = 0; i < ITERATIONS; i++) {

      for (j = 0; j < XDIM*YDIM; j++) {
        *(src_buf + j) = rank + i;
      }

      ARMCI_PutS(
          src_buf,
          &src_stride,
          (void *) buffer[peer],
          &trg_stride,
          count,
          stride_level,
          peer);
    }

    ARMCI_Barrier();

    ARMCI_Access_begin(buffer[rank]);
    for (i = errors = 0; i < XDIM; i++) {
      for (j = 0; j < YDIM; j++) {
        const double actual   = *(buffer[rank] + i + j*XDIM);
        const double expected = (1.0 + rank) + (1.0 + ((rank+nranks-1)%nranks)) + (ITERATIONS);
        if (actual - expected > 1e-10) {
          printf("%d: Data validation failed at [%d, %d] expected=%f actual=%f\n",
              rank, j, i, expected, actual);
          errors++;
          fflush(stdout);
        }
      }
    }
    ARMCI_Access_end(buffer[rank]);

    ARMCI_Free((void *) buffer[rank]);
    ARMCI_Free_local(src_buf);
    free(buffer);

    ARMCI_Finalize();
    MPI_Finalize();

    if (errors == 0) {
      printf("%d: Success\n", rank);
      return 0;
    } else {
      printf("%d: Fail\n", rank);
      return 1;
    }
}
示例#23
0
int main(int argc, char **argv)
{

    int i, j, rank, nranks, peer;
    size_t xdim, ydim;
    unsigned long bufsize;
    double **buffer, *src_buf;
    double t_start=0.0, t_stop;
    int count[2], src_stride, trg_stride, stride_level;
    double scaling;
    int provided;

    MPI_Init_thread(&argc, &argv, MPI_THREAD_SINGLE, &provided);

    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nranks);

    if (nranks < 2) {
        printf("%s: Must be run with at least 2 processes\n", argv[0]);
        MPI_Abort(MPI_COMM_WORLD, 1);
    }

    ARMCI_Init_args(&argc, &argv);

    buffer = (double **) malloc(sizeof(double *) * nranks);

    bufsize = MAX_XDIM * MAX_YDIM * sizeof(double);
    ARMCI_Malloc((void **) buffer, bufsize);
    src_buf = ARMCI_Malloc_local(bufsize);

    if (rank == 0)
    {
        printf("ARMCI_AccS Latency - local and remote completions - in usec \n");
        printf("%30s %22s %22s\n",
               "Dimensions(array of double)",
               "Local Completion",
               "Remote completion");
        fflush(stdout);
    }

    ARMCI_Access_begin(buffer[rank]);
    for (i = 0; i < bufsize / sizeof(double); i++)
    {
      *(buffer[rank] + i) = 1.0 + rank;
      *(src_buf + i) = 1.0 + rank;
    }
    ARMCI_Access_end(buffer[rank]);

    scaling = 2.0;

    src_stride = MAX_YDIM * sizeof(double);
    trg_stride = MAX_YDIM * sizeof(double);
    stride_level = 1;

    ARMCI_Barrier();

    for (xdim = 1; xdim <= MAX_XDIM; xdim *= 2)
    {

        count[1] = xdim;

        for (ydim = 1; ydim <= MAX_YDIM; ydim *= 2)
        {

            count[0] = ydim * sizeof(double);

            if (rank == 0)
            {

                peer = 1;

                for (i = 0; i < ITERATIONS + SKIP; i++)
                {

                    if (i == SKIP) t_start = MPI_Wtime();

                    ARMCI_AccS(ARMCI_ACC_DBL,
                               (void *) &scaling,
                               /* (void *) buffer[rank] */ src_buf,
                               &src_stride,
                               (void *) buffer[peer],
                               &trg_stride,
                               count,
                               stride_level,
                               1);

                }
                t_stop = MPI_Wtime();
                ARMCI_Fence(1);

                char temp[10];
                sprintf(temp, "%dX%d", (int) xdim, (int) ydim);
                printf("%30s %20.2f ", temp, ((t_stop - t_start) * 1000000)
                        / ITERATIONS);
                fflush(stdout);

                ARMCI_Barrier();

                ARMCI_Barrier();

                for (i = 0; i < ITERATIONS + SKIP; i++)
                {

                    if (i == SKIP) t_start = MPI_Wtime();

                    ARMCI_AccS(ARMCI_ACC_DBL,
                               (void *) &scaling,
                               /* (void *) buffer[rank] */ src_buf,
                               &src_stride,
                               (void *) buffer[peer],
                               &trg_stride,
                               count,
                               stride_level,
                               1);
                    ARMCI_Fence(1);

                }
                t_stop = MPI_Wtime();
                printf("%20.2f \n", ((t_stop - t_start) * 1000000) / ITERATIONS);
                fflush(stdout);

                ARMCI_Barrier();

                ARMCI_Barrier();

            }
            else
            {

                peer = 0;

                ARMCI_Barrier();

                if (rank == 1) 
                {
                  ARMCI_Access_begin(buffer[rank]);
                  for (i = 0; i < xdim; i++)
                  {
                    for (j = 0; j < ydim; j++)
                    {
                      if (*(buffer[rank] + i * MAX_XDIM + j) != ((1.0 + rank)
                            + scaling * (1.0 + peer) * (ITERATIONS + SKIP)))
                      {
                        printf("Data validation failed at X: %d Y: %d Expected : %f Actual : %f \n",
                            i,
                            j,
                            ((1.0 + rank) + scaling * (1.0 + peer)),
                            *(buffer[rank] + i * MAX_YDIM + j));
                        fflush(stdout);
                        ARMCI_Error("Bailing out", 1);
                      }
                    }
                  }

                  for (i = 0; i < bufsize / sizeof(double); i++)
                  {
                    *(buffer[rank] + i) = 1.0 + rank;
                  }
                  ARMCI_Access_end(buffer[rank]);
                }

                ARMCI_Barrier();

                ARMCI_Barrier();

                if (rank == 1) 
                {
                  ARMCI_Access_begin(buffer[rank]);

                  for (i = 0; i < xdim; i++)
                  {
                    for (j = 0; j < ydim; j++)
                    {
                      if (*(buffer[rank] + i * MAX_XDIM + j) != ((1.0 + rank)
                            + scaling * (1.0 + peer) * (ITERATIONS + SKIP)))
                      {
                        printf("Data validation failed at X: %d Y: %d Expected : %f Actual : %f \n",
                            i,
                            j,
                            ((1.0 + rank) + scaling * (1.0 + peer)),
                            *(buffer[rank] + i * MAX_YDIM + j));
                        fflush(stdout);
                        ARMCI_Error("Bailing out", 1);
                      }
                    }
                  }

                  for (i = 0; i < bufsize / sizeof(double); i++)
                  {
                    *(buffer[rank] + i) = 1.0 + rank;
                  }

                  ARMCI_Access_end(buffer[rank]);
                }
                ARMCI_Barrier();

            }

        }

    }

    ARMCI_Barrier();

    ARMCI_Free((void *) buffer[rank]);
    ARMCI_Free_local(src_buf);
    free(buffer);

    ARMCI_Finalize();

    MPI_Finalize();

    return 0;
}
示例#24
0
int main(int argc, char* argv[])
{
    int provided;
    int i, rank, nranks, msgsize, target;
    long bufsize;
    int **counter;
    int *complete;
    int increment;
    int counter_fetch;
    int counters_received;
    int t_start, t_stop, t_latency;
    int expected;

    MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);

    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nranks);

    ARMCI_Init_args(&argc, &argv);

    complete = (int *) malloc(sizeof(int) * COUNT);

    counter = (int**) ARMCI_Malloc_local( nranks * sizeof(int*) );
    ARMCI_Malloc((void *) counter[rank], sizeof(int));

    if (rank == 0)
    {
        printf("ARMCI_RMW Test - in usec \n");
        fflush(stdout);
    }

    target = 0; 

    for(i=0; i<COUNT; i++)
    {
       complete[i] = 0;
    } 
    if(rank == target) 
    { 
       *(counter[rank]) = 0;
    }
    increment = 1;
    counter_fetch = 0;
    counters_received = 0;

    MPI_Barrier(MPI_COMM_WORLD);
 
    while(counter_fetch < COUNT)
    {  
        ARMCI_Rmw(ARMCI_FETCH_AND_ADD,
                  (void *) &counter_fetch,
                  (void *) counter[target],
                  increment,
                  target);

        /* s/1/rank/ means we will know who got the counter */
        if (counter_fetch < COUNT) complete[counter_fetch] = rank;
        counters_received++;
    }

    MPI_Allreduce(MPI_IN_PLACE,complete,COUNT,MPI_INT,MPI_SUM,MPI_COMM_WORLD);

    for(i=0; i<COUNT; i++)
    {
       if (complete[i] == 0)
       {
           printf("[%d] The RMW update failed at index: %d \n", rank, i);
           fflush(stdout);
           exit(-1);
       }   
    }
    printf("[%d] The RMW update completed successfully \n", rank);
    fflush(stdout);
    MPI_Barrier(MPI_COMM_WORLD);

    if (0==rank)
    {
        printf("Checking for fairness...\n", rank);
        fflush(stdout);
        for(i=0; i<COUNT; i++)
        {
           printf("counter value %d was received by process %d\n", i, complete[i]);
        }
        fflush(stdout);
    }
    MPI_Barrier(MPI_COMM_WORLD);

    printf("process %d received %d counters\n", rank, counters_received);
    fflush(stdout);

    ARMCI_Free(counter[rank]);
    ARMCI_Free_local(counter);

    ARMCI_Finalize();

    MPI_Finalize();

    return 0;
}
示例#25
0
int main(int argc, char **argv) {
  int                      me, nproc;
  int                      i, *procs;
  ARMCI_Group              g_world, g_odd, g_even;

  MPI_Init(&argc, &argv);
  ARMCI_Init();

  MPI_Comm_rank(MPI_COMM_WORLD, &me);
  MPI_Comm_size(MPI_COMM_WORLD, &nproc);

  procs = malloc(sizeof(int) * ( nproc/2 + (nproc % 2 ? 1 : 0 )));

  if (me == 0) printf("ARMCI Group test starting on %d procs\n", nproc);

  ARMCI_Group_get_world(&g_world);
  
  if (me == 0) printf(" + Creating odd group\n");

  for (i = 1; i < nproc; i += 2) {
    procs[i/2] = i;
  }

  ARMCI_Group_create_child(i/2, procs, &g_odd, &g_world);

  if (me == 0) printf(" + Creating even group\n");

  for (i = 0; i < nproc; i += 2) {
    procs[i/2] = i;
  }

  ARMCI_Group_create_child(i/2, procs, &g_even, &g_world);

  /***********************************************************************/
  {
    int    grp_me, grp_nproc;
    double t_abs_to_grp, t_grp_to_abs;
    const int iter = 1000000;

    if (me == 0) {
      ARMCI_Group_rank(&g_even, &grp_me);
      ARMCI_Group_size(&g_even, &grp_nproc);

      t_abs_to_grp = MPI_Wtime();

      for (i = 0; i < iter; i++)
        ARMCII_Translate_absolute_to_group(&g_even, (grp_me+1) % grp_nproc);

      t_abs_to_grp = MPI_Wtime() - t_abs_to_grp;

      t_grp_to_abs = MPI_Wtime();

      for (i = 0; i < iter; i++)
        ARMCI_Absolute_id(&g_even, (grp_me+1) % grp_nproc);

      t_grp_to_abs = MPI_Wtime() - t_grp_to_abs;

      printf("t_abs_to_grp = %f us, t_grp_to_abs = %f us\n", t_abs_to_grp/iter * 1.0e6, t_grp_to_abs/iter * 1.0e6);
    }

    ARMCI_Barrier();
  }
  /***********************************************************************/

  if (me == 0) printf(" + Freeing groups\n");

  if (me % 2 > 0)
    ARMCI_Group_free(&g_odd);
  else
    ARMCI_Group_free(&g_even);

  free(procs);

  ARMCI_Finalize();
  MPI_Finalize();

  return 0;
}
示例#26
0
int main(int argc, char *argv[])
{
    int i, j;
    int ch;
    extern char *optarg;
    int edge;
    int size;
    
    /* ARMCI */
    void **ptr;
    double **ptr_loc;
    
    MP_INIT(argc,argv);
    MP_PROCS(&nproc);
    MP_MYID(&me);
    
    while ((ch = getopt(argc, argv, "n:b:p:h")) != -1) {
        switch(ch) {
            case 'n': n = atoi(optarg); break;
            case 'b': block_size = atoi(optarg); break;
            case 'p': nproc = atoi(optarg); break;
            case 'h': {
                printf("Usage: LU, or \n");
        printf("       LU -nMATRIXSIZE -bBLOCKSIZE -pNPROC\n");
                MP_BARRIER();
                MP_FINALIZE();
                exit(0);
            }            
        }
    }
    
    if(me == 0) {
        printf("\n Blocked Dense LU Factorization\n");
        printf("     %d by %d Matrix\n", n, n);
        printf("     %d Processors\n", nproc);
        printf("     %d by %d Element Blocks\n", block_size, block_size);
        printf("\n");
    }
    
/*      num_rows = (int) sqrt((double) nproc); */
/*      for (;;) { */
/*          num_cols = nproc/num_rows; */
/*          if (num_rows*num_cols == nproc) */
/*              break; */
/*          num_rows--; */
/*      } */
    
    nblocks = n/block_size;
    if (block_size * nblocks != n) {
        nblocks++;
    }

    nnodes = nproc / 4;
    if((nnodes * 4) != nproc) {
        num_cols = nproc - nnodes * 4;
        nnodes++;
        num_rows = 1;
    }
    else {
        num_cols = 2;
        num_rows = 2;
    }    
    
    num = (nblocks * nblocks)/nnodes;
    if((num * nnodes) != (nblocks * nblocks))
        num++;

#ifdef DEBUG
    if(me == 0)
        for (i=0;i<nblocks;i++) {
            for (j=0;j<nblocks;j++) 
                printf("%d ", block_owner(i, j));
            printf("\n");
        }
    MP_BARRIER();
    MP_FINALIZE();
    exit(0);
#endif
    
    edge = n%block_size;
    if (edge == 0) {
        edge = block_size;
    }
    
    for (i=0;i<nblocks;i++) {
        for (j=0;j<nblocks;j++) {
            if(block_owner(i,j) == me) {
                if ((i == nblocks-1) && (j == nblocks-1)) {
                    size = edge*edge;
                }
                else if ((i == nblocks-1) || (j == nblocks-1)) {
                    size = edge*block_size;
                }
                else {
                    size = block_size*block_size;
                }
                proc_bytes += size*sizeof(double);
            }
        }
    }
    
    /* initialize ARMCI */
    ARMCI_Init();
    ptr = (void **)malloc(nproc * sizeof(void *));
    ARMCI_Malloc(ptr, proc_bytes);
    
    a = (double **)malloc(nblocks*nblocks*sizeof(double *));
    if (a == NULL) {
        fprintf(stderr, "Could not malloc memory for a\n");
        exit(-1);
    } 
    ptr_loc = (double **)malloc(nproc*sizeof(double *));
    for(i=0; i<nproc; i++) ptr_loc[i] = (double *)ptr[i];
    for(i=0; i<nblocks;i ++) {
        for(j=0; j<nblocks; j++) {
            a[i+j*nblocks] = ptr_loc[block_owner(i, j)];
            if ((i == nblocks-1) && (j == nblocks-1)) {
                size = edge*edge;
            } else if ((i == nblocks-1) || (j == nblocks-1)) {
                size = edge*block_size;
            } else {
                size = block_size*block_size;
            }
            ptr_loc[block_owner(i, j)] += size;
        }
    }
    
    /* initialize the array */
    init_array();
    
    /* barrier to ensure all initialization is done */
    MP_BARRIER();

    /* to remove cold-start misses, all processors touch their own data */
    touch_array(block_size, me);
    MP_BARRIER();

    if(doprint) {
        if(me == 0) {
            printf("Matrix before LU decomposition\n");
            print_array(me); 
        }
        MP_BARRIER();
    }
    

    /* Starting the timer */
    if(me == 0) start_timer();

    lu(n, block_size, me);
    
    MP_BARRIER();

    /* Timer Stops here */
    if(me == 0) 
        printf("\nRunning time = %lf milliseconds.\n\n",  elapsed_time());

    if(doprint) {        
        if(me == 0) {
            printf("after LU\n");
            print_array(me);
        }
        MP_BARRIER();
    }
    
    /* done */
    ARMCI_Free(ptr[me]);
    ARMCI_Finalize();
    MP_FINALIZE();

    return 0;
}
示例#27
0
文件: matmul.c 项目: arnolda/scafacos
int main(int argc, char **argv)
{
	int me,nproc;
    int status;
    int rank;

    /* initialization */
    MPI_Init(&argc, &argv);
    ARMCI_Init();

#ifdef HPC_PROFILING
    HPM_Init();
#endif

    MPI_Comm_rank(MPI_COMM_WORLD,&me);
    MPI_Comm_size(MPI_COMM_WORLD,&nproc);

#ifdef DEBUG
    if(me == 0){
       printf("The result of MPI_Comm_size is %d\n",nproc);
       fflush(stdout);
    }
#endif

    /* get the matrix parameters */
    if (argc > 1){
        rank = atoi(argv[1]);
    } else {
        rank = 8;
    }
    if (me == 0){
        printf("Running matmul.x with rank = %d\n",rank);
        fflush(stdout);
    }

    /* register remote pointers */
    double** addr_A = (double **) ARMCI_Malloc_local(sizeof(double *) * nproc);
    if (addr_A == NULL) ARMCI_Error("malloc A failed at line",0);

    double** addr_B = (double **) ARMCI_Malloc_local(sizeof(double *) * nproc);
    if (addr_B == NULL) ARMCI_Error("malloc B failed at line",0);

    double** addr_C = (double **) ARMCI_Malloc_local(sizeof(double *) * nproc);
    if (addr_C == NULL) ARMCI_Error("malloc C failed at line",0);

#ifdef DEBUG
    if(me == 0) printf("ARMCI_Malloc A requests %lu bytes\n",rank*rank*sizeof(double));
    fflush(stdout);
#endif
    status = ARMCI_Malloc((void **) addr_A, rank*rank*sizeof(double));
    if (status != 0) ARMCI_Error("ARMCI_Malloc A failed",status);

#ifdef DEBUG
    if(me == 0) printf("ARMCI_Malloc B requests %lu bytes\n",rank*rank*sizeof(double));
    fflush(stdout);
#endif
    status = ARMCI_Malloc((void **) addr_B, rank*rank*sizeof(double));
    if (status != 0) ARMCI_Error("ARMCI_Malloc B failed",status);

#ifdef DEBUG
    if(me == 0) printf("ARMCI_Malloc C requests %lu bytes\n",rank*rank*sizeof(double));
    fflush(stdout);
#endif
    status = ARMCI_Malloc((void **) addr_C, rank*rank*sizeof(double));
    if (status != 0) ARMCI_Error("ARMCI_Malloc C failed",status);

    MPI_Barrier(MPI_COMM_WORLD);

    /* free ARMCI pointers */
    ARMCI_Free_local(addr_C);
    ARMCI_Free_local(addr_B);
    ARMCI_Free_local(addr_A);

#ifdef HPC_PROFILING
    HPM_Print();
#endif

    /* the end */
    ARMCI_Finalize();
    MPI_Finalize();

    return(0);
}
示例#28
0
文件: lu.c 项目: dmlb2000/nwchem-cml
main(int argc, char *argv[])
{
    int i, j;
    int ch;
    extern char *optarg;
    int edge;
    int size;
    int nloop=5;
    double **ptr_loc;

    MP_INIT(arc,argv);
    MP_PROCS(&nproc);
    MP_MYID(&me);

    while ((ch = getopt(argc, argv, "n:b:p:h")) != -1) {
        switch(ch) {
        case 'n':
            n = atoi(optarg);
            break;
        case 'b':
            block_size = atoi(optarg);
            break;
        case 'p':
            nproc = atoi(optarg);
            break;
        case 'h': {
            printf("Usage: LU, or \n");
            printf("       LU -nMATRIXSIZE -bBLOCKSIZE -pNPROC\n");
            MP_BARRIER();
            MP_FINALIZE();
            exit(0);
        }
        }
    }

    if(me == 0) {
        printf("\n Blocked Dense LU Factorization\n");
        printf("     %d by %d Matrix\n", n, n);
        printf("     %d Processors\n", nproc);
        printf("     %d by %d Element Blocks\n", block_size, block_size);
        printf("\n");
    }

    num_rows = (int) sqrt((double) nproc);
    for (;;) {
        num_cols = nproc/num_rows;
        if (num_rows*num_cols == nproc)
            break;
        num_rows--;
    }

    nblocks = n/block_size;
    if (block_size * nblocks != n) {
        nblocks++;
    }

    edge = n%block_size;
    if (edge == 0) {
        edge = block_size;
    }

#ifdef DEBUG
    if(me == 0)
        for (i=0; i<nblocks; i++) {
            for (j=0; j<nblocks; j++)
                printf("%d ", block_owner(i, j));
            printf("\n");
        }
    MP_BARRIER();
    MP_FINALIZE();
    exit(0);
#endif

    for (i=0; i<nblocks; i++) {
        for (j=0; j<nblocks; j++) {
            if(block_owner(i,j) == me) {
                if ((i == nblocks-1) && (j == nblocks-1)) {
                    size = edge*edge;
                }
                else if ((i == nblocks-1) || (j == nblocks-1)) {
                    size = edge*block_size;
                }
                else {
                    size = block_size*block_size;
                }
                proc_bytes += size*sizeof(double);
            }
        }
    }

    ptr = (void **)malloc(nproc * sizeof(void *));
#ifdef MPI2_ONESIDED
    MPI_Alloc_mem(proc_bytes, MPI_INFO_NULL, &ptr[me]);
    MPI_Win_create((void*)ptr[me], proc_bytes, 1, MPI_INFO_NULL,
                   MPI_COMM_WORLD, &win);
    for(i=0; i<nproc; i++) ptr[i] = (double *)ptr[me];
    MPI_Barrier(MPI_COMM_WORLD);

#else
    /* initialize ARMCI */
    ARMCI_Init();
    ARMCI_Malloc(ptr, proc_bytes);
#endif

    a = (double **)malloc(nblocks*nblocks*sizeof(double *));
    if (a == NULL) {
        fprintf(stderr, "Could not malloc memory for a\n");
        exit(-1);
    }
    ptr_loc = (double **)malloc(nproc*sizeof(double *));
    for(i=0; i<nproc; i++) ptr_loc[i] = (double *)ptr[i];
    for(i=0; i<nblocks; i ++) {
        for(j=0; j<nblocks; j++) {
            a[i+j*nblocks] = ptr_loc[block_owner(i, j)];
            if ((i == nblocks-1) && (j == nblocks-1)) {
                size = edge*edge;
            } else if ((i == nblocks-1) || (j == nblocks-1)) {
                size = edge*block_size;
            } else {
                size = block_size*block_size;
            }
            ptr_loc[block_owner(i, j)] += size;
        }
    }

    /* initialize the array */
    init_array();

    /* barrier to ensure all initialization is done */
    MP_BARRIER();

    /* to remove cold-start misses, all processors touch their own data */
    touch_array(block_size, me);
    MP_BARRIER();

    if(doprint) {
        if(me == 0) {
            printf("Matrix before LU decomposition\n");
            print_array(me);
        }
        MP_BARRIER();
    }

    lu(n, block_size, me); /* cold start */

    /* Starting the timer */

    MP_BARRIER();
    if(me == 0) start_timer();
    for(i=0; i<nloop; i++) lu(n, block_size, me);
    MP_BARRIER();

    /* Timer Stops here */
    if(me == 0)
        printf("\nRunning time = %lf milliseconds.\n\n",  elapsed_time()/nloop);
    printf("%d: (ngets=%d) Communication (get) time = %e milliseconds\n", me, get_cntr, comm_time*1000/nloop);

    if(doprint) {
        if(me == 0) {
            printf("after LU\n");
            print_array(me);
        }
        MP_BARRIER();
    }

    /* done */
#ifdef MPI2_ONESIDED
    MPI_Win_free(&win);
    MPI_Free_mem(ptr[me]);
#else
    ARMCI_Free(ptr[me]);
    ARMCI_Finalize();
#endif
    MP_FINALIZE();
}
示例#29
0
int main(int argc, char *argv[])
{
    int ch;
    extern char *optarg;
    int i, j, r;
    thread_t threads[MAX_TPP];

    /* init MP */
    MP_INIT(argc,argv);
    MP_PROCS(&size);
    MP_MYID(&rank);

    while ((ch = getopt(argc, argv, "t:s:i:d:h")) != -1) {
        switch(ch) {
            case 't': /* # of threads */
                tpp = atoi(optarg);
                if (tpp < 1 || tpp > MAX_TPP) {
                    PRINTF0("\"%s\" is improper value for -t, should be a "
                            "number between 1 and %d(MAX_TPP)\n",
                            optarg, MAX_TPP);
                    usage();
                }
                break;
            case 'i': /* # of iterations */
                iters = atoi(optarg);
                if (iters < 1) {
                    PRINTF0("\"%s\" is improper value for -t, should be a "
                            "number equal or larger than 1\n", optarg);
                    usage();
                }
                break;
            case 's': /* # of elements in the array */
                asize = atoi(optarg);
                if (iters < 1) {
                    PRINTF0("\"%s\" is improper value for -s, should be a "
                            "number equal or larger than 1\n", optarg);
                    usage();
                }
                break;
            case 'd': delay = atoi(optarg); break; /* delay before start */
            case 'h': usage(); break; /* print usage info */
        }
    }
#ifdef NOTHREADS
    tpp = 1;
    PRINTF0("Warning: NOTHREADS debug symbol is set -- running w/o threads\n");
#endif
    th_size = size * tpp;
    PRINTF0("\nTest of multi-threaded capabilities:\n"
            "%d threads per process (%d threads total),\n"
            "%d array elements of size %d,\n"
            "%d iteration(s)\n\n", tpp, th_size, asize, sizeof(atype_t), iters);
    if (delay) {
        printf("%d: %d\n", rank, getpid());
        fflush(stdout);
        sleep(delay);
        MP_BARRIER();
    }
    TH_INIT(size,tpp);
    for (i = 0; i < tpp; i++) th_rank[i] = rank * tpp + i;

#if defined(DEBUG) && defined(LOG2FILE)
    for (i = 0; i < tpp; i++) {
        fname[10] = '0' + th_rank[i] / 100;
        fname[11] = '0' + th_rank[i] % 100 / 10;
        fname[12] = '0' + th_rank[i] % 10;
        dbg[i] = fopen(fname, "w");
    }
#endif
    for (i = 0; i < tpp; i++)
        prndbg(i, "proc %d, thread %d(%d):\n", rank, i, th_rank[i]);

    /* init ARMCI */
    ARMCI_Init();

    /* set global seed (to ensure same random sequence across procs) */
    time_seed = (unsigned)time(NULL);
    armci_msg_brdcst(&time_seed, sizeof(time_seed), 0);
    srand(time_seed); rand();
    prndbg(0, "seed = %u\n", time_seed);
    /* random pairs */
    pairs = calloc(th_size, sizeof(int));
    for (i = 0; i < th_size; i++) pairs[i] = -1;
    for (i = 0; i < th_size; i++) {
        if (pairs[i] != -1) continue;
        r = RND(0, th_size);
        while (i == r || pairs[r] != -1 ) r = RND(0, th_size);
        pairs[i] = r; pairs[r] = i;
    }
    for (i = 0, cbufl = 0; i < th_size; i++)
        cbufl += sprintf(cbuf + cbufl, " %d->%d|%d->%d",
                         i, pairs[i], pairs[i], pairs[pairs[i]]);
    prndbg(0, "random pairs:%s\n", cbuf);
    /* random targets */
    rnd_tgts = calloc(th_size, sizeof(int));
    for (i = 0, cbufl = 0; i < th_size; i++) {
        rnd_tgts[i] = RND(0, th_size);
        if (rnd_tgts[i] == i) { i--; continue; }
        cbufl += sprintf(cbuf + cbufl, " %d", rnd_tgts[i]);
    }
    prndbg(0, "random targets:%s\n", cbuf);
    /* random one */
    rnd_one = RND(0, th_size);
    prndbg(0, "random one = %d\n", rnd_one);

    assert(ptrs1 = calloc(th_size, sizeof(void *)));
    assert(ptrs2 = calloc(th_size, sizeof(void *)));
#ifdef NOTHREADS
    thread_main((void *)(long)0);
#else
    for (i = 0; i < tpp; i++) THREAD_CREATE(threads + i, thread_main, (void *)(long)i);
    for (i = 0; i < tpp; i++) THREAD_JOIN(threads[i], NULL);
#endif

    MP_BARRIER();
    PRINTF0("Tests Completed\n");

    /* clean up */
#if defined(DEBUG) && defined(LOG2FILE)
    for (i = 0; i < tpp; i++) fclose(dbg[i]);
#endif
    ARMCI_Finalize();
    TH_FINALIZE();
    MP_FINALIZE();

	return 0;
}
示例#30
0
文件: simple.c 项目: v4m4/armci-mpi
int main(int argc, char **argv) {
  int i;
  double **myptrs;
  double t0, t1, tnbget=0, tnbwait=0, t2=0;

  MP_INIT(argc,argv);
  ARMCI_Init();

  MP_PROCS(&nprocs);
  MP_MYID(&me);

  if (nprocs < 2)
    ARMCI_Error("This program requires at least to processes", 1);

  myptrs = (double **)malloc(sizeof(double *)*nprocs);
  ARMCI_Malloc((void **)myptrs, LOOP*sizeof(double)); 
  
  MP_BARRIER();
  
  if(me == 0) {
    for(i = 0; i < 10; i++) {
      // This is a bug:
      // ARMCI_Get(myptrs[me]+i,myptrs[me+1]+i,sizeof(double),me+1);
      ARMCI_Get(myptrs[me+1]+i, myptrs[me]+i, sizeof(double), me+1);
    }

    t0 = MP_TIMER(); 
    for(i = 0; i < LOOP; i++) {
      // This is a bug:
      // ARMCI_Get(myptrs[me]+i,myptrs[me+1]+i,sizeof(double),me+1);
      ARMCI_Get(myptrs[me+1]+1, myptrs[me]+i, sizeof(double), me+1);
    }
    t1 = MP_TIMER(); 

    printf("\nGet Latency=%lf\n", 1e6*(t1-t0)/LOOP);
    fflush(stdout);

    t1 = t0 = 0;

    for(i = 0; i < LOOP; i++) {
      armci_hdl_t nbh;
      ARMCI_INIT_HANDLE(&nbh);

      t0 = MP_TIMER(); 
      //ARMCI_NbGet(myptrs[me]+i, myptrs[me+1]+i, sizeof(double), me+1, &nbh);
      ARMCI_NbGet(myptrs[me+1]+i, myptrs[me]+i, sizeof(double), me+1, &nbh);
      t1 = MP_TIMER(); 
      ARMCI_Wait(&nbh);
      t2 = MP_TIMER();

      tnbget  += (t1-t0);
      tnbwait += (t2-t1);
    }

    printf("\nNb Get Latency=%lf Nb Wait=%lf\n",1e6*tnbget/LOOP,1e6*tnbwait/LOOP);fflush(stdout);
  }

  else
    sleep(1);

  MP_BARRIER();

  ARMCI_Finalize();
  MP_FINALIZE();

  return 0;
}