示例#1
0
文件: simple.c 项目: arnolda/scafacos
/*
 * Get-latency micro-benchmark.
 *
 * For each of 10 separately allocated ARMCI regions this rank first runs an
 * untimed pass of LOOP blocking gets from rank (me+1)%nprocs, then times
 * LOOP blocking gets, then LOOP non-blocking gets (issue time and wait time
 * are accumulated separately). Results are printed in microseconds per
 * operation. Uses the file-scope `me` and `nprocs`.
 *
 * Returns 0.
 */
int main(int argc, char **argv)
{
int k,i;
double **myptrs[10];
double t0,t1,t2,tnbget,tnbwait;
#if PORTALS
    ARMCI_NetInit();
#endif
    MPI_Init(&argc,&argv);
    MPI_Comm_rank(MPI_COMM_WORLD,&me);
    MPI_Comm_size(MPI_COMM_WORLD,&nprocs);
    /* ARMCI_Init is called twice here and matched by two ARMCI_Finalize calls below */
    ARMCI_Init();
    ARMCI_Init();
    for(k=0;k<10;k++){
      /* table of per-rank base addresses, filled in by ARMCI_Malloc */
      myptrs[k] = (double **)malloc(sizeof(double *)*nprocs);
      if(myptrs[k]==NULL){
        fprintf(stderr,"%d: could not allocate pointer table\n",me);
        MPI_Abort(MPI_COMM_WORLD,1);
      }
      ARMCI_Malloc((void **)myptrs[k],400000*LOOP*sizeof(double)); 
      for(i=0;i<LOOP;i++)myptrs[k][me][i]=me+0.414;
      MPI_Barrier(MPI_COMM_WORLD);
      /* untimed pass over the same gets that are measured below */
      for(i=0;i<LOOP;i++){
        ARMCI_Get(myptrs[k][(me+1)%nprocs]+i,myptrs[k][me]+i,sizeof(double),(me+1)%nprocs);
      }
      /* timed blocking gets */
      t0 = MPI_Wtime(); 
      for(i=0;i<LOOP;i++){
        ARMCI_Get(myptrs[k][(me+1)%nprocs]+i,myptrs[k][me]+i,sizeof(double),(me+1)%nprocs);
      }
      t1 = MPI_Wtime(); 
      printf("\nGet Latency=%lf\n",1e6*(t1-t0)/LOOP);fflush(stdout);
      /* timed non-blocking gets: issue cost and wait cost are summed separately */
      tnbget=0;
      tnbwait=0;
      for(i=0;i<LOOP;i++){
        armci_hdl_t nbh;
        ARMCI_INIT_HANDLE(&nbh);
        t0 = MPI_Wtime(); 
        ARMCI_NbGet(myptrs[k][(me+1)%nprocs]+i,myptrs[k][me]+i,sizeof(double),(me+1)%nprocs,&nbh);
        t1 = MPI_Wtime(); 
        ARMCI_Wait(&nbh);
        t2 = MPI_Wtime();
        tnbget+=(t1-t0);
        tnbwait+=(t2-t1);
      }
      printf("\nNb Get Latency=%lf Nb Wait=%lf\n",1e6*tnbget/LOOP,1e6*tnbwait/LOOP);fflush(stdout);
      MPI_Barrier(MPI_COMM_WORLD);
    }
    for(k=0;k<10;k++){
      ARMCI_Free(myptrs[k][me]);
      /* release the local pointer table as well; it was previously leaked */
      free(myptrs[k]);
    }
    MPI_Barrier(MPI_COMM_WORLD);
    ARMCI_Finalize();
    ARMCI_Finalize();
    MPI_Finalize();
    return 0;
}
示例#2
0
文件: perf.c 项目: arnolda/scafacos
/*
 * Driver for the blocking-communication performance test.
 *
 * Requires between 2 and MAXPROC processes. Runs test_1D() once with result
 * checking enabled, then the timed 1-D and 2-D runs with checking disabled,
 * and finally repeats both transfers with CHECK_RESULT set to verify
 * correctness. Uses the file-scope `me`, `nproc`, `CHECK_RESULT` and
 * `warn_accuracy`.
 */
int main(int argc, char **argv)
{
    ARMCI_NetInit();
    MP_INIT(argc,argv);
    MP_MYID(&me);
    MP_PROCS(&nproc);

    /* refuse to run outside the supported process-count range */
    if (!(nproc >= 2 && nproc <= MAXPROC)) {
        if (me == 0) {
            fprintf(stderr,
                    "USAGE: 2 <= processes < %d - got %d\n", MAXPROC, nproc);
        }
        MP_BARRIER();
        MP_FINALIZE();
        exit(0);
    }

    /* bring up ARMCI */
    ARMCI_Init();

    if (me == 0) {
        printf("\n             Performance of Basic Blocking Communication Operations\n");
    }
    MP_BARRIER();

    /* first contiguous pass, with result checking switched on */
    CHECK_RESULT = 1;
    if (me == 0) {
        printf("\n\t\t\tContiguous Data Transfer\n");
    }
    test_1D();
    CHECK_RESULT = 0;

    /* 1-D (contiguous) transfers */
    if (me == 0) {
        printf("\n\t\t\tContiguous Data Transfer\n");
    }
    test_1D();

    /* 2-D (strided) transfers */
    if (me == 0) {
        printf("\n\t\t\tStrided Data Transfer\n");
    }
    test_2D();

    MP_BARRIER();
    if (me == 0) {
        if (warn_accuracy) {
            printf("\nWARNING: Your timer does not have sufficient accuracy for this test (%d)\n",warn_accuracy);
        }
        printf("\n\n------------ Testing the same data transfer for correctness ----------\n");
        fflush(stdout);
    }

    /* correctness pass: same transfers with checking enabled */
    MP_BARRIER();
    CHECK_RESULT = 1;
    if (me == 0) {
        printf("\n\t\t\tContiguous Data Transfer\n");
    }
    test_1D();
    if (me == 0) {
        printf("OK\n");
    }
    MP_BARRIER();
    if (me == 0) {
        printf("\n\t\t\tStrided Data Transfer\n");
    }
    test_2D();
    if (me == 0) {
        printf("OK\n\n\nTests Completed.\n");
    }
    MP_BARRIER();

    /* shut down */
    ARMCI_Finalize();
    MP_FINALIZE();
    return 0;
}
示例#3
0
/*
 * Driver for the test of ARMCI wrappers to basic message-passing operations.
 *
 * Requires at least 2 processes; otherwise rank 0 prints a usage message and
 * every rank finalizes the message-passing layer and exits with status 0.
 * Uses the file-scope `me` and `nproc`.
 */
int main( int argc, char **argv)
{   
    
  MP_INIT(argc,argv);
  MP_MYID(&me);
  MP_PROCS(&nproc);
    
    if(nproc < 2) {
        /* report the actual process count; the old message printed it as an upper bound */
        if(me == 0)
            fprintf(stderr,
                    "USAGE: 2 <= processes - got %d\n", nproc);
        MP_BARRIER();
        MP_FINALIZE();
        exit(0);
    }

    if(me == 0){
       printf("Test of ARMCI Wrappers to Basic Message Passing Operations\n");
       fflush(stdout);
    }
   
    /* initialize ARMCI */
    ARMCI_Init();

    MP_BARRIER();
   
    TestGlobals();
   
    /* done */
    ARMCI_Finalize();
    MP_FINALIZE();
    return(0);
}   
示例#4
0
/*
 * Driver for the armci_notify test: initializes the message layer and ARMCI,
 * then calls test_notify() for every dimensionality from 1 to MAXDIMS.
 * Uses the file-scope `me` and `nproc`.
 */
int main(int argc, char* argv[])
{
    int ndim;

    armci_msg_init(&argc, &argv);
    nproc = armci_msg_nproc();
    me = armci_msg_me();

    ARMCI_Init();

    armci_msg_barrier();
    if (me == 0) {
        printf("\nTesting armci_notify\n");
        fflush(stdout);
        sleep(1);
    }
    armci_msg_barrier();

    /* one test run per dimensionality */
    ndim = 1;
    while (ndim <= MAXDIMS) {
        test_notify(ndim);
        ndim++;
    }
    armci_msg_barrier();

    ARMCI_Finalize();
    armci_msg_finalize();
    return 0;
}
示例#5
0
/*
 * Driver for the ARMCI lock test: initializes the message-passing layer and
 * ARMCI, runs test_lock(), reports success on rank 0 and shuts down.
 * Uses the file-scope `me` and `nproc`.
 */
int main(int argc, char* argv[])
{
    MP_INIT(argc, argv);
    MP_PROCS(&nproc);
    MP_MYID(&me);

    if(me==0){
       printf("ARMCI test program for lock(%d processes)\n",nproc); 
       fflush(stdout);
       sleep(1);
    }
    
    ARMCI_Init();

    test_lock();

    MP_BARRIER();
    if(me==0){printf("test passed\n"); fflush(stdout);}
    sleep(2);

    MP_BARRIER();
    ARMCI_Finalize();
    MP_FINALIZE();
    return(0);
}
示例#6
0
/*
 * Driver for the 1-D transpose example: brings up the message-passing layer
 * and ARMCI, reports the process count on rank 0, runs TRANSPOSE1D() and
 * shuts everything down again. The MA allocator initialization step is
 * disabled in this version.
 */
int main(int argc, char **argv) {
    int me, nprocs;

    /* Step 1: message-passing library */
    armci_msg_init(&argc, &argv);

    /* Step 2: ARMCI */
    ARMCI_Init();

    nprocs = armci_msg_nproc();
    me     = armci_msg_me();
    if (me == 0) {
        printf("\nUsing %d processes\n\n", nprocs);
        fflush(stdout);
    }

    TRANSPOSE1D();

    if (me == 0) {
        printf("\nTerminating ..\n");
    }
    ARMCI_Finalize();

    armci_msg_finalize();
    return 0;
}
示例#7
0
/*
 * Mutex read-modify-write test.
 *
 * Rank 0 hosts one mutex and one shared int. Every rank performs NITER
 * iterations of lock / get / add ADDIN / put / unlock on that int. Rank 0
 * then reads the value back and compares it with ADDIN*nproc*NITER.
 */
int main(int argc, char ** argv) {
  int    rank, nproc, val, iter;
  void **base_ptrs;

  MPI_Init(&argc, &argv);
  ARMCI_Init();

  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &nproc);

  if (rank == 0) {
    printf("Starting ARMCI mutex read-modify-write test with %d processes\n", nproc);
  }

  base_ptrs = malloc(nproc*sizeof(void*));

  if (rank == 0) {
    /* rank 0 owns the single mutex and the shared int, and zeroes the int */
    ARMCI_Create_mutexes(1);
    ARMCI_Malloc(base_ptrs, sizeof(int));
    val = 0;
    ARMCI_Put(&val, base_ptrs[0], sizeof(int), 0);
  } else {
    /* all other ranks contribute no mutexes and no memory */
    ARMCI_Create_mutexes(0);
    ARMCI_Malloc(base_ptrs, 0);
  }

  ARMCI_Barrier();

  /* guarded increment of the shared int on rank 0 */
  for (iter = 0; iter < NITER; iter++) {
    ARMCI_Lock(0, 0);

    ARMCI_Get(base_ptrs[0], &val, sizeof(int), 0);
    val += ADDIN;
    ARMCI_Put(&val, base_ptrs[0], sizeof(int), 0);

    ARMCI_Unlock(0, 0);
  }

  printf(" + %3d done\n", rank);
  fflush(NULL);

  ARMCI_Barrier();

  if (rank == 0) {
    ARMCI_Get(base_ptrs[0], &val, sizeof(int), 0);

    if (val != ADDIN*nproc*NITER) {
      printf("Test complete: FAIL.  Got %d, expected %d.\n", val, ADDIN*nproc*NITER);
    } else {
      printf("Test complete: PASS.\n");
    }
  }

  ARMCI_Free(base_ptrs[rank]);
  ARMCI_Destroy_mutexes();
  free(base_ptrs);

  ARMCI_Finalize();
  MPI_Finalize();

  return 0;
}
示例#8
0
/*
 * Issues a single 1-byte ARMCI_Get on process 0 with NULL source and
 * destination pointers between init and finalize.
 * NOTE(review): presumably exercises the library's handling of invalid
 * addresses -- confirm the intended outcome against the test harness.
 */
int main(int argc, char ** argv) {
  void *src = NULL;
  void *dst = NULL;
  const int nbytes = 1;
  const int target = 0;

  MPI_Init(&argc, &argv);
  ARMCI_Init();

  ARMCI_Get(src, dst, nbytes, target);

  ARMCI_Finalize();
  MPI_Finalize();

  return 0;
}
示例#9
0
/*
 * Driver for the IPC test: sets up MPI and ARMCI, runs test(), and tears
 * both down. Uses the file-scope `me` and `nproc`.
 */
int main(int argc, char** argv)
{
    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &nproc);
    MPI_Comm_rank(MPI_COMM_WORLD, &me);

    /* banner is printed before ARMCI is initialized */
    if (me == 0) {
        printf("Testing IPCs (%d MPI processes)\n\n",nproc);
    }

    ARMCI_Init();
    test();
    ARMCI_Finalize();

    MPI_Finalize();
    return 0;
}
示例#10
0
/*
 * Test of the integer global operations "absmin" and "absmax".
 *
 * Every rank fills a DATA_SZ-element buffer with +/-(rank+1), with the sign
 * alternating by index, runs armci_msg_igop on it and checks that each
 * element equals 1 (absmin) or nproc (absmax). Any mismatch, or a failed
 * buffer allocation, aborts through ARMCI_Error.
 */
int main(int argc, char ** argv) {
  int    rank, nproc, i;
  int   *buf;

  MPI_Init(&argc, &argv);
  ARMCI_Init();

  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &nproc);

  if (rank == 0) printf("Starting ARMCI GOP test with %d processes\n", nproc);

  buf = malloc(DATA_SZ*sizeof(int));
  /* the buffer is written immediately below, so fail loudly if it is missing */
  if (buf == NULL) {
    printf("Err: could not allocate buffer\n");
    ARMCI_Error("Fail", 1);
  }

  if (rank == 0) printf(" - Testing ABSMIN\n");

  for (i = 0; i < DATA_SZ; i++)
    buf[i] = (rank+1) * ((i % 2) ? -1 : 1);

  armci_msg_igop(buf, DATA_SZ, "absmin");

  /* the smallest magnitude contributed by any rank is rank 0's value of 1 */
  for (i = 0; i < DATA_SZ; i++)
    if (buf[i] != 1) {
      printf("Err: buf[%d] = %d expected 1\n", i, buf[i]);
      ARMCI_Error("Fail", 1);
    }

  if (rank == 0) printf(" - Testing ABSMAX\n");

  for (i = 0; i < DATA_SZ; i++)
    buf[i] = (rank+1) * ((i % 2) ? -1 : 1);

  armci_msg_igop(buf, DATA_SZ, "absmax");

  /* the largest magnitude contributed is the last rank's value of nproc */
  for (i = 0; i < DATA_SZ; i++)
    if (buf[i] != nproc) {
      printf("Err: buf[%d] = %d expected %d\n", i, buf[i], nproc);
      ARMCI_Error("Fail", 1);
    }

  free(buf);

  if (rank == 0) printf("Pass.\n");

  ARMCI_Finalize();
  MPI_Finalize();

  return 0;
}
示例#11
0
/*
 * Driver for the ARMCI groups test: runs test_groups() and, when
 * ARMCI_GROUP is defined, test_groups_noncollective(). Rank 0 calls
 * ARMCI_Error when the process count is outside [MINPROC, MAXPROC].
 * Uses the file-scope `me` and `nproc`.
 */
int main(int argc, char* argv[])
{
    MP_INIT(argc, argv);
    MP_PROCS(&nproc);
    MP_MYID(&me);

    if (me == 0 && (nproc < MINPROC || nproc > MAXPROC)) {
        ARMCI_Error("Test works for up to %d processors\n",MAXPROC);
    }

    if (me == 0) {
        printf("ARMCI test program (%d processes)\n",nproc); 
        fflush(stdout);
        sleep(1);
    }

    ARMCI_Init();

    if (me == 0) {
        printf("\n Testing ARMCI Groups!\n\n");
        fflush(stdout);
    }

    /* collective group creation */
    test_groups();

    ARMCI_AllFence();
    MP_BARRIER();
    if (me == 0) {
        printf("\n Collective groups: Success!!\n");
        fflush(stdout);
    }
    sleep(2);

#ifdef ARMCI_GROUP
    /* non-collective group creation */
    test_groups_noncollective();

    ARMCI_AllFence();
    MP_BARRIER();
    if (me == 0) {
        printf("\n Non-collective groups: Success!!\n");
        fflush(stdout);
    }
    sleep(2);
#endif

    MP_BARRIER();
    ARMCI_Finalize();
    MP_FINALIZE();
    return 0;
}
示例#12
0
/*
 * Driver for the put/get/acc timing test: requires between 2 and MAXPROC
 * processes, then calls test_perf_nb(1) followed by test_perf_nb(0).
 * Uses the file-scope `me` and `nproc`.
 */
int main(int argc, char* argv[])
{
    ARMCI_NetInit();

    MP_INIT(argc, argv);
    MP_PROCS(&nproc);
    MP_MYID(&me);

    /* refuse to run outside the supported process-count range */
    if (!(nproc >= 2 && nproc <= MAXPROC)) {
        if (me == 0) {
            fprintf(stderr,
                    "USAGE: 2 <= processes < %d - got %d\n", MAXPROC, nproc);
        }
        MP_BARRIER();
        MP_FINALIZE();
        exit(0);
    }

    if (me == 0) {
        printf("ARMCI test program (%d processes)\n",nproc); 
        fflush(stdout);
        sleep(1);
    }

    ARMCI_Init();

    if (me == 0) {
        printf("\n put/get/acc requests (Time in secs)\n\n");
        fflush(stdout);
    }

    test_perf_nb(1);
    test_perf_nb(0);

    ARMCI_AllFence();
    MP_BARRIER();
    if (me == 0) {
        printf("\nSuccess!!\n");
        fflush(stdout);
    }
    sleep(2);

    MP_BARRIER();
    ARMCI_Finalize();
    MP_FINALIZE();
    return 0;
}
示例#13
0
文件: put.c 项目: brog2610/quinoa
/*
 * Put timing loop: image 0 puts buffers of doubling size (1 byte up to
 * MAX_BUF_SIZE) to image 1, `iter` times per size, fences, and prints the
 * elapsed wall time for each size.
 *
 * Every image allocates the shared buffer and takes part in both barriers.
 * Only image 0 issues puts, and it always targets image 1.
 */
int main(int argc, char * argv[]) {
  void *baseAddress[MAX_PROCESSORS];
  char *local;
  int thisImage;

  int iter = 100, size;
  double startTime, endTime;
  int i;

  // initialize
  ARMCI_Init();
  ARMCI_Myid(&thisImage);

  // allocate data (collective operation)
  ARMCI_Malloc(baseAddress, MAX_BUF_SIZE*sizeof(char));
  local = (char *)ARMCI_Malloc_local(MAX_BUF_SIZE*sizeof(char));

  ARMCI_Barrier();
  ARMCI_Migrate();

  if (thisImage == 0) {
    for(size = 1; size <= MAX_BUF_SIZE; size = size<<1){
      startTime = CkWallTimer();
      for(i = 0; i < iter; i++){
        ARMCI_Put(local, baseAddress[1], size, 1);
      }
      ARMCI_Fence(1);
      endTime = CkWallTimer();
      // NOTE(review): prints elapsed*1000 for all `iter` puts but labels it
      // "us"; confirm CkWallTimer units and whether per-put time was intended.
      printf("%d: %f us\n", size, (endTime-startTime)*1000);
    }
  }

  // Every image joins this barrier. It used to be reached only by images 0
  // and 1, so any additional image skipped a collective call.
  ARMCI_Barrier();

  ARMCI_Free(baseAddress[thisImage]);
  ARMCI_Free_local(local);
  // finalize
  ARMCI_Finalize();
  return 0;
}
示例#14
0
/*
 * Memory allocation test: performs NUM_ITERATIONS collective ARMCI_Malloc
 * calls (every fourth one, starting with the first, requests 0 bytes and
 * the rest request DATA_SZ), then frees them all in allocation order.
 */
int main(int argc, char ** argv) {
  int     rank, nproc, iter;
  void ***base_ptrs;

  MPI_Init(&argc, &argv);
  ARMCI_Init();

  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &nproc);

  if (rank == 0) {
    printf("Starting ARMCI memory allocation test with %d processes\n", nproc);
  }

  /* one table of per-rank base pointers for each allocation */
  base_ptrs = malloc(sizeof(void**)*NUM_ITERATIONS);

  /* allocation phase */
  iter = 0;
  while (iter < NUM_ITERATIONS) {
    if (rank == 0) {
      printf(" + allocation %d\n", iter);
    }

    base_ptrs[iter] = malloc(sizeof(void*)*nproc);
    ARMCI_Malloc((void**)base_ptrs[iter], (iter % 4 != 0) ? DATA_SZ : 0);
    iter++;
  }

  ARMCI_Barrier();

  /* release phase: the ARMCI segment first, then the local pointer table */
  iter = 0;
  while (iter < NUM_ITERATIONS) {
    void **table = (void**)base_ptrs[iter];

    if (rank == 0) {
      printf(" + free %d\n", iter);
    }

    ARMCI_Free(table[rank]);
    free(base_ptrs[iter]);
    iter++;
  }

  free(base_ptrs);

  if (rank == 0) {
    printf("Test complete: PASS.\n");
  }

  ARMCI_Finalize();
  MPI_Finalize();

  return 0;
}
/*
 * Driver for the sparse matrix-vector multiplication test: initializes the
 * message layer and ARMCI, runs test_sparse(), fences, and reports success
 * on rank 0. Rank 0 calls ARMCI_Error when more than MAXPROC processes are
 * used. Uses the file-scope `me` and `nproc`.
 */
int main(int argc, char* argv[])
{
    armci_msg_init(&argc, &argv);
    nproc = armci_msg_nproc();
    me = armci_msg_me();

    if (me == 0 && nproc > MAXPROC) {
        ARMCI_Error("Test works for up to %d processors\n",MAXPROC);
    }

    if (me == 0) {
        printf("ARMCI test program (%d processes)\n",nproc);
        fflush(stdout);
        sleep(1);
    }

    ARMCI_Init();

    if (me == 0) {
        printf("\n  Performing Sparse Matrix-Vector Multiplication ...\n\n");
        fflush(stdout);
    }
    test_sparse();

    /* fence and synchronize before announcing success */
    ARMCI_AllFence();
    armci_msg_barrier();
    if (me == 0) {
        printf("\nSuccess!!\n");
        fflush(stdout);
    }
    sleep(2);

    armci_msg_barrier();
    ARMCI_Finalize();
    armci_msg_finalize();
    return 0;
}
示例#16
0
/*
 * Driver for the aggregate put/get test: calls test_aggregate(1) (cold
 * start) and then test_aggregate(0) (warm start). Rank 0 calls ARMCI_Error
 * when more than MAXPROC processes are used. Uses the file-scope `me` and
 * `nproc`.
 */
int main(int argc, char* argv[])
{
    MP_INIT(argc, argv);
    MP_PROCS(&nproc);
    MP_MYID(&me);

    if (me == 0 && nproc > MAXPROC) {
        ARMCI_Error("Test works for up to %d processors\n",MAXPROC);
    }

    if (me == 0) {
        printf("ARMCI test program (%d processes)\n",nproc); 
        fflush(stdout);
        sleep(1);
    }

    ARMCI_Init();

    if (me == 0) {
        printf("\nAggregate put/get requests\n\n");
        fflush(stdout);
    }
    test_aggregate(1); /* cold start */
    test_aggregate(0); /* warm start */

    /* fence and synchronize before announcing success */
    ARMCI_AllFence();
    MP_BARRIER();
    if (me == 0) {
        printf("\nSuccess!!\n");
        fflush(stdout);
    }
    sleep(2);

    MP_BARRIER();
    ARMCI_Finalize();
    MP_FINALIZE();
    return 0;
}
示例#17
0
/*
 * Multi-threaded blocked dense LU factorization driver.
 *
 * Command-line options: -n matrix size, -b block size, -p process count
 * override, -t threads per process, -d seconds to sleep after printing the
 * pid, -h usage. The matrix blocks are distributed over
 * nthreads = th_per_p * nproc logical threads arranged in a
 * num_rows x num_cols grid. Each process allocates the storage for its own
 * threads' blocks through ARMCI_Malloc, builds the global block address
 * table `a`, runs lu() in th_per_p threads and reports the elapsed time on
 * rank 0.
 *
 * Relies on file-scope state (n, block_size, nproc, me, th_per_p, d,
 * nthreads, num_rows, num_cols, nblocks, num, me_th, thread_doubles,
 * proc_bytes, a, doprint, threads, mutex).
 *
 * Returns 0.
 */
int main(int argc, char *argv[])
{
    int i, j, l;
    int ch;
    extern char *optarg;
    int edge;
    int size;
    int lu_arg[MAX_THREADS][3];
    
    /* ARMCI */
    void **ptr;
    double **ptr_loc;

    THREAD_LOCK_INIT(mutex);
    
    armci_msg_init(&argc,&argv);
    nproc = armci_msg_nproc();
    me = armci_msg_me();
    
    while ((ch = getopt(argc, argv, "n:b:p:t:d:h")) != -1) {
        switch(ch) {
            case 'n': n = atoi(optarg); break;
            case 'b': block_size = atoi(optarg); break;
            case 'p': nproc = atoi(optarg); break;
            case 't': th_per_p = atoi(optarg); break;
            case 'd': d = atoi(optarg); break;
            case 'h': {
                printf("Usage: LU, or \n");
                printf("       LU -nMATRIXSIZE -bBLOCKSIZE -pNPROC -tTH_PER_P\n");
                armci_msg_barrier();
                armci_msg_finalize();
                exit(0);
            } 
        }
    }

    /* lu_arg and the thread tables are sized for at most MAX_THREADS threads */
    if(th_per_p>MAX_THREADS) {
        th_per_p=MAX_THREADS;
        if(me==0)printf("Warning: cannot run more than %d threads, adjust MAX_THREADS",MAX_THREADS);
    }

    /* optional delay after printing the pid */
    if (d) {
        fprintf(stderr, "%d: %d\n", me, getpid());
        sleep(d);
    }

    nthreads = th_per_p * nproc;
    if(me == 0) {
        printf("\n Blocked Dense LU Factorization\n");
        printf("     %d by %d Matrix\n", n, n);
        printf("     %d Processors\n", nproc);
        printf("     %d thread(s) per processor, %d threads total\n", th_per_p, nthreads);
        printf("     %d by %d Element Blocks\n", block_size, block_size);
        printf("\n");
    }
    
    /* largest num_rows <= sqrt(nthreads) that divides nthreads exactly */
    num_rows = (int) sqrt((double) nthreads);
    for (;;) {
        num_cols = nthreads/num_rows;
        if (num_rows*num_cols == nthreads)
            break;
        num_rows--;
    }
    
    /* number of blocks per dimension, rounded up */
    nblocks = n/block_size;
    if (block_size * nblocks != n) {
        nblocks++;
    }
    
    /* blocks per thread, rounded up */
    num = (nblocks * nblocks)/nthreads;
    if((num * nthreads) != (nblocks * nblocks))
        num++;

    /* extent of the last (possibly partial) block in each dimension */
    edge = n%block_size;
    if (edge == 0) {
        edge = block_size;
    }
#ifdef DEBUG
    if(me == 0)
        for (i=0;i<nblocks;i++) {
            for (j=0;j<nblocks;j++) 
                printf("%d ", block_owner(i, j));
            printf("\n");
        }
    armci_msg_barrier();
/*    armci_msg_finalize(); */
/*    exit(0); */
#endif
    
    /* count the doubles owned by each local thread and the bytes this process needs */
    for (l = 0; l < th_per_p; l++) {
        me_th[l] = me * th_per_p + l;
        for (i=0;i<nblocks;i++) {
            for (j=0;j<nblocks;j++) {
                if(block_owner(i,j) == me_th[l]) {
                    if ((i == nblocks-1) && (j == nblocks-1)) {
                        size = edge*edge;
                    }
                    else if ((i == nblocks-1) || (j == nblocks-1)) {
                        size = edge*block_size;
                    }
                    else {
                        size = block_size*block_size;
                    }
                    thread_doubles[l] += size;
                }
            }
        }
        proc_bytes += thread_doubles[l] * sizeof(double);
    }

    /* initialize ARMCI */
    ARMCI_Init();
    ptr = (void **)malloc(nproc * sizeof(void *));
    ARMCI_Malloc(ptr, proc_bytes);

    a = (double **)malloc(nblocks*nblocks*sizeof(double *));
    if (a == NULL) {
        fprintf(stderr, "Could not malloc memory for a\n");
        exit(-1);
    }
    /* ptr_loc[t] is the next free position in logical thread t's storage */
    ptr_loc = (double **)malloc(nthreads*sizeof(double *));
    for (i = 0; i < nproc; i++) {
        ptr_loc[i * th_per_p] = (double *)ptr[i];
        for (j = 1; j < th_per_p; j++)
            ptr_loc[i * th_per_p + j] = ptr_loc[i * th_per_p + j - 1] + thread_doubles[j - 1];
    }
    /* hand out block addresses in the same (i, j) order on every process */
    for(i=0; i<nblocks;i ++) {
        for(j=0; j<nblocks; j++) {
            a[i+j*nblocks] = ptr_loc[block_owner(i, j)];
            if ((i == nblocks-1) && (j == nblocks-1)) {
                size = edge*edge;
            } else if ((i == nblocks-1) || (j == nblocks-1)) {
                size = edge*block_size;
            } else {
                size = block_size*block_size;
            }
            ptr_loc[block_owner(i, j)] += size;
        }
    }
#if 0
    for(i=0; i<nblocks*nblocks;i ++) printf("%d: a[%d]=%p\n", me, i, a[i]);
    fflush(stdout);
#endif
    
    /* initialize the array */
    init_array();
    
    /* barrier to ensure all initialization is done */
    armci_msg_barrier();

    /* to remove cold-start misses, all processors touch their own data */
/*    for (l = 0; l < th_per_p; l++) touch_array(block_size, me_th[l]); */
    armci_msg_barrier();

    if(doprint) {
        if(me == 0) {
            printf("Matrix before LU decomposition\n");
            print_array(me); 
        }
        armci_msg_barrier();
    }

#if 1
    for (i = 0; i < nblocks; i++)
        for (j = 0; j < nblocks; j++)
            print_block_dbg(a[i + j * nblocks], "proc %d, a[%d, %d]:\n", me, i, j);
#endif

    TH_INIT(nproc,th_per_p);

    /* Starting the timer */
    if(me == 0) start_timer();

    /* each thread receives {n, block_size, local thread index} */
    for (l = 0; l < th_per_p; l++) {
        lu_arg[l][0] = n;
        lu_arg[l][1] = block_size;
        lu_arg[l][2] = l;
        THREAD_CREATE(threads + l, lu, lu_arg[l]);
    }
    
    for (l = 0; l < th_per_p; l++) THREAD_JOIN(threads[l], NULL);
    armci_msg_barrier();

    /* Timer Stops here */
    if(me == 0) 
        printf("\nRunning time = %lf milliseconds.\n\n",  elapsed_time());

    if(doprint) {        
        if(me == 0) {
            printf("after LU\n");
            print_array(me);
        }
        armci_msg_barrier();
    }
    
    /* done */
    ARMCI_Free(ptr[me]);
    ARMCI_Finalize();
    armci_msg_finalize();

    THREAD_LOCK_DESTROY(mutex);
    return 0;
}
示例#18
0
文件: lu.c 项目: dmlb2000/nwchem-cml
/*
 * Blocked dense LU factorization driver (one-sided get version).
 *
 * Command-line options: -n matrix size, -b block size, -p process count
 * override, -h usage. Blocks are distributed over a num_rows x num_cols
 * process grid. Each process allocates the storage for its own blocks
 * (through an MPI-2 window when MPI2_ONESIDED is defined, otherwise through
 * ARMCI_Malloc), builds the global block address table `a`, runs lu() once
 * untimed and then nloop timed repetitions, and prints the average running
 * time plus per-process get statistics.
 *
 * Relies on file-scope state (n, block_size, nproc, me, num_rows, num_cols,
 * nblocks, proc_bytes, ptr, a, doprint, get_cntr, comm_time, and win under
 * MPI2_ONESIDED).
 *
 * Returns 0.
 */
int main(int argc, char *argv[])
{
    int i, j;
    int ch;
    extern char *optarg;
    int edge;
    int size;
    int nloop=5;
    double **ptr_loc;

    /* pass the real argument count; this previously named a non-existent `arc` */
    MP_INIT(argc,argv);
    MP_PROCS(&nproc);
    MP_MYID(&me);

    while ((ch = getopt(argc, argv, "n:b:p:h")) != -1) {
        switch(ch) {
        case 'n':
            n = atoi(optarg);
            break;
        case 'b':
            block_size = atoi(optarg);
            break;
        case 'p':
            nproc = atoi(optarg);
            break;
        case 'h': {
            printf("Usage: LU, or \n");
            printf("       LU -nMATRIXSIZE -bBLOCKSIZE -pNPROC\n");
            MP_BARRIER();
            MP_FINALIZE();
            exit(0);
        }
        }
    }

    if(me == 0) {
        printf("\n Blocked Dense LU Factorization\n");
        printf("     %d by %d Matrix\n", n, n);
        printf("     %d Processors\n", nproc);
        printf("     %d by %d Element Blocks\n", block_size, block_size);
        printf("\n");
    }

    /* largest num_rows <= sqrt(nproc) that divides nproc exactly */
    num_rows = (int) sqrt((double) nproc);
    for (;;) {
        num_cols = nproc/num_rows;
        if (num_rows*num_cols == nproc)
            break;
        num_rows--;
    }

    /* number of blocks per dimension, rounded up */
    nblocks = n/block_size;
    if (block_size * nblocks != n) {
        nblocks++;
    }

    /* extent of the last (possibly partial) block in each dimension */
    edge = n%block_size;
    if (edge == 0) {
        edge = block_size;
    }

#ifdef DEBUG
    /* debug builds only print the block ownership map and exit */
    if(me == 0)
        for (i=0; i<nblocks; i++) {
            for (j=0; j<nblocks; j++)
                printf("%d ", block_owner(i, j));
            printf("\n");
        }
    MP_BARRIER();
    MP_FINALIZE();
    exit(0);
#endif

    /* total bytes needed for the blocks owned by this process */
    for (i=0; i<nblocks; i++) {
        for (j=0; j<nblocks; j++) {
            if(block_owner(i,j) == me) {
                if ((i == nblocks-1) && (j == nblocks-1)) {
                    size = edge*edge;
                }
                else if ((i == nblocks-1) || (j == nblocks-1)) {
                    size = edge*block_size;
                }
                else {
                    size = block_size*block_size;
                }
                proc_bytes += size*sizeof(double);
            }
        }
    }

    ptr = (void **)malloc(nproc * sizeof(void *));
#ifdef MPI2_ONESIDED
    MPI_Alloc_mem(proc_bytes, MPI_INFO_NULL, &ptr[me]);
    MPI_Win_create((void*)ptr[me], proc_bytes, 1, MPI_INFO_NULL,
                   MPI_COMM_WORLD, &win);
    for(i=0; i<nproc; i++) ptr[i] = (double *)ptr[me];
    MPI_Barrier(MPI_COMM_WORLD);

#else
    /* initialize ARMCI */
    ARMCI_Init();
    ARMCI_Malloc(ptr, proc_bytes);
#endif

    a = (double **)malloc(nblocks*nblocks*sizeof(double *));
    if (a == NULL) {
        fprintf(stderr, "Could not malloc memory for a\n");
        exit(-1);
    }
    /* ptr_loc[p] is the next free position in process p's storage */
    ptr_loc = (double **)malloc(nproc*sizeof(double *));
    for(i=0; i<nproc; i++) ptr_loc[i] = (double *)ptr[i];
    for(i=0; i<nblocks; i ++) {
        for(j=0; j<nblocks; j++) {
            a[i+j*nblocks] = ptr_loc[block_owner(i, j)];
            if ((i == nblocks-1) && (j == nblocks-1)) {
                size = edge*edge;
            } else if ((i == nblocks-1) || (j == nblocks-1)) {
                size = edge*block_size;
            } else {
                size = block_size*block_size;
            }
            ptr_loc[block_owner(i, j)] += size;
        }
    }

    /* initialize the array */
    init_array();

    /* barrier to ensure all initialization is done */
    MP_BARRIER();

    /* to remove cold-start misses, all processors touch their own data */
    touch_array(block_size, me);
    MP_BARRIER();

    if(doprint) {
        if(me == 0) {
            printf("Matrix before LU decomposition\n");
            print_array(me);
        }
        MP_BARRIER();
    }

    lu(n, block_size, me); /* cold start */

    /* Starting the timer */

    MP_BARRIER();
    if(me == 0) start_timer();
    for(i=0; i<nloop; i++) lu(n, block_size, me);
    MP_BARRIER();

    /* Timer Stops here */
    if(me == 0)
        printf("\nRunning time = %lf milliseconds.\n\n",  elapsed_time()/nloop);
    printf("%d: (ngets=%d) Communication (get) time = %e milliseconds\n", me, get_cntr, comm_time*1000/nloop);

    if(doprint) {
        if(me == 0) {
            printf("after LU\n");
            print_array(me);
        }
        MP_BARRIER();
    }

    /* done */
#ifdef MPI2_ONESIDED
    MPI_Win_free(&win);
    MPI_Free_mem(ptr[me]);
#else
    ARMCI_Free(ptr[me]);
    ARMCI_Finalize();
#endif
    MP_FINALIZE();
    return 0;
}
示例#19
0
/*
 * Strided accumulate test.
 *
 * Every rank initializes an XDIM x YDIM buffer and a source buffer to
 * (1 + rank), then accumulates its source buffer, scaled by 2.0, into the
 * next rank's buffer ITERATIONS times with ARMCI_AccS. Afterwards each rank
 * checks that every element of its own buffer equals
 *   (1 + rank) + scaling * (1 + previous_rank) * ITERATIONS
 * to within 1e-10 in either direction.
 *
 * Returns 0 when no rank saw an error, 1 otherwise.
 */
int main(int argc, char **argv) {
    int i, j, rank, nranks, peer, bufsize, errors, total_errors;
    double **buf_bvec, **src_bvec, *src_buf;
    int count[2], src_stride, trg_stride, stride_level;
    double scaling, time;

    MPI_Init(&argc, &argv);
    ARMCI_Init();

    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nranks);

    buf_bvec = (double **) malloc(sizeof(double *) * nranks);
    src_bvec = (double **) malloc(sizeof(double *) * nranks);

    bufsize = XDIM * YDIM * sizeof(double);
    ARMCI_Malloc((void **) buf_bvec, bufsize);
    ARMCI_Malloc((void **) src_bvec, bufsize);
    src_buf = src_bvec[rank];

    if (rank == 0)
        printf("ARMCI Strided DLA Accumulate Test:\n");

    ARMCI_Access_begin(buf_bvec[rank]);
    ARMCI_Access_begin(src_buf);

    for (i = 0; i < XDIM*YDIM; i++) {
        *(buf_bvec[rank] + i) = 1.0 + rank;
        *(src_buf + i) = 1.0 + rank;
    }

    ARMCI_Access_end(src_buf);
    ARMCI_Access_end(buf_bvec[rank]);

    scaling = 2.0;

    /* one stride level: YDIM rows of XDIM doubles each */
    src_stride = XDIM * sizeof(double);
    trg_stride = XDIM * sizeof(double);
    stride_level = 1;

    count[1] = YDIM;
    count[0] = XDIM * sizeof(double);

    ARMCI_Barrier();
    time = MPI_Wtime();

    peer = (rank+1) % nranks;

    for (i = 0; i < ITERATIONS; i++) {

      ARMCI_AccS(ARMCI_ACC_DBL,
          (void *) &scaling,
          src_buf,
          &src_stride,
          (void *) buf_bvec[peer],
          &trg_stride,
          count,
          stride_level,
          peer);
    }

    ARMCI_Barrier();
    time = MPI_Wtime() - time;

    if (rank == 0) printf("Time: %f sec\n", time);

    ARMCI_Access_begin(buf_bvec[rank]);
    for (i = errors = 0; i < XDIM; i++) {
      for (j = 0; j < YDIM; j++) {
        const double actual   = *(buf_bvec[rank] + i + j*XDIM);
        const double expected = (1.0 + rank) + scaling * (1.0 + ((rank+nranks-1)%nranks)) * (ITERATIONS);
        /* compare in both directions; a one-sided test lets too-small results pass */
        if (actual - expected > 1e-10 || expected - actual > 1e-10) {
          printf("%d: Data validation failed at [%d, %d] expected=%f actual=%f\n",
              rank, j, i, expected, actual);
          errors++;
          fflush(stdout);
        }
      }
    }
    ARMCI_Access_end(buf_bvec[rank]);

    MPI_Allreduce(&errors, &total_errors, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);

    ARMCI_Free((void *) buf_bvec[rank]);
    ARMCI_Free((void *) src_bvec[rank]);

    free(buf_bvec);
    free(src_bvec);

    ARMCI_Finalize();
    MPI_Finalize();

    if (total_errors == 0) {
      if (rank == 0) printf("Success.\n");
      return 0;
    } else {
      if (rank == 0) printf("Fail.\n");
      return 1;
    }
}
示例#20
0
文件: test2.c 项目: arnolda/scafacos
/*
 * Driver for the accumulate-type and collective-type tests.
 *
 * Runs test_acc_type() for the INT, LNG, FLT, DBL, CPL and DCP accumulate
 * types, then test_collective() for the INT, LONG, FLOAT and DOUBLE types,
 * timing each group with gettimeofday(). Rank 0 prints a CSV-style runtime
 * summary at the end. Uses the file-scope `me` and `nproc`.
 */
int main(int argc, char* argv[])
{
    int idx;
    /* the timing arrays keep 14 slots although only two groups are defined below */
    struct timeval start_time[14];
    struct timeval stop_time[14];
    char * test_name[2] = { "acc_type", "collective" };
    int test_flags[2]   = { 1, 1 };

#define TEST_ACC_TYPE   0
#define TEST_COLLECTIVE 1

    MP_INIT(argc, argv);
    ARMCI_Init();
    MP_PROCS(&nproc);
    MP_MYID(&me);

    if (me == 0 && nproc > MAXPROC) {
        ARMCI_Error("Test works for up to %d processors\n",MAXPROC);
    }

    if (me == 0) {
        printf("ARMCI test program (%d processes)\n",nproc); 
        fflush(stdout);
        sleep(1);
    }

    /* ---- accumulate types ---- */
    gettimeofday(&start_time[TEST_ACC_TYPE],NULL);
    if (test_flags[TEST_ACC_TYPE] == 1) {
        if (me == 0) {
            printf("\nTesting Accumulate Types\n");
            fflush(stdout);
        }

        MP_BARRIER();
        if (me == 0) {
            printf("Test Accumulate ARMCI_ACC_INT\n");
            fflush(stdout);
        }
        test_acc_type(ARMCI_ACC_INT);
        ARMCI_AllFence();
        MP_BARRIER();

        if (me == 0) {
            printf("Test Accumulate ARMCI_ACC_LNG\n");
            fflush(stdout);
        }
        test_acc_type(ARMCI_ACC_LNG);
        ARMCI_AllFence();
        MP_BARRIER();

        if (me == 0) {
            printf("Test Accumulate ARMCI_ACC_FLT\n");
            fflush(stdout);
        }
        test_acc_type(ARMCI_ACC_FLT);
        ARMCI_AllFence();
        MP_BARRIER();

        if (me == 0) {
            printf("Test Accumulate ARMCI_ACC_DBL\n");
            fflush(stdout);
        }
        test_acc_type(ARMCI_ACC_DBL);
        ARMCI_AllFence();
        MP_BARRIER();

        if (me == 0) {
            printf("Test Accumulate ARMCI_ACC_CPL\n");
            fflush(stdout);
        }
        test_acc_type(ARMCI_ACC_CPL);
        ARMCI_AllFence();
        MP_BARRIER();

        if (me == 0) {
            printf("Test Accumulate ARMCI_ACC_DCP\n");
            fflush(stdout);
        }
        test_acc_type(ARMCI_ACC_DCP);
        ARMCI_AllFence();
        MP_BARRIER();
    }
    gettimeofday(&stop_time[TEST_ACC_TYPE],NULL);

    /* ---- collective types ---- */
    gettimeofday(&start_time[TEST_COLLECTIVE],NULL);
    if (test_flags[TEST_COLLECTIVE] == 1) {
        if (me == 0) {
            printf("\nTesting Collective Types\n");
            fflush(stdout);
        }
        if (me == 0) {
            printf("Test Collective ARMCI_INT\n");
            fflush(stdout);
        }
        MP_BARRIER();
        test_collective(ARMCI_INT);
        MP_BARRIER();

        if (me == 0) {
            printf("Test Collective ARMCI_LONG\n");
            fflush(stdout);
        }
        MP_BARRIER();
        test_collective(ARMCI_LONG);
        MP_BARRIER();

        if (me == 0) {
            printf("Test Collective ARMCI_FLOAT\n");
            fflush(stdout);
        }
        MP_BARRIER();
        test_collective(ARMCI_FLOAT);
        MP_BARRIER();

        if (me == 0) {
            printf("Test Collective ARMCI_DOUBLE\n");
            fflush(stdout);
        }
        MP_BARRIER();
        test_collective(ARMCI_DOUBLE);
        MP_BARRIER();
    }
    gettimeofday(&stop_time[TEST_COLLECTIVE],NULL);

    if (me == 0) {
        printf("Accumulate and Collective tests passed\n");
        fflush(stdout);
    }

    /* runtime summary, one line per enabled test group */
    if (me == 0) {
        printf("Testcase runtime\n");
        printf("Name,Time(seconds)\n");
        for (idx = 0; idx < 2; idx++) {
            double time_spent;

            if (test_flags[idx] != 1) {
                continue;
            }
            time_spent = (stop_time[idx].tv_sec - start_time[idx].tv_sec) + ((double) stop_time[idx].tv_usec - start_time[idx].tv_usec) / 1E6;
            printf("%s,%.6f\n", test_name[idx], time_spent);
        }
    }

    MP_BARRIER();
    ARMCI_Finalize();
    MP_FINALIZE();
    return 0;
}
示例#21
0
/*
 * Blocked dense LU factorization driver ("pre-PUTing" version).
 *
 * Command-line options: -n matrix size, -b block size, -p process count
 * override, -h usage. Blocks are distributed over a num_rows x num_cols
 * process grid. Besides the matrix storage, every process allocates two
 * ARMCI regions of nblocks*block_size*block_size doubles; bufr and bufc are
 * filled with per-process, per-block addresses into those regions. Then
 * lu() is run once and timed on rank 0.
 *
 * Relies on file-scope state (n, block_size, nproc, me, num_rows, num_cols,
 * nblocks, proc_bytes, a, bufr, bufc, doprint).
 *
 * Returns 0.
 */
int main(int argc, char *argv[])
{
  int i, j;
  int ch;
  extern char *optarg;
  int edge;
  int size;
    
  /* ARMCI */
  void **ptr;
  double **ptr_loc;
  void **bufr_g, **bufc_g;

  /* pass the real argument count; this previously named a non-existent `arc` */
  MP_INIT(argc,argv);
  MP_PROCS(&nproc);
  MP_MYID(&me);
    
  while ((ch = getopt(argc, argv, "n:b:p:h")) != -1) {
    switch(ch) {
    case 'n': n = atoi(optarg); break;
    case 'b': block_size = atoi(optarg); break;
    case 'p': nproc = atoi(optarg); break;
    case 'h': {
      printf("Usage: LU, or \n");
      printf("       LU -nMATRIXSIZE -bBLOCKSIZE -pNPROC\n");
      MP_BARRIER();
      MP_FINALIZE();
      exit(0);
    }            
    }
  }
    
  if(me == 0) {
    printf("\nUsing pre-PUTing\n");
    printf("\n Blocked Dense LU Factorization\n");
    printf("     %d by %d Matrix\n", n, n);
    printf("     %d Processors\n", nproc);
    printf("     %d by %d Element Blocks\n", block_size, block_size);
    printf("\n");
  }
    
  /* largest num_rows <= sqrt(nproc) that divides nproc exactly */
  num_rows = (int) sqrt((double) nproc);
  for (;;) {
    num_cols = nproc/num_rows;
    if (num_rows*num_cols == nproc)
      break;
    num_rows--;
  }
    
  /* number of blocks per dimension, rounded up */
  nblocks = n/block_size;
  if (block_size * nblocks != n) {
    nblocks++;
  }
    
  /* extent of the last (possibly partial) block in each dimension */
  edge = n%block_size;
  if (edge == 0) {
    edge = block_size;
  }
    
  #ifdef DEBUG
  /* debug builds only print the block ownership map and exit */
  if(me == 0)
    for (i=0;i<nblocks;i++) {
      for (j=0;j<nblocks;j++) 
        printf("%d ", block_owner(i, j));
      printf("\n");
    }
  MP_BARRIER();
  MP_FINALIZE();
  exit(0);
  #endif
    
  /* total bytes needed for the blocks owned by this process */
  for (i=0;i<nblocks;i++) {
    for (j=0;j<nblocks;j++) {
      if(block_owner(i,j) == me) {
        if ((i == nblocks-1) && (j == nblocks-1)) {
          size = edge*edge;
        }
        else if ((i == nblocks-1) || (j == nblocks-1)) {
          size = edge*block_size;
        }
        else {
          size = block_size*block_size;
        }
        proc_bytes += size*sizeof(double);
      }
    }
  }
    
  /* initialize ARMCI */
  ARMCI_Init();
  ptr = (void **)ARMCI_Malloc_local(nproc * sizeof(void *));
  ARMCI_Malloc(ptr, proc_bytes);
  
  a = (double **)ARMCI_Malloc_local(nblocks*nblocks*sizeof(double *));
  if (a == NULL) {
    fprintf(stderr, "Could not malloc memory for a\n");
    exit(-1);
  } 
  /* ptr_loc[p] is the next free position in process p's storage */
  ptr_loc = (double **)ARMCI_Malloc_local(nproc*sizeof(double *));
  for(i=0; i<nproc; i++) ptr_loc[i] = (double *)ptr[i];
  for(i=0; i<nblocks;i ++) {
    for(j=0; j<nblocks; j++) {
      a[i+j*nblocks] = ptr_loc[block_owner(i, j)];
      if ((i == nblocks-1) && (j == nblocks-1)) {
        size = edge*edge;
      } else if ((i == nblocks-1) || (j == nblocks-1)) {
        size = edge*block_size;
      } else {
        size = block_size*block_size;
      }
      ptr_loc[block_owner(i, j)] += size;
    }
  }
    
  /* initialize the array */
  init_array();
  
  bufr = (double **)ARMCI_Malloc_local(nproc*nblocks * sizeof(double *));
  bufc = (double **)ARMCI_Malloc_local(nproc*nblocks * sizeof(double *));

  /* both tables are written right below, so stop here as the `a` check does */
  if (bufr == NULL || bufc == NULL) {
    printf("Could not ARMCI_Malloc_local() mem\n");
    exit(-1);
  }
  /* bufr points to all k-th row blocks */
  /* save all block address in row-major order */
  proc_bytes = nblocks*block_size*block_size * sizeof(double);
  bufr_g = (void **)ARMCI_Malloc_local(nproc * sizeof(void *));
  ARMCI_Malloc(bufr_g, proc_bytes);

  for (i = 0; i < nproc; i++) {
    bufr[i*nblocks] = (double *) bufr_g[i];
    for (j = 1; j < nblocks; j++) {
      bufr[i*nblocks + j]  = bufr[i*nblocks + j-1] + block_size * block_size;
    }
  }

  /* bufc points to all k-th column blocks */
  bufc_g = (void **)ARMCI_Malloc_local(nproc * sizeof(void *));
  ARMCI_Malloc(bufc_g, proc_bytes);

  for (i = 0; i < nproc; i++) {
    bufc[i*nblocks] = (double *) bufc_g[i];
    for (j = 1; j < nblocks; j++) {
      bufc[i*nblocks + j]  = bufc[i*nblocks + j-1] + block_size * block_size;
    }
  }

  /* barrier to ensure all initialization is done */
  MP_BARRIER();

  /* to remove cold-start misses, all processors touch their own data */
  touch_array(block_size, me);
  MP_BARRIER();

  if(doprint) {
    if(me == 0) {
      printf("Matrix before LU decomposition\n");
      print_array(me); 
    }
    MP_BARRIER();
  }  

  /* Starting the timer */
  if(me == 0) start_timer();

  lu(n, block_size, me);
  
  MP_BARRIER();

  /* Timer Stops here */
  if(me == 0) 
  printf("\nRunning time = %lf milliseconds.\n\n",  elapsed_time());

  if(doprint) {        
    if(me == 0) {
      printf("after LU\n");
      print_array(me);
    }
    MP_BARRIER();
  }
    
  /* done */
  ARMCI_Free(ptr[me]);
  ARMCI_Free(bufc_g[me]);
  ARMCI_Free(bufr_g[me]);
  ARMCI_Finalize();
  MP_FINALIZE();
  return 0;
}
示例#22
0
/*
 * ARMCI strided put test.
 *
 * Each rank fills a local XDIM x YDIM patch of doubles with (rank + i) and
 * writes it into the buffer of its right neighbor with ARMCI_PutS(), once per
 * iteration i in [0, ITERATIONS).  After a barrier every rank validates the
 * patch that its left neighbor wrote into the local buffer.
 *
 * Returns 0 if the local patch holds the expected values, 1 otherwise.
 */
int main(int argc, char **argv) {
    int i, j, rank, nranks, peer, bufsize, errors;
    double **buffer, *src_buf;
    int count[2], src_stride, trg_stride, stride_level;

    MPI_Init(&argc, &argv);
    ARMCI_Init();

    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nranks);

    /* One base address per rank, filled in by the collective allocation. */
    buffer = (double **) malloc(sizeof(double *) * nranks);

    bufsize = XDIM * YDIM * sizeof(double);
    ARMCI_Malloc((void **) buffer, bufsize);
    src_buf = ARMCI_Malloc_local(bufsize);

    if (rank == 0)
        printf("ARMCI Strided Put Test:\n");

    /* Source and target rows are both XDIM doubles apart. */
    src_stride = XDIM * sizeof(double);
    trg_stride = XDIM * sizeof(double);
    stride_level = 1;

    /* count[0] is the row length in bytes, count[1] the number of rows. */
    count[1] = YDIM;
    count[0] = XDIM * sizeof(double);

    ARMCI_Barrier();

    peer = (rank+1) % nranks;

    for (i = 0; i < ITERATIONS; i++) {

      for (j = 0; j < XDIM*YDIM; j++) {
        *(src_buf + j) = rank + i;
      }

      ARMCI_PutS(
          src_buf,
          &src_stride,
          (void *) buffer[peer],
          &trg_stride,
          count,
          stride_level,
          peer);
    }

    ARMCI_Barrier();

    /* Every iteration overwrites the whole patch, so what remains is the
     * value written by the left neighbor in its final iteration.  The old
     * check used an accumulate-style formula that was always larger than the
     * stored value and only tested one sign of the difference, so it could
     * never report a failure. */
    const double expected = (double) ((rank+nranks-1)%nranks) + (ITERATIONS - 1);

    ARMCI_Access_begin(buffer[rank]);
    for (i = errors = 0; i < XDIM; i++) {
      for (j = 0; j < YDIM; j++) {
        const double actual = *(buffer[rank] + i + j*XDIM);
        const double diff   = actual - expected;
        if (diff > 1e-10 || diff < -1e-10) {
          printf("%d: Data validation failed at [%d, %d] expected=%f actual=%f\n",
              rank, j, i, expected, actual);
          errors++;
          fflush(stdout);
        }
      }
    }
    ARMCI_Access_end(buffer[rank]);

    ARMCI_Free((void *) buffer[rank]);
    ARMCI_Free_local(src_buf);
    free(buffer);

    ARMCI_Finalize();
    MPI_Finalize();

    if (errors == 0) {
      printf("%d: Success\n", rank);
      return 0;
    } else {
      printf("%d: Fail\n", rank);
      return 1;
    }
}
示例#23
0
/*
 * ARMCI contiguous one-sided test.
 *
 * For NUM_ITERATIONS rounds a shared array of DATA_NELTS ints (DATA_SZ bytes)
 * is allocated, and each rank
 *   1. gets the array of its right neighbor and checks it,
 *   2. puts into the array of its left neighbor, then checks its own array,
 *   3. accumulates (scaled by the iteration number) into its left neighbor,
 *      then checks its own array.
 * GET and PUT mismatches abort the job; ACC mismatches are only reported.
 */
int main(int argc, char ** argv) {
  int    rank, nproc, i, test_iter;
  int    left, right;
  int   *my_data, *buf;
  void **base_ptrs;

  MPI_Init(&argc, &argv);
  ARMCI_Init();

  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &nproc);

  /* Ring neighbors: data is read from / received from `right`, written to `left`. */
  right = (rank+1) % nproc;
  left  = (rank+nproc-1) % nproc;

  if (rank == 0) printf("Starting ARMCI test with %d processes\n", nproc);

  buf = malloc(DATA_SZ);
  base_ptrs = malloc(sizeof(void*)*nproc);

  if (buf == NULL || base_ptrs == NULL) {
    printf("%d: could not allocate local buffers\n", rank);
    MPI_Abort(MPI_COMM_WORLD, 1);
  }

  for (test_iter = 0; test_iter < NUM_ITERATIONS; test_iter++) {
    if (rank == 0) printf(" + iteration %d\n", test_iter);

    /*** Allocate the shared array ***/
    ARMCI_Malloc(base_ptrs, DATA_SZ);
    my_data = base_ptrs[rank];

    /*** Get from our right neighbor and verify correct data ***/
    ARMCI_Access_begin(my_data);
    for (i = 0; i < DATA_NELTS; i++) my_data[i] = rank*test_iter;
    ARMCI_Access_end(my_data);

    ARMCI_Barrier(); // Wait for all updates to data to complete

    ARMCI_Get(base_ptrs[right], buf, DATA_SZ, right);

    for (i = 0; i < DATA_NELTS; i++) {
      if (buf[i] != right*test_iter) {
        /* Report the value actually compared against, not just the neighbor id. */
        printf("%d: GET expected %d, got %d\n", rank, right*test_iter, buf[i]);
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
    }

    ARMCI_Barrier(); // Wait for all gets to complete

    /*** Put to our left neighbor and verify correct data ***/
    for (i = 0; i < DATA_NELTS; i++) buf[i] = rank*test_iter;
    ARMCI_Put(buf, base_ptrs[left], DATA_SZ, left);

    ARMCI_Barrier(); // Wait for all updates to data to complete

    /* Our array was written by the right neighbor, whose left neighbor we are. */
    ARMCI_Access_begin(my_data);
    for (i = 0; i < DATA_NELTS; i++) {
      if (my_data[i] != right*test_iter) {
        printf("%d: PUT expected %d, got %d\n", rank, right*test_iter, my_data[i]);
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
    }
    ARMCI_Access_end(my_data);

    ARMCI_Barrier(); // Wait for all gets to complete

    /*** Accumulate to our left neighbor and verify correct data ***/
    for (i = 0; i < DATA_NELTS; i++) buf[i] = rank;

    ARMCI_Access_begin(my_data);
    for (i = 0; i < DATA_NELTS; i++) my_data[i] = rank;
    ARMCI_Access_end(my_data);
    ARMCI_Barrier();

    int scale = test_iter;
    ARMCI_Acc(ARMCI_ACC_INT, &scale, buf, base_ptrs[left], DATA_SZ, left);

    ARMCI_Barrier(); // Wait for all updates to data to complete

    /* Own initial value plus the right neighbor's scaled contribution. */
    ARMCI_Access_begin(my_data);
    for (i = 0; i < DATA_NELTS; i++) {
      if (my_data[i] != rank + right*test_iter) {
        /* NOTE(review): an ACC mismatch is reported but does not abort, so
         * "PASS" is still printed below - confirm this is intentional. */
        printf("%d: ACC expected %d, got %d\n", rank, rank + right*test_iter, my_data[i]);
      }
    }
    ARMCI_Access_end(my_data);

    ARMCI_Free(my_data);
  }

  free(buf);
  free(base_ptrs);

  if (rank == 0) printf("Test complete: PASS.\n");

  ARMCI_Finalize();
  MPI_Finalize();

  return 0;
}
示例#24
0
文件: simple.c 项目: v4m4/armci-mpi
/*
 * Get-latency micro-benchmark.
 *
 * Rank 0 times LOOP blocking ARMCI_Get() calls and LOOP non-blocking
 * ARMCI_NbGet()/ARMCI_Wait() pairs of one double each from rank 1 and prints
 * the average per-call times scaled by 1e6.  All other ranks sleep and then
 * join the final barrier.  Requires at least two processes.
 */
int main(int argc, char **argv) {
  int i;
  double **myptrs;
  double t0, t1, tnbget=0, tnbwait=0, t2=0;

  MP_INIT(argc,argv);
  ARMCI_Init();

  MP_PROCS(&nprocs);
  MP_MYID(&me);

  if (nprocs < 2)
    ARMCI_Error("This program requires at least two processes", 1);

  myptrs = (double **)malloc(sizeof(double *)*nprocs);
  if (myptrs == NULL)
    ARMCI_Error("Could not allocate the pointer table", 1);
  ARMCI_Malloc((void **)myptrs, LOOP*sizeof(double)); 
  
  MP_BARRIER();
  
  if(me == 0) {
    /* Warm-up.  In every get below the first argument is the remote source
     * on rank me+1 and the second the local destination; the reverse order
     * is a bug.
     * NOTE(review): reads elements 0..9, which assumes LOOP >= 10 - confirm. */
    for(i = 0; i < 10; i++) {
      ARMCI_Get(myptrs[me+1]+i, myptrs[me]+i, sizeof(double), me+1);
    }

    t0 = MP_TIMER(); 
    for(i = 0; i < LOOP; i++) {
      /* Index the remote element with i, like the warm-up and non-blocking
       * loops; the former fixed offset of 1 was past the end for LOOP == 1. */
      ARMCI_Get(myptrs[me+1]+i, myptrs[me]+i, sizeof(double), me+1);
    }
    t1 = MP_TIMER(); 

    printf("\nGet Latency=%lf\n", 1e6*(t1-t0)/LOOP);
    fflush(stdout);

    for(i = 0; i < LOOP; i++) {
      armci_hdl_t nbh;
      ARMCI_INIT_HANDLE(&nbh);

      /* Time the issue of the request and the wait for completion separately. */
      t0 = MP_TIMER(); 
      ARMCI_NbGet(myptrs[me+1]+i, myptrs[me]+i, sizeof(double), me+1, &nbh);
      t1 = MP_TIMER(); 
      ARMCI_Wait(&nbh);
      t2 = MP_TIMER();

      tnbget  += (t1-t0);
      tnbwait += (t2-t1);
    }

    printf("\nNb Get Latency=%lf Nb Wait=%lf\n",1e6*tnbget/LOOP,1e6*tnbwait/LOOP);fflush(stdout);
  }

  else
    sleep(1);

  MP_BARRIER();

  ARMCI_Finalize();
  MP_FINALIZE();

  return 0;
}
示例#25
0
int main(int argc, char **argv)
{
int i,peer,j;
cpu_set_t mycpuid,new_mask;
char str[CPU_SETSIZE];
int rrr;
char cid[8];
extern char * cpuset_to_cstr(cpu_set_t *mask, char *str);
extern int cstr_to_cpuset(cpu_set_t *mask, const char* str);
gpc_hdl_t nbh;
char rheader[100];
int hlen, rhlen, rhsize;
int rdsize;
int rem;
void *header=&rem;
int locval=0;
void *loc=&locval;
int right;

    MPI_Init(&argc,&argv);
    MPI_Comm_rank(MPI_COMM_WORLD,&me);
    MPI_Comm_size(MPI_COMM_WORLD,&nprocs);
    if(nprocs<2){
      printf("\ncan run only on >=2 procs\n");
      MPI_Finalize();
      exit(1);
    }     
    right = (me+1)%nprocs; 
    hlen=sizeof(header);
    bzero(rheader,100);
    rhlen = hlen;

    ARMCI_Init();
    accloop=atoi(argv[1]);
    rem=accloop;
    myptrs = (char **)malloc(sizeof(char *)*nprocs);
    ARMCI_Malloc((void **)myptrs,size);

    MPI_Barrier(MPI_COMM_WORLD);

    gpcwork_memcpy = ARMCI_Gpc_register(gpc_work_handler_memcpy);
    gpcwork_ddot =ARMCI_Gpc_register(gpc_work_handler_ddot);
    gpcwork_daxpy = ARMCI_Gpc_register(gpc_work_handler_daxpy);
    gpcwork_dgemm = ARMCI_Gpc_register(gpc_work_handler_dgemm);
    MPI_Barrier(MPI_COMM_WORLD);
    MPI_Barrier(MPI_COMM_WORLD);

    ARMCI_Gpc_init_handle(&nbh);
    if(ARMCI_Gpc_exec(gpcwork_memcpy, right, &header, hlen, loc, sizeof(int), 
                            rheader, rhlen,loc, sizeof(int), &nbh))
       fprintf(stderr,"ARMCI_Gpc_exec failed\n");
    {
      int m,n,k;
      char notr='n';
      DoubleComplex ZERO;
      usleep(100);
      ZERO.real=0.;ZERO.imag=0.;
      m=n=k=DGS;
      t0=MPI_Wtime();
#ifdef DGEMM_WORK
      for(j=0;j<4*15;j++){
         c_alpha=c_alpha+j*rand();
         dgemm_(&notr,&notr,&m,&n,&k,&c_alpha,c_dga,&m,c_dgb,&n,&ZERO,c_dgc,&k,1,1);
      }
#elif IUNIT_WORK 
      for(j=0;j<2*LOOP*100;j++){
        for(i=0;i<LOOP*100;i++){
          tmpbuf1[i]=tmpbuf1[i]*1.1214+i/tmpbuf1[j/2];
        }
      }
#elif DAXPY_WORK
      for(j=0;j<tmp_loop*80;j++){
        alpha=alpha+j*rand();
        daxpy_(&N,&alpha,tmpbuf1,&ONE,tmpbuf2,&ONE);
      }
#endif

      t1=MPI_Wtime()-t0;
      printf("\n%d:Compute_During_memcpy %d %f\n",me,accloop,t1);
    }

    ARMCI_Gpc_wait(&nbh);
    
    MPI_Barrier(MPI_COMM_WORLD);
    MPI_Barrier(MPI_COMM_WORLD);

    ARMCI_Gpc_init_handle(&nbh);
    if(ARMCI_Gpc_exec(gpcwork_ddot, right, &header, hlen, loc, sizeof(int), 
                            rheader, rhlen,loc, sizeof(int), &nbh))
       fprintf(stderr,"ARMCI_Gpc_exec failed\n");
    {
      int m,n,k;
      char notr='n';
      DoubleComplex ZERO;
      usleep(100);
      ZERO.real=0.;ZERO.imag=0.;
      m=n=k=DGS;
      t0=MPI_Wtime();
#ifdef DGEMM_WORK
      for(j=0;j<4*15;j++){
         c_alpha=c_alpha+j*rand();
         dgemm_(&notr,&notr,&m,&n,&k,&c_alpha,c_dga,&m,c_dgb,&n,&ZERO,c_dgc,&k,1,1);
      }
#elif IUNIT_WORK 
      for(j=0;j<2*LOOP*100;j++){
        for(i=0;i<LOOP*100;i++){
          tmpbuf1[i]=tmpbuf1[i]*1.1214+i/tmpbuf1[j/2];
        }
      }
#elif DAXPY_WORK
      for(j=0;j<tmp_loop*80;j++){
        alpha=alpha+j*rand();
        daxpy_(&N,&alpha,tmpbuf1,&ONE,tmpbuf2,&ONE);
      }
#endif
      t1=MPI_Wtime()-t0;
      printf("\n%d:Compute_During_Ddot %d %f\n",me,accloop,t1);
    }
    ARMCI_Gpc_wait(&nbh);

    MPI_Barrier(MPI_COMM_WORLD);
    MPI_Barrier(MPI_COMM_WORLD);

    ARMCI_Gpc_init_handle(&nbh);
    if(ARMCI_Gpc_exec(gpcwork_daxpy, right, &header, hlen, loc, sizeof(int), 
                            rheader, rhlen,loc, sizeof(int), &nbh))
       fprintf(stderr,"ARMCI_Gpc_exec failed\n");
    {
      int m,n,k;
      char notr='n';
      DoubleComplex ZERO;
      usleep(100);
      ZERO.real=0.;ZERO.imag=0.;
      m=n=k=DGS;
      t0=MPI_Wtime();
#ifdef DGEMM_WORK
      for(j=0;j<4*15;j++){
         c_alpha=c_alpha+j*rand();
         dgemm_(&notr,&notr,&m,&n,&k,&c_alpha,c_dga,&m,c_dgb,&n,&ZERO,c_dgc,&k,1,1);
      }
#elif IUNIT_WORK 
      for(j=0;j<2*LOOP*100;j++){
        for(i=0;i<LOOP*100;i++){
          tmpbuf1[i]=tmpbuf1[i]*1.1214+i/tmpbuf1[j/2];
        }
      }
#elif DAXPY_WORK
      for(j=0;j<tmp_loop*80;j++){
        alpha=alpha+j*rand();
        daxpy_(&N,&alpha,tmpbuf1,&ONE,tmpbuf2,&ONE);
      }
#endif
      t1=MPI_Wtime()-t0;
      printf("\n%d:Compute_During_Daxpy %d %f\n",me,accloop,t1);
    }
    ARMCI_Gpc_wait(&nbh);

    MPI_Barrier(MPI_COMM_WORLD);

    ARMCI_Gpc_init_handle(&nbh);
    if(ARMCI_Gpc_exec(gpcwork_dgemm, right, &header, hlen, loc, sizeof(int), 
                            rheader, rhlen,loc, sizeof(int), &nbh))
       fprintf(stderr,"ARMCI_Gpc_exec failed\n");
    {
      int m,n,k;
      char notr='n';
      DoubleComplex ZERO;
      usleep(100);
      ZERO.real=0.;ZERO.imag=0.;
      m=n=k=DGS;
      t0=MPI_Wtime();
#ifdef DGEMM_WORK
      for(j=0;j<4*15;j++){
         c_alpha=c_alpha+j*rand();
         dgemm_(&notr,&notr,&m,&n,&k,&c_alpha,c_dga,&m,c_dgb,&n,&ZERO,c_dgc,&k,1,1);
      }
#elif IUNIT_WORK 
      for(j=0;j<2*LOOP*100;j++){
        for(i=0;i<LOOP*100;i++){
          tmpbuf1[i]=tmpbuf1[i]*1.1214+i/tmpbuf1[j/2];
        }
      }
#elif DAXPY_WORK
      for(j=0;j<tmp_loop*80;j++){
        alpha=alpha+j*rand();
        daxpy_(&N,&alpha,tmpbuf1,&ONE,tmpbuf2,&ONE);
      }
#endif
      t1=MPI_Wtime()-t0;
      printf("\n%d:Compute_During_Dgemm %d %f\n",me,accloop,t1);
    }
    ARMCI_Gpc_wait(&nbh);

    MPI_Barrier(MPI_COMM_WORLD);

    ARMCI_AllFence();

    ARMCI_Finalize();
    MPI_Finalize();
}
示例#26
0
/*
 * ARMCI group test.
 *
 * Builds one child group from the odd world ranks and one from the even
 * world ranks.  Rank 0 then times one million rank translations in each
 * direction on the even group and prints the average cost per call.
 * Finally each rank frees the group matching the parity of its rank.
 */
int main(int argc, char **argv) {
  ARMCI_Group  g_world, g_odd, g_even;
  int          me, nproc;
  int          r, n_members;
  int         *members;

  MPI_Init(&argc, &argv);
  ARMCI_Init();

  MPI_Comm_rank(MPI_COMM_WORLD, &me);
  MPI_Comm_size(MPI_COMM_WORLD, &nproc);

  /* Large enough for the bigger half: ceil(nproc / 2) ranks. */
  members = malloc(sizeof(int) * ((nproc + 1) / 2));

  if (me == 0) printf("ARMCI Group test starting on %d procs\n", nproc);

  ARMCI_Group_get_world(&g_world);

  /* Odd ranks: 1, 3, 5, ... */
  if (me == 0) printf(" + Creating odd group\n");

  n_members = 0;
  for (r = 1; r < nproc; r += 2)
    members[n_members++] = r;

  ARMCI_Group_create_child(n_members, members, &g_odd, &g_world);

  /* Even ranks: 0, 2, 4, ... */
  if (me == 0) printf(" + Creating even group\n");

  n_members = 0;
  for (r = 0; r < nproc; r += 2)
    members[n_members++] = r;

  ARMCI_Group_create_child(n_members, members, &g_even, &g_world);

  /***********************************************************************/
  {
    const int iter = 1000000;
    int       grp_me, grp_nproc, rep;
    double    start, t_abs_to_grp, t_grp_to_abs;

    if (me == 0) {
      ARMCI_Group_rank(&g_even, &grp_me);
      ARMCI_Group_size(&g_even, &grp_nproc);

      /* First direction */
      start = MPI_Wtime();
      for (rep = 0; rep < iter; rep++)
        ARMCII_Translate_absolute_to_group(&g_even, (grp_me+1) % grp_nproc);
      t_abs_to_grp = MPI_Wtime() - start;

      /* Opposite direction */
      start = MPI_Wtime();
      for (rep = 0; rep < iter; rep++)
        ARMCI_Absolute_id(&g_even, (grp_me+1) % grp_nproc);
      t_grp_to_abs = MPI_Wtime() - start;

      printf("t_abs_to_grp = %f us, t_grp_to_abs = %f us\n", t_abs_to_grp/iter * 1.0e6, t_grp_to_abs/iter * 1.0e6);
    }

    ARMCI_Barrier();
  }
  /***********************************************************************/

  if (me == 0) printf(" + Freeing groups\n");

  /* Each rank releases only the group selected by its own parity. */
  ARMCI_Group_free((me % 2 > 0) ? &g_odd : &g_even);

  free(members);

  ARMCI_Finalize();
  MPI_Finalize();

  return 0;
}
示例#27
0
/*
 * Number of matrix elements stored in block (i, j).
 * Blocks in the last block row and/or column are clipped to `edge` elements
 * in the clipped dimension; all other blocks are block_size x block_size.
 */
static int lu_block_elems(int i, int j, int edge)
{
    const int last_row = (i == nblocks-1);
    const int last_col = (j == nblocks-1);

    if (last_row && last_col)
        return edge*edge;
    if (last_row || last_col)
        return edge*block_size;
    return block_size*block_size;
}

/*
 * Blocked dense LU factorization driver.
 *
 * Options: -n MATRIXSIZE, -b BLOCKSIZE, -p NPROC, -h (usage).
 *
 * Derives the block and node layout, allocates each process's share of the
 * matrix with ARMCI_Malloc(), builds the table `a` of per-block addresses,
 * runs lu() and has rank 0 print the elapsed time.
 */
int main(int argc, char *argv[])
{
    int i, j;
    int ch;
    extern char *optarg;
    int edge;
    
    /* ARMCI */
    void **ptr;
    double **ptr_loc;
    
    MP_INIT(argc,argv);
    MP_PROCS(&nproc);
    MP_MYID(&me);
    
    while ((ch = getopt(argc, argv, "n:b:p:h")) != -1) {
        switch(ch) {
            case 'n': n = atoi(optarg); break;
            case 'b': block_size = atoi(optarg); break;
            case 'p': nproc = atoi(optarg); break;
            case 'h': {
                printf("Usage: LU, or \n");
        printf("       LU -nMATRIXSIZE -bBLOCKSIZE -pNPROC\n");
                MP_BARRIER();
                MP_FINALIZE();
                exit(0);
            }            
        }
    }

    /* n/block_size and n%block_size below are undefined for a zero block
     * size, and non-positive sizes describe no matrix at all. */
    if (n <= 0 || block_size <= 0) {
        if (me == 0)
            fprintf(stderr, "Matrix size and block size must be positive\n");
        MP_FINALIZE();
        exit(1);
    }
    
    if(me == 0) {
        printf("\n Blocked Dense LU Factorization\n");
        printf("     %d by %d Matrix\n", n, n);
        printf("     %d Processors\n", nproc);
        printf("     %d by %d Element Blocks\n", block_size, block_size);
        printf("\n");
    }
    
    /* Number of blocks per dimension, rounded up. */
    nblocks = n/block_size;
    if (block_size * nblocks != n) {
        nblocks++;
    }

    /* Processes are grouped four to a node (2 x 2); when nproc is not a
     * multiple of four the layout becomes one row holding the remainder. */
    nnodes = nproc / 4;
    if((nnodes * 4) != nproc) {
        num_cols = nproc - nnodes * 4;
        nnodes++;
        num_rows = 1;
    }
    else {
        num_cols = 2;
        num_rows = 2;
    }    
    
    /* Blocks per node, rounded up. */
    num = (nblocks * nblocks)/nnodes;
    if((num * nnodes) != (nblocks * nblocks))
        num++;

#ifdef DEBUG
    if(me == 0)
        for (i=0;i<nblocks;i++) {
            for (j=0;j<nblocks;j++) 
                printf("%d ", block_owner(i, j));
            printf("\n");
        }
    MP_BARRIER();
    MP_FINALIZE();
    exit(0);
#endif
    
    /* Extent of the clipped blocks in the last block row/column. */
    edge = n%block_size;
    if (edge == 0) {
        edge = block_size;
    }
    
    /* Bytes needed for the blocks owned by this process. */
    for (i=0;i<nblocks;i++) {
        for (j=0;j<nblocks;j++) {
            if(block_owner(i,j) == me) {
                proc_bytes += lu_block_elems(i, j, edge)*sizeof(double);
            }
        }
    }
    
    /* initialize ARMCI */
    ARMCI_Init();
    ptr = (void **)malloc(nproc * sizeof(void *));
    if (ptr == NULL) {
        fprintf(stderr, "Could not malloc memory for ptr\n");
        exit(-1);
    }
    ARMCI_Malloc(ptr, proc_bytes);
    
    a = (double **)malloc(nblocks*nblocks*sizeof(double *));
    if (a == NULL) {
        fprintf(stderr, "Could not malloc memory for a\n");
        exit(-1);
    } 
    ptr_loc = (double **)malloc(nproc*sizeof(double *));
    if (ptr_loc == NULL) {
        fprintf(stderr, "Could not malloc memory for ptr_loc\n");
        exit(-1);
    }

    /* ptr_loc[p] is a running cursor into process p's segment: each block
     * takes the current cursor of its owner and advances it by its size. */
    for(i=0; i<nproc; i++) ptr_loc[i] = (double *)ptr[i];
    for(i=0; i<nblocks;i ++) {
        for(j=0; j<nblocks; j++) {
            a[i+j*nblocks] = ptr_loc[block_owner(i, j)];
            ptr_loc[block_owner(i, j)] += lu_block_elems(i, j, edge);
        }
    }
    
    /* initialize the array */
    init_array();
    
    /* barrier to ensure all initialization is done */
    MP_BARRIER();

    /* to remove cold-start misses, all processors touch their own data */
    touch_array(block_size, me);
    MP_BARRIER();

    if(doprint) {
        if(me == 0) {
            printf("Matrix before LU decomposition\n");
            print_array(me); 
        }
        MP_BARRIER();
    }
    

    /* Starting the timer */
    if(me == 0) start_timer();

    lu(n, block_size, me);
    
    MP_BARRIER();

    /* Timer Stops here */
    if(me == 0) 
        printf("\nRunning time = %lf milliseconds.\n\n",  elapsed_time());

    if(doprint) {        
        if(me == 0) {
            printf("after LU\n");
            print_array(me);
        }
        MP_BARRIER();
    }
    
    /* done: release the shared segment, then the local pointer tables */
    ARMCI_Free(ptr[me]);
    free(ptr_loc);
    free(ptr);
    ARMCI_Finalize();
    MP_FINALIZE();

    return 0;
}
示例#28
0
文件: matmul.c 项目: arnolda/scafacos
/*
 * matmul allocation driver.
 *
 * Reads the matrix rank from argv[1] (default 8) and collectively allocates
 * three rank x rank matrices of doubles (A, B, C) with ARMCI_Malloc(), after
 * registering one table of per-process base addresses for each.
 *
 * Returns 0; allocation failures are reported through ARMCI_Error().
 */
int main(int argc, char **argv)
{
    int me,nproc;
    int status;
    int rank;

    /* initialization */
    MPI_Init(&argc, &argv);
    ARMCI_Init();

#ifdef HPC_PROFILING
    HPM_Init();
#endif

    MPI_Comm_rank(MPI_COMM_WORLD,&me);
    MPI_Comm_size(MPI_COMM_WORLD,&nproc);

#ifdef DEBUG
    if(me == 0){
       printf("The result of MPI_Comm_size is %d\n",nproc);
       fflush(stdout);
    }
#endif

    /* get the matrix parameters */
    if (argc > 1){
        rank = atoi(argv[1]);
    } else {
        rank = 8;
    }
    if (me == 0){
        printf("Running matmul.x with rank = %d\n",rank);
        fflush(stdout);
    }

    /* Size of one matrix.  The product is formed in size_t so that large
     * ranks cannot overflow int before the value is widened. */
    const size_t bytes = (size_t) rank * (size_t) rank * sizeof(double);

    /* register remote pointers */
    double** addr_A = (double **) ARMCI_Malloc_local(sizeof(double *) * nproc);
    if (addr_A == NULL) ARMCI_Error("malloc A failed at line",0);

    double** addr_B = (double **) ARMCI_Malloc_local(sizeof(double *) * nproc);
    if (addr_B == NULL) ARMCI_Error("malloc B failed at line",0);

    double** addr_C = (double **) ARMCI_Malloc_local(sizeof(double *) * nproc);
    if (addr_C == NULL) ARMCI_Error("malloc C failed at line",0);

#ifdef DEBUG
    /* %zu is the conversion for size_t; %lu is not portable for it. */
    if(me == 0) printf("ARMCI_Malloc A requests %zu bytes\n",bytes);
    fflush(stdout);
#endif
    status = ARMCI_Malloc((void **) addr_A, bytes);
    if (status != 0) ARMCI_Error("ARMCI_Malloc A failed",status);

#ifdef DEBUG
    if(me == 0) printf("ARMCI_Malloc B requests %zu bytes\n",bytes);
    fflush(stdout);
#endif
    status = ARMCI_Malloc((void **) addr_B, bytes);
    if (status != 0) ARMCI_Error("ARMCI_Malloc B failed",status);

#ifdef DEBUG
    if(me == 0) printf("ARMCI_Malloc C requests %zu bytes\n",bytes);
    fflush(stdout);
#endif
    status = ARMCI_Malloc((void **) addr_C, bytes);
    if (status != 0) ARMCI_Error("ARMCI_Malloc C failed",status);

    MPI_Barrier(MPI_COMM_WORLD);

    /* free ARMCI pointers */
    /* NOTE(review): only the local address tables are released here; the
     * segments obtained from ARMCI_Malloc() are never passed to ARMCI_Free().
     * Confirm that relying on ARMCI_Finalize() for them is intended. */
    ARMCI_Free_local(addr_C);
    ARMCI_Free_local(addr_B);
    ARMCI_Free_local(addr_A);

#ifdef HPC_PROFILING
    HPM_Print();
#endif

    /* the end */
    ARMCI_Finalize();
    MPI_Finalize();

    return(0);
}
示例#29
0
/*
 * Driver for the multi-threaded ARMCI test.
 *
 * Options:
 *   -t N  threads per process (1..MAX_TPP)
 *   -i N  iterations (>= 1)
 *   -s N  number of array elements (>= 1)
 *   -d N  seconds to sleep before starting
 *   -h    print usage
 *
 * Sets up the per-thread ranks, broadcasts a common random seed, builds the
 * random pair / random target / random one tables, and then runs
 * thread_main() on tpp threads (or once inline when NOTHREADS is defined).
 */
int main(int argc, char *argv[])
{
    int ch;
    extern char *optarg;
    int i, r;
    thread_t threads[MAX_TPP];

    /* init MP */
    MP_INIT(argc,argv);
    MP_PROCS(&size);
    MP_MYID(&rank);

    while ((ch = getopt(argc, argv, "t:s:i:d:h")) != -1) {
        switch(ch) {
            case 't': /* # of threads */
                tpp = atoi(optarg);
                if (tpp < 1 || tpp > MAX_TPP) {
                    PRINTF0("\"%s\" is improper value for -t, should be a "
                            "number between 1 and %d(MAX_TPP)\n",
                            optarg, MAX_TPP);
                    usage();
                }
                break;
            case 'i': /* # of iterations */
                iters = atoi(optarg);
                if (iters < 1) {
                    PRINTF0("\"%s\" is improper value for -i, should be a "
                            "number equal or larger than 1\n", optarg);
                    usage();
                }
                break;
            case 's': /* # of elements in the array */
                asize = atoi(optarg);
                /* validate the value just parsed, not the iteration count */
                if (asize < 1) {
                    PRINTF0("\"%s\" is improper value for -s, should be a "
                            "number equal or larger than 1\n", optarg);
                    usage();
                }
                break;
            case 'd': delay = atoi(optarg); break; /* delay before start */
            case 'h': usage(); break; /* print usage info */
        }
    }
#ifdef NOTHREADS
    tpp = 1;
    PRINTF0("Warning: NOTHREADS debug symbol is set -- running w/o threads\n");
#endif
    th_size = size * tpp;
    /* sizeof yields size_t; convert it to match the %d conversion */
    PRINTF0("\nTest of multi-threaded capabilities:\n"
            "%d threads per process (%d threads total),\n"
            "%d array elements of size %d,\n"
            "%d iteration(s)\n\n", tpp, th_size, asize, (int)sizeof(atype_t), iters);
    if (delay) {
        printf("%d: %d\n", rank, getpid());
        fflush(stdout);
        sleep(delay);
        MP_BARRIER();
    }
    TH_INIT(size,tpp);
    for (i = 0; i < tpp; i++) th_rank[i] = rank * tpp + i;

#if defined(DEBUG) && defined(LOG2FILE)
    /* characters 10..12 of fname receive the three-digit thread rank */
    for (i = 0; i < tpp; i++) {
        fname[10] = '0' + th_rank[i] / 100;
        fname[11] = '0' + th_rank[i] % 100 / 10;
        fname[12] = '0' + th_rank[i] % 10;
        dbg[i] = fopen(fname, "w");
    }
#endif
    for (i = 0; i < tpp; i++)
        prndbg(i, "proc %d, thread %d(%d):\n", rank, i, th_rank[i]);

    /* init ARMCI */
    ARMCI_Init();

    /* set global seed (to ensure same random sequence across procs) */
    time_seed = (unsigned)time(NULL);
    armci_msg_brdcst(&time_seed, sizeof(time_seed), 0);
    srand(time_seed); rand();
    prndbg(0, "seed = %u\n", time_seed);
    /* random pairs */
    /* NOTE(review): the retry loop needs another unpaired index to exist, so
     * it presumably never ends when th_size is odd - confirm callers always
     * run with an even total thread count. */
    pairs = calloc(th_size, sizeof(int));
    for (i = 0; i < th_size; i++) pairs[i] = -1;
    for (i = 0; i < th_size; i++) {
        if (pairs[i] != -1) continue;
        r = RND(0, th_size);
        while (i == r || pairs[r] != -1 ) r = RND(0, th_size);
        pairs[i] = r; pairs[r] = i;
    }
    for (i = 0, cbufl = 0; i < th_size; i++)
        cbufl += sprintf(cbuf + cbufl, " %d->%d|%d->%d",
                         i, pairs[i], pairs[i], pairs[pairs[i]]);
    prndbg(0, "random pairs:%s\n", cbuf);
    /* random targets: redraw until the target differs from the index */
    rnd_tgts = calloc(th_size, sizeof(int));
    for (i = 0, cbufl = 0; i < th_size; i++) {
        rnd_tgts[i] = RND(0, th_size);
        if (rnd_tgts[i] == i) { i--; continue; }
        cbufl += sprintf(cbuf + cbufl, " %d", rnd_tgts[i]);
    }
    prndbg(0, "random targets:%s\n", cbuf);
    /* random one */
    rnd_one = RND(0, th_size);
    prndbg(0, "random one = %d\n", rnd_one);

    /* Allocate outside assert(): with NDEBUG the assert expression is not
     * evaluated at all, which used to skip the allocations themselves. */
    ptrs1 = calloc(th_size, sizeof(void *));
    ptrs2 = calloc(th_size, sizeof(void *));
    assert(ptrs1 != NULL);
    assert(ptrs2 != NULL);
#ifdef NOTHREADS
    thread_main((void *)(long)0);
#else
    for (i = 0; i < tpp; i++) THREAD_CREATE(threads + i, thread_main, (void *)(long)i);
    for (i = 0; i < tpp; i++) THREAD_JOIN(threads[i], NULL);
#endif

    MP_BARRIER();
    PRINTF0("Tests Completed\n");

    /* clean up */
#if defined(DEBUG) && defined(LOG2FILE)
    for (i = 0; i < tpp; i++) fclose(dbg[i]);
#endif
    ARMCI_Finalize();
    TH_FINALIZE();
    MP_FINALIZE();

    return 0;
}