예제 #1
0
파일: testnotify.c 프로젝트: jeffhammond/ga
/*
 * Entry point of the armci_notify test.
 *
 * Starts the message-passing layer and ARMCI, stores the process count and
 * this process's rank in `nproc` and `me` (declared outside this function),
 * and then calls test_notify() once for every dimensionality from 1 through
 * MAXDIMS.  Both layers are shut down before returning 0.
 */
int main(int argc, char *argv[])
{
  int dims = 1;

  armci_msg_init(&argc, &argv);
  ARMCI_Init_args(&argc, &argv);
  nproc = armci_msg_nproc();
  me = armci_msg_me();

  ARMCI_Barrier();
  /* Only rank 0 prints the banner; the pause happens between two barriers. */
  if (0 == me) {
    printf("\nTesting armci_notify\n");
    fflush(stdout);
    sleep(1);
  }
  ARMCI_Barrier();

  while (dims <= MAXDIMS) {
    test_notify(dims);
    dims++;
  }
  ARMCI_Barrier();

  ARMCI_Finalize();
  armci_msg_finalize();
  return 0;
}
예제 #2
0
파일: transp1D-c.c 프로젝트: jeffhammond/ga
/*
 * Driver for the 1-D transpose example.
 *
 * Initialises the message-passing layer and ARMCI, has rank 0 report the
 * number of processes, runs TRANSPOSE1D(), and then finalises ARMCI and the
 * message-passing layer.  Returns 0.
 */
int main(int argc, char **argv) {
    int rank;
    int process_count;

    /* Message-passing layer first, then ARMCI. */
    armci_msg_init(&argc, &argv);
    ARMCI_Init();

    /* The MA allocator set-up (MA_init) that once followed here is disabled. */

    rank = armci_msg_me();
    process_count = armci_msg_nproc();
    if (0 == rank) {
        printf("\nUsing %d processes\n\n", process_count);
        fflush(stdout);
    }

    TRANSPOSE1D();

    if (0 == rank) {
        printf("\nTerminating ..\n");
    }

    ARMCI_Finalize();
    armci_msg_finalize();
    return 0;
}
예제 #3
0
파일: perf2.c 프로젝트: jeffhammond/ga
/*
 * Contiguous put/get/accumulate bandwidth test driver.
 *
 * Initialises the message-passing layer and ARMCI, stores the rank and the
 * process count in `me` and `nproc` (declared outside this function),
 * asserts that exactly two processes are running, and then calls
 * contig_test() for PUT, GET and ACC in that order with MAX_MESSAGE_SIZE.
 * Rank 0 prints a heading before each of the three runs.
 */
int main(int argc, char **argv)
{
    armci_msg_init(&argc, &argv);
    ARMCI_Init_args(&argc, &argv);
    me = armci_msg_me();
    nproc = armci_msg_nproc();

    /* The test is written for exactly two processes. */
    assert(nproc == 2);

    if (me == 0) {
        /* Column header, immediately followed by the first section title. */
        printf("msg size (bytes)     avg time (us)    avg b/w (MB/sec)\n");
        printf("#PNNL comex Put Test\n");
    }
    contig_test(MAX_MESSAGE_SIZE, PUT);

    if (me == 0) {
        printf("#PNNL comex Get Test\n");
    }
    contig_test(MAX_MESSAGE_SIZE, GET);

    if (me == 0) {
        printf("#PNNL comex Accumulate Test\n");
    }
    contig_test(MAX_MESSAGE_SIZE, ACC);

    ARMCI_Finalize();
    armci_msg_finalize();

    return 0;
}
예제 #4
0
/*
 * Print the command-line synopsis and terminate the program.
 *
 * Only the process whose `rank` is 0 prints.  Every caller then passes
 * through ARMCI_Barrier(), finalises the message-passing layer and exits
 * with status 0, so this function never returns to its caller.
 */
void usage()
{
  const int is_root = (rank == 0);

  if (is_root) {
    printf("Usage: test_mt, or \n");
    printf("       test_mt -tTHREADS_PER_PROC -sARRAY_SIZE -iITERATIONS_COUNT\n");
  }

  ARMCI_Barrier();
  armci_msg_finalize();
  exit(0);
}
/*
 * Driver for the sparse matrix-vector multiplication test.
 *
 * Starts the message-passing layer, stores the process count and rank in
 * `nproc` and `me` (declared outside this function), rejects runs with more
 * than MAXPROC processes, initialises ARMCI and calls test_sparse().  After
 * a fence and a barrier rank 0 reports success; every process then sleeps
 * for two seconds, synchronises once more and shuts both layers down.
 */
int main(int argc, char* argv[])
{
    armci_msg_init(&argc, &argv);
    nproc = armci_msg_nproc();
    me = armci_msg_me();

    /* Only rank 0 raises the error when there are too many processes. */
    if (me == 0 && nproc > MAXPROC) {
        ARMCI_Error("Test works for up to %d processors\n",MAXPROC);
    }

    if (0 == me) {
        printf("ARMCI test program (%d processes)\n",nproc);
        fflush(stdout);
        sleep(1);
    }

    ARMCI_Init();

    if (0 == me) {
        printf("\n  Performing Sparse Matrix-Vector Multiplication ...\n\n");
        fflush(stdout);
    }
    test_sparse();

    ARMCI_AllFence();
    armci_msg_barrier();
    if (0 == me) {
        printf("\nSuccess!!\n");
        fflush(stdout);
    }
    sleep(2);

    armci_msg_barrier();
    ARMCI_Finalize();
    armci_msg_finalize();
    return 0;
}
예제 #6
0
/*
 * Driver for the aggregate put/get test.
 *
 * Initialises ARMCI, stores the process count and rank in `nproc` and `me`
 * (declared outside this function), rejects runs with more than MAXPROC
 * processes, and calls test_aggregate() twice: first with argument 1 (cold
 * start) and then with argument 0 (warm start).  After a fence and a
 * barrier rank 0 reports success; every process then sleeps for two
 * seconds, synchronises again and finalises.
 */
int main(int argc, char *argv[])
{
  ARMCI_Init_args(&argc, &argv);
  nproc = armci_msg_nproc();
  me = armci_msg_me();

  /* Only rank 0 raises the error when there are too many processes. */
  if (me == 0 && nproc > MAXPROC) {
    ARMCI_Error("Test works for up to %d processors\n", MAXPROC);
  }

  if (0 == me) {
    /* Program banner, a short pause, then the section heading. */
    printf("ARMCI test program (%d processes)\n", nproc);
    fflush(stdout);
    sleep(1);

    printf("\nAggregate put/get requests\n\n");
    fflush(stdout);
  }

  test_aggregate(1); /* cold start */
  test_aggregate(0); /* warm start */

  ARMCI_AllFence();
  ARMCI_Barrier();
  if (0 == me) {
    printf("\nSuccess!!\n");
    fflush(stdout);
  }
  sleep(2);

  ARMCI_Barrier();
  ARMCI_Finalize();
  armci_msg_finalize();
  return 0;
}
예제 #7
0
/*
 * Driver for the multi-threaded ARMCI test.
 *
 * Command-line options (parsed with getopt):
 *   -t N   threads per process; must be between 1 and MAX_TPP
 *   -i N   number of iterations; must be at least 1
 *   -s N   number of array elements; must be at least 1
 *   -d N   start-up delay in seconds; when non-zero every process prints
 *          its rank and pid, sleeps N seconds and then enters a barrier
 *   -h     print the usage text and exit
 * An out-of-range value prints a diagnostic and calls usage(), which exits.
 *
 * After option parsing the function fills th_rank[], broadcasts a time-based
 * seed from rank 0 so that every process draws the same rand() sequence,
 * builds the `pairs` and `rnd_tgts` tables and `rnd_one`, allocates `ptrs1`
 * and `ptrs2`, and runs thread_main() either directly (NOTHREADS) or once
 * per thread.  Returns 0.
 */
int main(int argc, char *argv[])
{
  int ch;
  extern char *optarg;
  int i, r;
  thread_t threads[MAX_TPP];

  /* init ARMCI */
  ARMCI_Init_args(&argc, &argv);
  size = armci_msg_nproc();
  rank = armci_msg_me();

  while ((ch = getopt(argc, argv, "t:s:i:d:h")) != -1) {
    switch (ch) {
      case 't': /* # of threads */
        tpp = atoi(optarg);
        if (tpp < 1 || tpp > MAX_TPP) {
          PRINTF0("\"%s\" is improper value for -t, should be a "
                  "number between 1 and %d(MAX_TPP)\n",
                  optarg, MAX_TPP);
          usage();
        }
        break;
      case 'i': /* # of iterations */
        iters = atoi(optarg);
        if (iters < 1) {
          /* The diagnostic must name the option that was actually rejected. */
          PRINTF0("\"%s\" is improper value for -i, should be a "
                  "number equal or larger than 1\n", optarg);
          usage();
        }
        break;
      case 's': /* # of elements in the array */
        asize = atoi(optarg);
        /* Validate the value just parsed (asize), not the iteration count. */
        if (asize < 1) {
          PRINTF0("\"%s\" is improper value for -s, should be a "
                  "number equal or larger than 1\n", optarg);
          usage();
        }
        break;
      case 'd': /* delay before start */
        delay = atoi(optarg);
        break;
      case 'h': /* print usage info */
        usage();
        break;
      default: /* any other getopt result is ignored */
        break;
    }
  }
#ifdef NOTHREADS
  tpp = 1;
  PRINTF0("Warning: NOTHREADS debug symbol is set -- running w/o threads\n");
#endif
  th_size = size * tpp;
  /* sizeof yields size_t; cast so the argument matches the %d conversion. */
  PRINTF0("\nTest of multi-threaded capabilities:\n"
          "%d threads per process (%d threads total),\n"
          "%d array elements of size %d,\n"
          "%d iteration(s)\n\n", tpp, th_size, asize, (int)sizeof(atype_t), iters);
  if (delay) {
    printf("%d: %d\n", rank, (int)getpid());
    fflush(stdout);
    sleep(delay);
    ARMCI_Barrier();
  }
  TH_INIT(size, tpp);
  for (i = 0; i < tpp; i++) {
    th_rank[i] = rank * tpp + i;
  }

#if defined(DEBUG) && defined(LOG2FILE)
  /* Patch a three-digit thread rank into the log file name. */
  for (i = 0; i < tpp; i++) {
    fname[10] = '0' + th_rank[i] / 100;
    fname[11] = '0' + th_rank[i] % 100 / 10;
    fname[12] = '0' + th_rank[i] % 10;
    dbg[i] = fopen(fname, "w");
  }
#endif
  for (i = 0; i < tpp; i++) {
    prndbg(i, "proc %d, thread %d(%d):\n", rank, i, th_rank[i]);
  }

  /* set global seed (to ensure same random sequence across procs) */
  time_seed = (unsigned)time(NULL);
  armci_msg_brdcst(&time_seed, sizeof(time_seed), 0);
  srand(time_seed);
  rand();
  prndbg(0, "seed = %u\n", time_seed);

  /* random pairs */
  pairs = calloc(th_size, sizeof(int));
  assert(pairs != NULL);
  for (i = 0; i < th_size; i++) {
    pairs[i] = -1;
  }
  /*
   * NOTE(review): every entry must find an unpaired partner other than
   * itself, so this loop does not appear to terminate when th_size is odd.
   * Confirm that callers only use even totals.
   */
  for (i = 0; i < th_size; i++) {
    if (pairs[i] != -1) {
      continue;
    }
    r = RND(0, th_size);
    while (i == r || pairs[r] != -1) {
      r = RND(0, th_size);
    }
    pairs[i] = r;
    pairs[r] = i;
  }
  /* NOTE(review): cbuf is filled with unbounded sprintf; its capacity is not visible here. */
  for (i = 0, cbufl = 0; i < th_size; i++) {
    cbufl += sprintf(cbuf + cbufl, " %d->%d|%d->%d",
                     i, pairs[i], pairs[i], pairs[pairs[i]]);
  }
  prndbg(0, "random pairs:%s\n", cbuf);

  /* random targets */
  rnd_tgts = calloc(th_size, sizeof(int));
  assert(rnd_tgts != NULL);
  for (i = 0, cbufl = 0; i < th_size; i++) {
    rnd_tgts[i] = RND(0, th_size);
    if (rnd_tgts[i] == i) {
      /* A thread may not target itself: redo this index. */
      i--;
      continue;
    }
    cbufl += sprintf(cbuf + cbufl, " %d", rnd_tgts[i]);
  }
  prndbg(0, "random targets:%s\n", cbuf);

  /* random one */
  rnd_one = RND(0, th_size);
  prndbg(0, "random one = %d\n", rnd_one);

  /*
   * Allocate outside assert(): an expression inside assert() is not
   * evaluated when NDEBUG is defined, which would skip the allocation.
   */
  ptrs1 = calloc(th_size, sizeof(void *));
  assert(ptrs1 != NULL);
  ptrs2 = calloc(th_size, sizeof(void *));
  assert(ptrs2 != NULL);
#ifdef NOTHREADS
  thread_main((void *)(long)0);
#else
  for (i = 0; i < tpp; i++) {
    THREAD_CREATE(threads + i, thread_main, (void *)(long)i);
  }
  for (i = 0; i < tpp; i++) {
    THREAD_JOIN(threads[i], NULL);
  }
#endif

  ARMCI_Barrier();
  PRINTF0("Tests Completed\n");

  /* clean up */
#if defined(DEBUG) && defined(LOG2FILE)
  for (i = 0; i < tpp; i++) {
    fclose(dbg[i]);
  }
#endif
  ARMCI_Finalize();
  TH_FINALIZE();
  armci_msg_finalize();

  return 0;
}
예제 #8
0
파일: lu.c 프로젝트: jeffhammond/ga
/*
 * Driver for the blocked dense LU factorisation benchmark.
 *
 * Command-line options (parsed with getopt):
 *   -n N   matrix dimension (stored in `n`)
 *   -b N   block dimension (stored in `block_size`)
 *   -p N   overrides the processor count `nproc`
 *   -h     print usage, synchronise, finalise and exit
 *
 * The function picks a num_rows x num_cols factorisation of nproc, counts
 * the bytes owned by this process according to block_owner(), obtains that
 * memory (MPI window under MPI2_ONESIDED, ARMCI_Malloc otherwise), points
 * each entry of `a` at its block, and then runs lu() once untimed followed
 * by `nloop` timed repetitions.  Returns 0.
 */
int main(int argc, char *argv[])
{
    int i, j;
    int ch;
    int edge;
    int size;
    int nloop=5;
    double **ptr_loc;
    
    armci_msg_init(&argc,&argv);
    nproc = armci_msg_nproc();
    me = armci_msg_me();
    
    while ((ch = getopt(argc, argv, "n:b:p:h")) != -1) {
        switch(ch) {
            case 'n': n = atoi(optarg); break;
            case 'b': block_size = atoi(optarg); break;
            case 'p': nproc = atoi(optarg); break;
            case 'h': {
                printf("Usage: LU, or \n");
                printf("       LU -nMATRIXSIZE -bBLOCKSIZE -pNPROC\n");
                armci_msg_barrier();
                armci_msg_finalize();
                exit(0);
            }
            default:
                break;
        }
    }
    
    /*
     * block_size and nproc are used as divisors below and n sizes the block
     * grid, so reject non-positive values before any arithmetic is done.
     */
    if (n <= 0 || block_size <= 0 || nproc <= 0) {
        if(me == 0) {
            fprintf(stderr, "Matrix size, block size and processor count must be positive\n");
        }
        armci_msg_barrier();
        armci_msg_finalize();
        exit(-1);
    }
    
    if(me == 0) {
        printf("\n Blocked Dense LU Factorization\n");
        printf("     %d by %d Matrix\n", n, n);
        printf("     %d Processors\n", nproc);
        printf("     %d by %d Element Blocks\n", block_size, block_size);
        printf("\n");
    }
    
    /* Largest num_rows <= sqrt(nproc) that divides nproc evenly. */
    num_rows = (int) sqrt((double) nproc);
    for (;;) {
        num_cols = nproc/num_rows;
        if (num_rows*num_cols == nproc)
            break;
        num_rows--;
    }
    
    /* Number of blocks per dimension, rounded up. */
    nblocks = n/block_size;
    if (block_size * nblocks != n) {
        nblocks++;
    }
    
    /* Extent of the last (possibly partial) block in each dimension. */
    edge = n%block_size;
    if (edge == 0) {
        edge = block_size;
    }
    
#ifdef DEBUG
    /* Debug builds print the block-owner map and stop here. */
    if(me == 0)
        for (i=0;i<nblocks;i++) {
            for (j=0;j<nblocks;j++) 
                printf("%d ", block_owner(i, j));
            printf("\n");
        }
    armci_msg_barrier();
    armci_msg_finalize();
    exit(0);
#endif
    
    /* Add up the storage needed for the blocks this process owns. */
    for (i=0;i<nblocks;i++) {
        for (j=0;j<nblocks;j++) {
            if(block_owner(i,j) == me) {
                if ((i == nblocks-1) && (j == nblocks-1)) {
                    size = edge*edge;
                }
                else if ((i == nblocks-1) || (j == nblocks-1)) {
                    size = edge*block_size;
                }
                else {
                    size = block_size*block_size;
                }
                proc_bytes += size*sizeof(double);
            }
        }
    }
    
    ptr = (void **)malloc(nproc * sizeof(void *));
    if (ptr == NULL) {
        fprintf(stderr, "Could not malloc memory for ptr\n");
        exit(-1);
    }
#ifdef MPI2_ONESIDED
    MPI_Alloc_mem(proc_bytes, MPI_INFO_NULL, &ptr[me]);
    MPI_Win_create((void*)ptr[me], proc_bytes, 1, MPI_INFO_NULL,
                   MPI_COMM_WORLD, &win);
    for(i=0; i<nproc; i++) ptr[i] = (double *)ptr[me];
    MPI_Barrier(MPI_COMM_WORLD);
    
#else
    /* initialize ARMCI */
    ARMCI_Init();
    ARMCI_Malloc(ptr, proc_bytes);
#endif
    
    a = (double **)malloc(nblocks*nblocks*sizeof(double *));
    if (a == NULL) {
        fprintf(stderr, "Could not malloc memory for a\n");
        exit(-1);
    } 
    ptr_loc = (double **)malloc(nproc*sizeof(double *));
    if (ptr_loc == NULL) {
        fprintf(stderr, "Could not malloc memory for ptr_loc\n");
        exit(-1);
    }
    /*
     * ptr_loc[p] is a running cursor into process p's region: each block is
     * placed at its owner's cursor, which then advances by the block size.
     */
    for(i=0; i<nproc; i++) ptr_loc[i] = (double *)ptr[i];
    for(i=0; i<nblocks;i ++) {
        for(j=0; j<nblocks; j++) {
            a[i+j*nblocks] = ptr_loc[block_owner(i, j)];
            if ((i == nblocks-1) && (j == nblocks-1)) {
                size = edge*edge;
            } else if ((i == nblocks-1) || (j == nblocks-1)) {
                size = edge*block_size;
            } else {
                size = block_size*block_size;
            }
            ptr_loc[block_owner(i, j)] += size;
        }
    }
    /* The cursors are only needed while `a` is being filled in. */
    free(ptr_loc);
    
    /* initialize the array */
    init_array();
    
    /* barrier to ensure all initialization is done */
    armci_msg_barrier();

    /* to remove cold-start misses, all processors touch their own data */
    touch_array(block_size, me);
    armci_msg_barrier();

    if(doprint) {
        if(me == 0) {
            printf("Matrix before LU decomposition\n");
            print_array(me); 
        }
        armci_msg_barrier();
    }
    
    lu(n, block_size, me); /* cold start */

    /* Starting the timer */

    armci_msg_barrier();
    if(me == 0) start_timer();
    for(i=0; i<nloop; i++) lu(n, block_size, me);    
    armci_msg_barrier();

    /* Timer Stops here */
    if(me == 0) 
        printf("\nRunning time = %f milliseconds.\n\n",  elapsed_time()/nloop);
    printf("%d: (ngets=%d) Communication (get) time = %e milliseconds\n", me, get_cntr, comm_time*1000/nloop);
    
    if(doprint) {        
        if(me == 0) {
            printf("after LU\n");
            print_array(me);
        }
        armci_msg_barrier();
    }
    
    /* done */
#ifdef MPI2_ONESIDED
    MPI_Win_free(&win);
    MPI_Free_mem(ptr[me]);
#else
    ARMCI_Free(ptr[me]);
    ARMCI_Finalize();
#endif
    armci_msg_finalize();

    return 0;
}
예제 #9
0
/*
 * Driver for the ARMCI checkpoint/restart test.
 *
 * Phase 1 times do_work() for sizes 1, 2, 4, ... up to SIZE_ without any
 * checkpointing and stores the timings in time_array[].
 * Phase 2 registers this process's slice of ptr_arr with the checkpoint
 * layer, then for sizes 128, 256, ... up to SIZE_ calls ARMCI_Ckpt() before
 * doing work and stores the timings in time_array1[].  When the size equals
 * FAILURE_SIZE_ and no restart has been reported for this iteration, a
 * failure is simulated with ARMCI_Restart_simulate().
 *
 * Returns 0.
 */
int main(int argc, char *argv[])
{
  int rc, i, j = 0, rid, ret;
  armci_ckpt_ds_t ckptds;
  ARMCI_Group grp;

  ARMCI_Init_args(&argc, &argv);
  nproc = armci_msg_nproc();
  me = armci_msg_me();

  if (me == 0) {
    if (nproc > MAXPROCS) {
      ARMCI_Error("nproc > MAXPROCS", nproc);
    }
    else {
      printf("ARMCI test program (%d processes)\n", nproc);
      fflush(stdout);
      sleep(1);
    }

  }
  armci_init_checkpoint2();
  ARMCI_Group_get_world(&grp);
  size = SIZE_;
  rc = ARMCI_Malloc((void **)ptr_arr, size * 8);
  /* The buffer is handed to the checkpoint layer below, so stop if the allocation failed. */
  if (rc != 0) {
    ARMCI_Error("ARMCI_Malloc failed", rc);
  }
  printf("ARMCI test program (%d processes)\n", nproc);
  fflush(stdout);

  /* Phase 1: baseline timings without checkpointing. */
  for (size = 1; size <= SIZE_; size *= 2) {
    t1 = MPI_Wtime();
    for (i = 0; i < 5; i++) {
      for (rc = 0; rc < 15; rc++) {
        do_work(size);
      }
    }
    time_array[j++] = MPI_Wtime() - t1;
    ARMCI_Barrier();
    printf("%d:done for size %ld\n", me, size);
    fflush(stdout);
  }

  /* Register this process's buffer as the single checkpointed region. */
  (void)ARMCI_Ckpt_create_ds(&ckptds, 1);
  ckptds.ptr_arr[0] = ptr_arr[me];
  ckptds.sz[0] = SIZE_ * 8;
  rid = ARMCI_Ckpt_init(NULL, &grp, 1, 0, &ckptds);
  printf("%d: After ARMCI_Ckpt_init(): \n", me);

  /* Phase 2: the same work with a checkpoint taken at the top of each size. */
  j = 0;
  for (size = 128; size <= SIZE_; size *= 2) {

    int simulate_restart = 1;
    t1 = MPI_Wtime();

    ret = ARMCI_Ckpt(rid);
    if (ret == ARMCI_CKPT) {
      printf("%d: Performed CHECKPOINT @ size=%ld\n", me, size);
    }
    else if (ret == ARMCI_RESTART) {
      /* A restart was just reported: do not simulate another failure. */
      simulate_restart = 0;
      printf("%d: Performed RESTART @ size=%ld\n", me, size);
    }

    for (i = 0; i < 5; i++) {
      /*
       * NOTE(review): this inner loop has an empty body, so do_work() runs
       * once per outer iteration here, whereas the baseline loop above runs
       * it 15 times per outer iteration.  Behaviour is kept as found;
       * confirm whether do_work() was meant to sit inside the inner loop.
       */
      for (rc = 0; rc < 15; rc++) {
        if (i == 3 && rc == 10) {
        }
      }
      do_work(size);
    }

    time_array1[j++] = MPI_Wtime() - t1;
    sleep(1);

    if (simulate_restart && size == FAILURE_SIZE_) {
      /* size is printed with %ld everywhere else in this function; match it here. */
      printf("%d: Simulating FAILURE @ size = %ld\n", me, size);
      ARMCI_Restart_simulate(rid, 1);
    }

    printf("%d: DONE for size=%ld regular=%f withckpt=%f\n\n",
           me, size, time_array[j-1], time_array1[j-1]);
    fflush(stdout);

  }

  ARMCI_Ckpt_finalize(rid);

  printf("Before Finalize()\n");
  ARMCI_Barrier();
  ARMCI_Finalize();
  armci_msg_finalize();
  return(0);
}
예제 #10
0
/*
 * Driver for the blocked dense LU factorisation benchmark, variant that
 * groups processes into nodes of four.
 *
 * Command-line options (parsed with getopt):
 *   -n N   matrix dimension (stored in `n`)
 *   -b N   block dimension (stored in `block_size`)
 *   -p N   overrides the processor count `nproc`
 *   -h     print usage, synchronise, finalise and exit
 *
 * The function derives nnodes, num_rows, num_cols and num from nproc and
 * the block grid, counts the bytes owned by this process according to
 * block_owner(), allocates them with ARMCI_Malloc, points each entry of `a`
 * at its block, and times a single lu() call.  Returns 0.
 */
int main(int argc, char *argv[])
{
    int i, j;
    int ch;
    int edge;
    int size;

    /* ARMCI */
    void **ptr;
    double **ptr_loc;

    armci_msg_init(&argc,&argv);
    nproc = armci_msg_nproc();
    me = armci_msg_me();

    while ((ch = getopt(argc, argv, "n:b:p:h")) != -1) {
        switch(ch) {
        case 'n':
            n = atoi(optarg);
            break;
        case 'b':
            block_size = atoi(optarg);
            break;
        case 'p':
            nproc = atoi(optarg);
            break;
        case 'h': {
            printf("Usage: LU, or \n");
            printf("       LU -nMATRIXSIZE -bBLOCKSIZE -pNPROC\n");
            armci_msg_barrier();
            armci_msg_finalize();
            exit(0);
        }
        default:
            break;
        }
    }

    /*
     * block_size is used as a divisor and nproc determines nnodes, which is
     * a divisor too; n sizes the block grid.  Reject non-positive values
     * before any arithmetic is done.
     */
    if (n <= 0 || block_size <= 0 || nproc <= 0) {
        if(me == 0) {
            fprintf(stderr, "Matrix size, block size and processor count must be positive\n");
        }
        armci_msg_barrier();
        armci_msg_finalize();
        exit(-1);
    }

    if(me == 0) {
        printf("\n Blocked Dense LU Factorization\n");
        printf("     %d by %d Matrix\n", n, n);
        printf("     %d Processors\n", nproc);
        printf("     %d by %d Element Blocks\n", block_size, block_size);
        printf("\n");
    }

    /* Number of blocks per dimension, rounded up. */
    nblocks = n/block_size;
    if (block_size * nblocks != n) {
        nblocks++;
    }

    /*
     * Four processes per node, laid out 2 x 2.  When nproc is not a
     * multiple of four, one extra node is counted and the layout becomes a
     * single row holding the leftover processes.
     */
    nnodes = nproc / 4;
    if((nnodes * 4) != nproc) {
        num_cols = nproc - nnodes * 4;
        nnodes++;
        num_rows = 1;
    }
    else {
        num_cols = 2;
        num_rows = 2;
    }

    /* Blocks per node, rounded up. */
    num = (nblocks * nblocks)/nnodes;
    if((num * nnodes) != (nblocks * nblocks))
        num++;

#ifdef DEBUG
    /* Debug builds print the block-owner map and stop here. */
    if(me == 0)
        for (i=0; i<nblocks; i++) {
            for (j=0; j<nblocks; j++)
                printf("%d ", block_owner(i, j));
            printf("\n");
        }
    armci_msg_barrier();
    armci_msg_finalize();
    exit(0);
#endif

    /* Extent of the last (possibly partial) block in each dimension. */
    edge = n%block_size;
    if (edge == 0) {
        edge = block_size;
    }

    /* Add up the storage needed for the blocks this process owns. */
    for (i=0; i<nblocks; i++) {
        for (j=0; j<nblocks; j++) {
            if(block_owner(i,j) == me) {
                if ((i == nblocks-1) && (j == nblocks-1)) {
                    size = edge*edge;
                }
                else if ((i == nblocks-1) || (j == nblocks-1)) {
                    size = edge*block_size;
                }
                else {
                    size = block_size*block_size;
                }
                proc_bytes += size*sizeof(double);
            }
        }
    }

    /* initialize ARMCI */
    ARMCI_Init_args(&argc, &argv);
    ptr = (void **)malloc(nproc * sizeof(void *));
    if (ptr == NULL) {
        fprintf(stderr, "Could not malloc memory for ptr\n");
        exit(-1);
    }
    ARMCI_Malloc(ptr, proc_bytes);

    a = (double **)malloc(nblocks*nblocks*sizeof(double *));
    if (a == NULL) {
        fprintf(stderr, "Could not malloc memory for a\n");
        exit(-1);
    }
    ptr_loc = (double **)malloc(nproc*sizeof(double *));
    if (ptr_loc == NULL) {
        fprintf(stderr, "Could not malloc memory for ptr_loc\n");
        exit(-1);
    }
    /*
     * ptr_loc[p] is a running cursor into process p's region: each block is
     * placed at its owner's cursor, which then advances by the block size.
     */
    for(i=0; i<nproc; i++) ptr_loc[i] = (double *)ptr[i];
    for(i=0; i<nblocks; i ++) {
        for(j=0; j<nblocks; j++) {
            a[i+j*nblocks] = ptr_loc[block_owner(i, j)];
            if ((i == nblocks-1) && (j == nblocks-1)) {
                size = edge*edge;
            } else if ((i == nblocks-1) || (j == nblocks-1)) {
                size = edge*block_size;
            } else {
                size = block_size*block_size;
            }
            ptr_loc[block_owner(i, j)] += size;
        }
    }
    /* The cursors are only needed while `a` is being filled in. */
    free(ptr_loc);

    /* initialize the array */
    init_array();

    /* barrier to ensure all initialization is done */
    armci_msg_barrier();

    /* to remove cold-start misses, all processors touch their own data */
    touch_array(block_size, me);
    armci_msg_barrier();

    if(doprint) {
        if(me == 0) {
            printf("Matrix before LU decomposition\n");
            print_array(me);
        }
        armci_msg_barrier();
    }


    /* Starting the timer */
    if(me == 0) start_timer();

    lu(n, block_size, me);

    armci_msg_barrier();

    /* Timer Stops here */
    if(me == 0)
        printf("\nRunning time = %f milliseconds.\n\n",  elapsed_time());

    if(doprint) {
        if(me == 0) {
            printf("after LU\n");
            print_array(me);
        }
        armci_msg_barrier();
    }

    /* done */
    ARMCI_Free(ptr[me]);
    /* The address table itself came from malloc() in this function. */
    free(ptr);
    ARMCI_Finalize();
    armci_msg_finalize();

    return 0;
}
예제 #11
0
파일: lu-block-th.c 프로젝트: bcernohous/ga
/*
 * Driver for the multi-threaded blocked dense LU factorisation benchmark.
 *
 * Command-line options (parsed with getopt):
 *   -n N   matrix dimension (stored in `n`)
 *   -b N   block dimension (stored in `block_size`)
 *   -p N   overrides the processor count `nproc`
 *   -t N   threads per process (stored in `th_per_p`, capped at MAX_THREADS)
 *   -d N   when non-zero, print rank and pid to stderr and sleep N seconds
 *   -h     print usage, synchronise, finalise and exit
 *
 * Blocks are distributed over nthreads = th_per_p * nproc owners as decided
 * by block_owner().  The function counts the doubles owned by each local
 * thread, allocates the per-process total with ARMCI_Malloc, points each
 * entry of `a` at its block, and then starts one lu() thread per local
 * thread and joins them.  Returns 0.
 */
int main(int argc, char *argv[])
{
    int i, j, l;
    int ch;
    extern char *optarg;
    int edge;
    int size;
    int lu_arg[MAX_THREADS][3];
    
    /* ARMCI */
    void **ptr;
    double **ptr_loc;

    THREAD_LOCK_INIT(mutex);
    
    armci_msg_init(&argc,&argv);
    nproc = armci_msg_nproc();
    me = armci_msg_me();
    
    while ((ch = getopt(argc, argv, "n:b:p:t:d:h")) != -1) {
        switch(ch) {
            case 'n': n = atoi(optarg); break;
            case 'b': block_size = atoi(optarg); break;
            case 'p': nproc = atoi(optarg); break;
            case 't': th_per_p = atoi(optarg); break;
            case 'd': d = atoi(optarg); break;
            case 'h': {
                printf("Usage: LU, or \n");
                printf("       LU -nMATRIXSIZE -bBLOCKSIZE -pNPROC -tTH_PER_P\n");
                armci_msg_barrier();
                armci_msg_finalize();
                exit(0);
            }
            default:
                break;
        }
    }

    /*
     * block_size is used as a divisor, and nproc and th_per_p multiply into
     * nthreads, which is a divisor too; n sizes the block grid.  Reject
     * non-positive values before any arithmetic is done.
     */
    if (n <= 0 || block_size <= 0 || nproc <= 0 || th_per_p <= 0) {
        if(me == 0) {
            fprintf(stderr, "Matrix size, block size, processor count and threads per processor must be positive\n");
        }
        armci_msg_barrier();
        armci_msg_finalize();
        exit(-1);
    }

    if(th_per_p>MAX_THREADS) {
        th_per_p=MAX_THREADS;
        /* Terminate the line so the warning does not run into the banner below. */
        if(me==0)printf("Warning: cannot run more than %d threads, adjust MAX_THREADS\n",MAX_THREADS);
    }

    if (d) {
        fprintf(stderr, "%d: %d\n", me, (int)getpid());
        sleep(d);
    }

    nthreads = th_per_p * nproc;
    if(me == 0) {
        printf("\n Blocked Dense LU Factorization\n");
        printf("     %d by %d Matrix\n", n, n);
        printf("     %d Processors\n", nproc);
        printf("     %d thread(s) per processor, %d threads total\n", th_per_p, nthreads);
        printf("     %d by %d Element Blocks\n", block_size, block_size);
        printf("\n");
    }
    
    /* Largest num_rows <= sqrt(nthreads) that divides nthreads evenly. */
    num_rows = (int) sqrt((double) nthreads);
    for (;;) {
        num_cols = nthreads/num_rows;
        if (num_rows*num_cols == nthreads)
            break;
        num_rows--;
    }
    
    /* Number of blocks per dimension, rounded up. */
    nblocks = n/block_size;
    if (block_size * nblocks != n) {
        nblocks++;
    }
    
    /* Blocks per thread, rounded up. */
    num = (nblocks * nblocks)/nthreads;
    if((num * nthreads) != (nblocks * nblocks))
        num++;

    /* Extent of the last (possibly partial) block in each dimension. */
    edge = n%block_size;
    if (edge == 0) {
        edge = block_size;
    }
#ifdef DEBUG
    /* Debug builds print the block-owner map and then carry on. */
    if(me == 0)
        for (i=0;i<nblocks;i++) {
            for (j=0;j<nblocks;j++) 
                printf("%d ", block_owner(i, j));
            printf("\n");
        }
    armci_msg_barrier();
#endif
    
    /* Count the doubles owned by each local thread and the per-process total in bytes. */
    for (l = 0; l < th_per_p; l++) {
        me_th[l] = me * th_per_p + l;
        for (i=0;i<nblocks;i++) {
            for (j=0;j<nblocks;j++) {
                if(block_owner(i,j) == me_th[l]) {
                    if ((i == nblocks-1) && (j == nblocks-1)) {
                        size = edge*edge;
                    }
                    else if ((i == nblocks-1) || (j == nblocks-1)) {
                        size = edge*block_size;
                    }
                    else {
                        size = block_size*block_size;
                    }
                    thread_doubles[l] += size;
                }
            }
        }
        proc_bytes += thread_doubles[l] * sizeof(double);
    }

    /* initialize ARMCI */
    ARMCI_Init();
    ptr = (void **)malloc(nproc * sizeof(void *));
    if (ptr == NULL) {
        fprintf(stderr, "Could not malloc memory for ptr\n");
        exit(-1);
    }
    ARMCI_Malloc(ptr, proc_bytes);

    a = (double **)malloc(nblocks*nblocks*sizeof(double *));
    if (a == NULL) {
        fprintf(stderr, "Could not malloc memory for a\n");
        exit(-1);
    }
    ptr_loc = (double **)malloc(nthreads*sizeof(double *));
    if (ptr_loc == NULL) {
        fprintf(stderr, "Could not malloc memory for ptr_loc\n");
        exit(-1);
    }
    /*
     * ptr_loc[t] is a running cursor into global thread t's region.
     * NOTE(review): the offsets for every process i are derived from
     * thread_doubles[], which was computed above for this process's own
     * threads only -- confirm that all processes share the same per-thread
     * sizes.
     */
    for (i = 0; i < nproc; i++) {
        ptr_loc[i * th_per_p] = (double *)ptr[i];
        for (j = 1; j < th_per_p; j++)
            ptr_loc[i * th_per_p + j] = ptr_loc[i * th_per_p + j - 1] + thread_doubles[j - 1];
    }
    for(i=0; i<nblocks;i ++) {
        for(j=0; j<nblocks; j++) {
            a[i+j*nblocks] = ptr_loc[block_owner(i, j)];
            if ((i == nblocks-1) && (j == nblocks-1)) {
                size = edge*edge;
            } else if ((i == nblocks-1) || (j == nblocks-1)) {
                size = edge*block_size;
            } else {
                size = block_size*block_size;
            }
            ptr_loc[block_owner(i, j)] += size;
        }
    }
    /* The cursors are only needed while `a` is being filled in. */
    free(ptr_loc);
#if 0
    for(i=0; i<nblocks*nblocks;i ++) printf("%d: a[%d]=%p\n", me, i, a[i]);
    fflush(stdout);
#endif
    
    /* initialize the array */
    init_array();
    
    /* barrier to ensure all initialization is done */
    armci_msg_barrier();

    /* The touch_array() warm-up pass is disabled in this variant; only the barrier remains. */
    armci_msg_barrier();

    if(doprint) {
        if(me == 0) {
            printf("Matrix before LU decomposition\n");
            print_array(me); 
        }
        armci_msg_barrier();
    }

#if 1
    for (i = 0; i < nblocks; i++)
        for (j = 0; j < nblocks; j++)
            print_block_dbg(a[i + j * nblocks], "proc %d, a[%d, %d]:\n", me, i, j);
#endif

    TH_INIT(nproc,th_per_p);

    /* Starting the timer */
    if(me == 0) start_timer();

    /* Each thread receives {n, block_size, local thread index}. */
    for (l = 0; l < th_per_p; l++) {
        lu_arg[l][0] = n;
        lu_arg[l][1] = block_size;
        lu_arg[l][2] = l;
        THREAD_CREATE(threads + l, lu, lu_arg[l]);
    }
    
    for (l = 0; l < th_per_p; l++) THREAD_JOIN(threads[l], NULL);
    armci_msg_barrier();

    /* Timer Stops here */
    if(me == 0) 
        printf("\nRunning time = %lf milliseconds.\n\n",  elapsed_time());

    if(doprint) {        
        if(me == 0) {
            printf("after LU\n");
            print_array(me);
        }
        armci_msg_barrier();
    }
    
    /* done */
    ARMCI_Free(ptr[me]);
    /* The address table itself came from malloc() in this function. */
    free(ptr);
    ARMCI_Finalize();
    armci_msg_finalize();

    THREAD_LOCK_DESTROY(mutex);

    return 0;
}