Example #1
0
/*
 * Driver for the 1-D transpose example: starts the message-passing layer and
 * ARMCI, has rank 0 report the process count, runs TRANSPOSE1D(), and then
 * shuts ARMCI and the message layer down.
 */
int main(int argc, char **argv) {
    int rank;
    int world_size;

    /* Message-passing layer first, ARMCI second. */
    armci_msg_init(&argc, &argv);
    ARMCI_Init();

    /* The MA allocator set-up that once followed here is disabled. */

    rank       = armci_msg_me();
    world_size = armci_msg_nproc();

    if (0 == rank) {
        printf("\nUsing %d processes\n\n", world_size);
        fflush(stdout);
    }

    TRANSPOSE1D();

    if (0 == rank) {
        printf("\nTerminating ..\n");
    }

    ARMCI_Finalize();
    armci_msg_finalize();

    return 0;
}
Example #2
0
/*
 * Contiguous-transfer benchmark driver: runs contig_test() for put, get and
 * accumulate in turn. Rank 0 prints the table header once and a banner
 * before each phase.
 */
int main(int argc, char **argv)
{
    /* Start the message layer and ARMCI, then record rank and job size. */
    armci_msg_init(&argc, &argv);
    ARMCI_Init_args(&argc, &argv);
    me    = armci_msg_me();
    nproc = armci_msg_nproc();

    /* This test only works for two processes. */
    assert(2 == nproc);

    if (me == 0) {
        printf("msg size (bytes)     avg time (us)    avg b/w (MB/sec)\n");
        printf("#PNNL comex Put Test\n");
    }
    contig_test(MAX_MESSAGE_SIZE, PUT);

    if (me == 0) {
        printf("#PNNL comex Get Test\n");
    }
    contig_test(MAX_MESSAGE_SIZE, GET);

    if (me == 0) {
        printf("#PNNL comex Accumulate Test\n");
    }
    contig_test(MAX_MESSAGE_SIZE, ACC);

    ARMCI_Finalize();
    armci_msg_finalize();

    return 0;
}
Example #3
0
/*
 * armci_notify test driver: after start-up, rank 0 prints a banner, then
 * every process runs test_notify() for each dimensionality 1..MAXDIMS.
 */
int main(int argc, char *argv[])
{
  int dim_count;

  armci_msg_init(&argc, &argv);
  ARMCI_Init_args(&argc, &argv);
  nproc = armci_msg_nproc();
  me = armci_msg_me();

  /* Barriers on both sides of the banner keep it ahead of the test output. */
  ARMCI_Barrier();
  if (0 == me) {
    printf("\nTesting armci_notify\n");
    fflush(stdout);
    sleep(1);
  }
  ARMCI_Barrier();

  dim_count = 1;
  while (dim_count <= MAXDIMS) {
    test_notify(dim_count);
    dim_count++;
  }
  ARMCI_Barrier();

  ARMCI_Finalize();
  armci_msg_finalize();
  return 0;
}
Example #4
0
/*
 * Check on process 0 that the distributed array B holds the expected
 * reversed sequence.
 *
 * b_ptr : per-process base addresses of array B, indexed by process id
 * dims  : dims[0] is the global element count
 * map   : map[i] is the first (zero-based) global index owned by process i
 *
 * Process 0 fetches each process's chunk with ARMCI_Get and compares global
 * element k against dims[0] - k, printing every mismatch followed by a
 * one-line verdict. All other processes return without doing any work.
 */
void VERIFY(void **b_ptr, int *dims, int *map) {
    int i, j, capacity, count, icnt, ichk, lmin, lmax, rc;
    int *buf;
    int me, nprocs;

    /* Find local processor ID and number of processors */
    me     = armci_msg_me();
    nprocs = armci_msg_nproc();

    /* Only process 0 verifies, so only it needs the staging buffer. */
    if (me != 0) {
      return;
    }

    /* Staging buffer of dims[0]/nprocs + 1 elements, intended to hold the
       largest chunk any single process owns. */
    capacity = (int)(((double)dims[0])/((double)nprocs)) + 1;
    buf = (int*)malloc(capacity*sizeof(int));
    if (buf == NULL) {
      ARMCI_Error("VERIFY: malloc failed", capacity);
      return;
    }

    icnt = 0;
    ichk = 0;
    for (i=0; i<nprocs; i++) {
      /* Find min and max indices owned by processor i */
      lmin = map[i];
      if (i<nprocs-1) {
        lmax = map[i+1]-1;
      } else {
        lmax = dims[0]-1;
      }
      count = lmax-lmin+1;

      /* Never let a remote chunk overrun the staging buffer. */
      if (count > capacity) {
        ARMCI_Error("VERIFY: chunk larger than staging buffer", count);
        free(buf);
        return;
      }

      /* Fetch processor i's chunk of B (length is given in bytes). */
      rc = ARMCI_Get(b_ptr[i], (void*)buf, (int)(sizeof(int)*count), i);
      if (rc != 0) {
        ARMCI_Error("VERIFY: ARMCI_Get failed", rc);
        free(buf);
        return;
      }

      /* check values in buffer; icnt is the running global index */
      for (j=0; j<count; j++) {
        if (buf[j] != dims[0] - icnt) {
          printf("Error found for element %d b: %d != a: %d\n",
              icnt,buf[j],dims[0]-icnt);
          ichk = 1;
        }
        icnt++;
      }
    }

    if (ichk == 0) {
      printf("1D transpose successful. No errors found\n");
    } else {
      printf("1D transpose failed\n");
    }
    free(buf);
}
Example #5
0
/*
 * Populate ARMCI_World_Proc_Group and make it the default process group.
 *
 * With ARMCI_GROUP defined the world group is described by an explicit
 * process list (0..nproc-1) built from the message layer; otherwise it is
 * described by MPI_COMM_WORLD and its MPI group. In both configurations the
 * group attributes are cached through armci_cache_attr().
 */
void armci_group_init()
{
    ARMCI_iGroup *world = (ARMCI_iGroup *)&ARMCI_World_Proc_Group;
#ifdef ARMCI_GROUP
    int idx;

    /* Set up the world proc group from the message layer's view of the job. */
    world->grp_attr.nproc  = armci_msg_nproc();
    world->grp_attr.grp_me = armci_msg_me();

    /* Identity list: entry idx of the world group is process idx. */
    world->grp_attr.proc_list = (int *)malloc(world->grp_attr.nproc*sizeof(int));
    assert(world->grp_attr.proc_list != NULL);
    idx = 0;
    while (idx < world->grp_attr.nproc) {
      world->grp_attr.proc_list[idx] = idx;
      idx++;
    }

    world->grp_attr.grp_clus_info = NULL;
    armci_cache_attr((ARMCI_Group*)&ARMCI_World_Proc_Group);
#else
    int world_rank;

    /* Save the MPI world communicator and group in ARMCI_World_Proc_Group. */
    world->icomm = MPI_COMM_WORLD;
    MPI_Comm_group(MPI_COMM_WORLD, &(world->igroup));

    /* Only processes that belong to the group cache its attributes. */
    MPI_Group_rank((MPI_Group)(world->igroup), &world_rank);
    if (MPI_UNDEFINED != world_rank) {
       armci_cache_attr((ARMCI_Group*)&ARMCI_World_Proc_Group);
    }
#endif

    /* Initially, the world group is the default group. */
    ARMCI_Default_Proc_Group = ARMCI_World_Proc_Group;
}
/*
 * Sparse matrix-vector multiplication test driver: starts the message layer,
 * enforces the MAXPROC limit on rank 0, initializes ARMCI, runs
 * test_sparse(), and reports success from rank 0 before shutting down.
 */
int main(int argc, char* argv[])
{
    armci_msg_init(&argc, &argv);
    nproc = armci_msg_nproc();
    me = armci_msg_me();

    if (0 == me) {
        /* The process-count limit is checked on rank 0 only. */
        if (nproc > MAXPROC) {
            ARMCI_Error("Test works for up to %d processors\n",MAXPROC);
        }

        printf("ARMCI test program (%d processes)\n",nproc);
        fflush(stdout);
        sleep(1);
    }

    ARMCI_Init();

    if (0 == me) {
        printf("\n  Performing Sparse Matrix-Vector Multiplication ...\n\n");
        fflush(stdout);
    }
    test_sparse();

    /* Fence and barrier before declaring success. */
    ARMCI_AllFence();
    armci_msg_barrier();
    if (0 == me) {
        printf("\nSuccess!!\n");
        fflush(stdout);
    }
    sleep(2);

    armci_msg_barrier();
    ARMCI_Finalize();
    armci_msg_finalize();
    return 0;
}
Example #7
0
/*
 * Aggregate put/get test driver: initializes ARMCI, enforces the MAXPROC
 * limit on rank 0, runs test_aggregate() once with argument 1 (cold start)
 * and once with 0 (warm start), then reports success and shuts down.
 */
int main(int argc, char *argv[])
{
  ARMCI_Init_args(&argc, &argv);
  nproc = armci_msg_nproc();
  me = armci_msg_me();

  if (0 == me) {
    /* The process-count limit is checked on rank 0 only. */
    if (nproc > MAXPROC) {
      ARMCI_Error("Test works for up to %d processors\n", MAXPROC);
    }

    printf("ARMCI test program (%d processes)\n", nproc);
    fflush(stdout);
    sleep(1);

    printf("\nAggregate put/get requests\n\n");
    fflush(stdout);
  }

  test_aggregate(1); /* cold start */
  test_aggregate(0); /* warm start */

  /* Fence and barrier before declaring success. */
  ARMCI_AllFence();
  ARMCI_Barrier();
  if (0 == me) {
    printf("\nSuccess!!\n");
    fflush(stdout);
  }
  sleep(2);

  ARMCI_Barrier();
  ARMCI_Finalize();
  armci_msg_finalize();
  return 0;
}
Example #8
0
/** Map process IDs onto a binary tree.
  *
  * For any scope other than SCOPE_NODE the tree is the implicit heap layout
  * over ranks 0..nproc-1: the parent of rank r is (r-1)/2 and its children
  * are 2r+1 and 2r+2. A missing parent or child is reported as -1.
  *
  * For SCOPE_NODE the calling process is reported with no parent and no
  * children.
  *
  * @param[in]  scope Scope of processes involved
  * @param[out] root  Process id of the root
  * @param[out] up    Process id of my parent
  * @param[out] left  Process id of my left child
  * @param[out] right Process id of my right child
  */
void armci_msg_bintree(int scope, int *root, int *up, int *left, int *right) {
  int me, nproc;

  if (scope == SCOPE_NODE) {
    *root  = 0;
    *up    = -1;  /* previously left unwritten, so callers read garbage */
    *left  = -1;
    *right = -1;

    return;
  }

  me    = armci_msg_me();
  nproc = armci_msg_nproc();

  *root = 0;
  *up   =  (me == 0) ? -1 : (me - 1) / 2;

  /* Children that would fall beyond the last rank do not exist. */
  *left = 2*me + 1;
  if (*left >= nproc) *left = -1;

  *right = 2*me + 2;
  if (*right >= nproc) *right = -1;
}
Example #9
0
/*
 * Reverse a block-distributed 1-D integer array with one-sided puts.
 *
 * Array A (global length nprocs*TOTALELEMS + nprocs/2) is filled with the
 * values 1..N in global index order. Each process reverses its local chunk
 * into a scratch buffer and writes the pieces into the processes that own
 * the mirrored index range of array B using ARMCI_Put. After a fence and
 * barrier the result is checked by VERIFY() and all memory is released.
 *
 * Must be called by every process; it contains barriers and ARMCI_Malloc
 * calls made by all of them.
 */
void TRANSPOSE1D() {

    int dims[1];
    int nelem, i, ierr, min, max, cmin, cmax, lmin, lmax, pmin, pmax;
    int src_offset, dst_offset, length;
    int *buf, *map;
    void *src_ptr, *dst_ptr;
    void **a_ptr, **b_ptr;
    int *a, *b;
    int me, nprocs;

    /* Find local processor ID and number of processors */
    me     = armci_msg_me();
    nprocs = armci_msg_nproc();

    /* Allocate the per-process address tables and the ownership map.
       The tables hold void* entries, so size them as such. */
    a_ptr = (void**)malloc(nprocs*sizeof(void*));
    b_ptr = (void**)malloc(nprocs*sizeof(void*));
    map = (int*)malloc(nprocs*sizeof(int));
    if (a_ptr == NULL || b_ptr == NULL || map == NULL) {
      ARMCI_Error("TRANSPOSE1D: malloc failed", nprocs);
    }

    /* Configure array dimensions. Force an unequal data distribution */
    dims[0]  = nprocs*TOTALELEMS + nprocs/2;
    if (me == 0) printf("Size of array: %d\n\n",dims[0]);

    /* Find first (zero-based) index of chunk owned by each processor and
       store it in map array */
    for (i=0; i<nprocs; i++) {
      map[i] = (int)(((double)i)*(((double)dims[0])/((double)nprocs)));
    }

    /* Figure out what size my portion of array is */
    if (me<nprocs-1) {
      nelem = map[me+1]-map[me];
    } else {
      nelem = dims[0]-map[me];
    }

    /* Allocate memory for array A */
    ierr = ARMCI_Malloc(a_ptr, nelem*sizeof(int));
    assert(ierr == 0);
    assert(a_ptr[me]);

    /* Allocate memory for array B */
    ierr = ARMCI_Malloc(b_ptr, nelem*sizeof(int));
    assert(ierr == 0);
    assert(b_ptr[me]);

    /* initialize data in array A and zero data in array B */
    a = (int*)a_ptr[me];
    b = (int*)b_ptr[me];
    for (i=0; i<nelem; i++) {
      a[i] = i + map[me] + 1;
      b[i] = 0;
    }

    /* Synchronize all processors to guarantee that everyone has data
       before proceeding to the next step. */
    armci_msg_barrier();

    /* Create local buffer for performing inversion */
    buf = (int*)malloc(nelem*sizeof(int));
    if (buf == NULL) {
      ARMCI_Error("TRANSPOSE1D: malloc failed", nelem);
    }

    /* Copy inverted data into local buffer */
    for (i=0; i<nelem; i++) {
      buf[i] = a[nelem-i-1];
    }

    /* Find out which blocks of array B inverted block should be copied to.
       Start by finding min and max indices of data in array B*/
    min = dims[0] - (map[me] + nelem);
    max = dims[0] - map[me] - 1;

    /* Locate processors containing the endpoints */
    pmin = 0;
    for (i=0; i<nprocs; i++) {
      if (min >= map[i]) {
        pmin = i;
      } else {
        break;
      }
    }
    pmax = nprocs-1;
    for (i=nprocs-2; i>=0; i--) {
      if (max < map[i+1]) {
        pmax = i;
      } else {
        break;
      }
    }

    /* Loop over processors that will receive data and copy inverted data to
       processors */
    for (i=pmin; i<=pmax; i++) {
      /* Find min and max indices owned by processor i */
      lmin = map[i];
      if (i<nprocs-1) {
        lmax = map[i+1]-1;
      } else {
        lmax = dims[0]-1;
      }

      /* Clip my target range [min,max] to processor i's range [lmin,lmax] */
      if (lmin > min) {
        cmin = lmin;
      } else {
        cmin = min;
      }
      if (lmax < max) {
        cmax = lmax;
      } else {
        cmax = max;
      }

      /* Find offsets on source and destination processors */
      src_offset = cmin - min;
      src_ptr = (void*)(buf + src_offset);
      dst_offset = cmin - lmin;
      dst_ptr = ((char*)b_ptr[i]) + sizeof(int)*dst_offset;

      /* Find length of data (in bytes) to be sent to processor i */
      length = sizeof(int)*(cmax-cmin+1);

      /* Send data to processor; checked like the ARMCI_Malloc calls above */
      ierr = ARMCI_Put(src_ptr, dst_ptr, length, i);
      assert(ierr == 0);
    }

    /* Fence and barrier so every put has landed before buf is released
       and B is verified. */
    ARMCI_AllFence();
    armci_msg_barrier();

    free(buf);

    VERIFY(b_ptr, dims, map);

    free(map);
    armci_msg_barrier();
    ARMCI_Free(a_ptr[me]);
    ARMCI_Free(b_ptr[me]);
    free(a_ptr);
    free(b_ptr);
}
Example #10
0
/*
 * Driver for the multi-threaded ARMCI test.
 *
 * Command-line options:
 *   -t N  threads per process (1..MAX_TPP)
 *   -i N  iterations (>= 1)
 *   -s N  number of array elements (>= 1)
 *   -d N  seconds to sleep before starting (after printing the pid)
 *   -h    print usage
 *
 * After parsing options it assigns a global rank to every thread, builds the
 * random communication patterns from a seed broadcast from process 0 (random
 * pairs, random targets, one random thread), runs thread_main() in tpp
 * threads (or once, directly, when NOTHREADS is defined) and shuts down.
 */
int main(int argc, char *argv[])
{
  int ch;
  extern char *optarg;
  int i, j, r;
  thread_t threads[MAX_TPP];

  /* init ARMCI */
  ARMCI_Init_args(&argc, &argv);
  size = armci_msg_nproc();
  rank = armci_msg_me();

  while ((ch = getopt(argc, argv, "t:s:i:d:h")) != -1) {
    switch (ch) {
      case 't': /* # of threads */
        tpp = atoi(optarg);
        if (tpp < 1 || tpp > MAX_TPP) {
          PRINTF0("\"%s\" is improper value for -t, should be a "
                  "number between 1 and %d(MAX_TPP)\n",
                  optarg, MAX_TPP);
          usage();
        }
        break;
      case 'i': /* # of iterations */
        iters = atoi(optarg);
        if (iters < 1) {
          /* message used to name -t by mistake */
          PRINTF0("\"%s\" is improper value for -i, should be a "
                  "number equal or larger than 1\n", optarg);
          usage();
        }
        break;
      case 's': /* # of elements in the array */
        asize = atoi(optarg);
        /* validate the value just parsed (was checking iters) */
        if (asize < 1) {
          PRINTF0("\"%s\" is improper value for -s, should be a "
                  "number equal or larger than 1\n", optarg);
          usage();
        }
        break;
      case 'd':
        delay = atoi(optarg);
        break; /* delay before start */
      case 'h':
        usage();
        break; /* print usage info */
      default:
        break;
    }
  }
#ifdef NOTHREADS
  tpp = 1;
  PRINTF0("Warning: NOTHREADS debug symbol is set -- running w/o threads\n");
#endif
  th_size = size * tpp;
  /* sizeof yields size_t; cast so the argument matches the %d conversion */
  PRINTF0("\nTest of multi-threaded capabilities:\n"
          "%d threads per process (%d threads total),\n"
          "%d array elements of size %d,\n"
          "%d iteration(s)\n\n", tpp, th_size, asize, (int)sizeof(atype_t), iters);
  if (delay) {
    printf("%d: %d\n", rank, getpid());
    fflush(stdout);
    sleep(delay);
    ARMCI_Barrier();
  }
  TH_INIT(size, tpp);
  /* global thread ranks: process-major, thread-minor */
  for (i = 0; i < tpp; i++) {
    th_rank[i] = rank * tpp + i;
  }

#if defined(DEBUG) && defined(LOG2FILE)
  /* one log file per thread; characters 10..12 of fname get the 3-digit rank */
  for (i = 0; i < tpp; i++) {
    fname[10] = '0' + th_rank[i] / 100;
    fname[11] = '0' + th_rank[i] % 100 / 10;
    fname[12] = '0' + th_rank[i] % 10;
    dbg[i] = fopen(fname, "w");
  }
#endif
  for (i = 0; i < tpp; i++) {
    prndbg(i, "proc %d, thread %d(%d):\n", rank, i, th_rank[i]);
  }

  /* set global seed (to ensure same random sequence across procs) */
  time_seed = (unsigned)time(NULL);
  armci_msg_brdcst(&time_seed, sizeof(time_seed), 0);
  srand(time_seed);
  rand();
  prndbg(0, "seed = %u\n", time_seed);

  /* random pairs */
  pairs = calloc(th_size, sizeof(int));
  if (pairs == NULL) {
    ARMCI_Error("calloc failed for pairs", th_size);
  }
  for (i = 0; i < th_size; i++) {
    pairs[i] = -1;
  }
  /* NOTE(review): if RND(0, th_size) yields values in [0, th_size), the
     partner search below looks like it cannot terminate when th_size is odd
     (the last unpaired thread has no free partner) -- confirm. */
  for (i = 0; i < th_size; i++) {
    if (pairs[i] != -1) {
      continue;
    }
    r = RND(0, th_size);
    while (i == r || pairs[r] != -1) {
      r = RND(0, th_size);
    }
    pairs[i] = r;
    pairs[r] = i;
  }
  for (i = 0, cbufl = 0; i < th_size; i++)
    cbufl += sprintf(cbuf + cbufl, " %d->%d|%d->%d",
                     i, pairs[i], pairs[i], pairs[pairs[i]]);
  prndbg(0, "random pairs:%s\n", cbuf);

  /* random targets: redraw until a thread's target differs from itself */
  rnd_tgts = calloc(th_size, sizeof(int));
  if (rnd_tgts == NULL) {
    ARMCI_Error("calloc failed for rnd_tgts", th_size);
  }
  for (i = 0, cbufl = 0; i < th_size; i++) {
    rnd_tgts[i] = RND(0, th_size);
    if (rnd_tgts[i] == i) {
      i--;
      continue;
    }
    cbufl += sprintf(cbuf + cbufl, " %d", rnd_tgts[i]);
  }
  prndbg(0, "random targets:%s\n", cbuf);

  /* random one */
  rnd_one = RND(0, th_size);
  prndbg(0, "random one = %d\n", rnd_one);

  /* Allocate outside assert(): with NDEBUG defined an assert() argument is
     not evaluated, which would silently skip the allocation. */
  ptrs1 = calloc(th_size, sizeof(void *));
  ptrs2 = calloc(th_size, sizeof(void *));
  if (ptrs1 == NULL || ptrs2 == NULL) {
    ARMCI_Error("calloc failed for ptrs1/ptrs2", th_size);
  }
#ifdef NOTHREADS
  thread_main((void *)(long)0);
#else
  for (i = 0; i < tpp; i++) {
    THREAD_CREATE(threads + i, thread_main, (void *)(long)i);
  }
  for (i = 0; i < tpp; i++) {
    THREAD_JOIN(threads[i], NULL);
  }
#endif

  ARMCI_Barrier();
  PRINTF0("Tests Completed\n");

  /* clean up */
#if defined(DEBUG) && defined(LOG2FILE)
  for (i = 0; i < tpp; i++) {
    fclose(dbg[i]);
  }
#endif
  ARMCI_Finalize();
  TH_FINALIZE();
  armci_msg_finalize();

  return 0;
}
Example #11
0
File: lu.c Project: jeffhammond/ga
/*
 * Blocked dense LU factorization benchmark driver.
 *
 * Options: -n matrix size, -b block size, -p processor count (overrides the
 * count reported by the message layer), -h usage.
 *
 * The n-by-n matrix is split into block_size-by-block_size blocks (edge
 * blocks may be smaller). Each process allocates room for the blocks that
 * block_owner() assigns to it, a[] is filled with the address of every
 * block, and lu() is run once as a cold start and then nloop timed times.
 */
int main(int argc, char *argv[])
{
    int i, j;
    int ch;
    int edge;
    int size;
    int nloop=5;
    double **ptr_loc;
    
    armci_msg_init(&argc,&argv);
    nproc = armci_msg_nproc();
    me = armci_msg_me();
    
    while ((ch = getopt(argc, argv, "n:b:p:h")) != -1) {
        switch(ch) {
            case 'n': n = atoi(optarg); break;
            case 'b': block_size = atoi(optarg); break;
            case 'p': nproc = atoi(optarg); break;
            case 'h': {
                printf("Usage: LU, or \n");
        printf("       LU -nMATRIXSIZE -bBLOCKSIZE -pNPROC\n");
                armci_msg_barrier();
                armci_msg_finalize();
                exit(0);
            }            
            default: break;
        }
    }

    /* block_size and nproc are used as divisors below; reject values that
       would divide by zero or produce negative allocation sizes. */
    if (block_size <= 0 || nproc <= 0) {
        if (me == 0)
            fprintf(stderr, "Block size and processor count must be positive\n");
        armci_msg_barrier();
        armci_msg_finalize();
        exit(1);
    }
    
    if(me == 0) {
        printf("\n Blocked Dense LU Factorization\n");
        printf("     %d by %d Matrix\n", n, n);
        printf("     %d Processors\n", nproc);
        printf("     %d by %d Element Blocks\n", block_size, block_size);
        printf("\n");
    }
    
    /* Choose the processor grid: the largest num_rows <= sqrt(nproc) that
       divides nproc evenly. */
    num_rows = (int) sqrt((double) nproc);
    for (;;) {
        num_cols = nproc/num_rows;
        if (num_rows*num_cols == nproc)
            break;
        num_rows--;
    }
    
    /* number of blocks per dimension, rounded up */
    nblocks = n/block_size;
    if (block_size * nblocks != n) {
        nblocks++;
    }
    
    /* extent of the last (possibly partial) block in each dimension */
    edge = n%block_size;
    if (edge == 0) {
        edge = block_size;
    }
    
#ifdef DEBUG
    /* DEBUG builds only print the block-to-owner map and exit. */
    if(me == 0)
        for (i=0;i<nblocks;i++) {
            for (j=0;j<nblocks;j++) 
                printf("%d ", block_owner(i, j));
            printf("\n");
        }
    armci_msg_barrier();
    armci_msg_finalize();
    exit(0);
#endif
    
    /* Add up the bytes needed for the blocks this process owns. */
    for (i=0;i<nblocks;i++) {
        for (j=0;j<nblocks;j++) {
            if(block_owner(i,j) == me) {
                if ((i == nblocks-1) && (j == nblocks-1)) {
                    size = edge*edge;
                }
                else if ((i == nblocks-1) || (j == nblocks-1)) {
                    size = edge*block_size;
                }
                else {
                    size = block_size*block_size;
                }
                proc_bytes += size*sizeof(double);
            }
        }
    }
    
    ptr = (void **)malloc(nproc * sizeof(void *));
    if (ptr == NULL) {
        fprintf(stderr, "Could not malloc memory for ptr\n");
        exit(-1);
    }
#ifdef MPI2_ONESIDED
    MPI_Alloc_mem(proc_bytes, MPI_INFO_NULL, &ptr[me]);
    MPI_Win_create((void*)ptr[me], proc_bytes, 1, MPI_INFO_NULL,
                   MPI_COMM_WORLD, &win);
    for(i=0; i<nproc; i++) ptr[i] = (double *)ptr[me];
    MPI_Barrier(MPI_COMM_WORLD);
    
#else
    /* initialize ARMCI */
    ARMCI_Init();
    if (ARMCI_Malloc(ptr, proc_bytes) != 0) {
        ARMCI_Error("ARMCI_Malloc failed", me);
    }
#endif
    
    a = (double **)malloc(nblocks*nblocks*sizeof(double *));
    if (a == NULL) {
        fprintf(stderr, "Could not malloc memory for a\n");
        exit(-1);
    } 
    ptr_loc = (double **)malloc(nproc*sizeof(double *));
    if (ptr_loc == NULL) {
        fprintf(stderr, "Could not malloc memory for ptr_loc\n");
        exit(-1);
    }

    /* Hand out block addresses: ptr_loc[p] is the next free slot in process
       p's segment and advances by the size of each block assigned to p. */
    for(i=0; i<nproc; i++) ptr_loc[i] = (double *)ptr[i];
    for(i=0; i<nblocks;i ++) {
        for(j=0; j<nblocks; j++) {
            a[i+j*nblocks] = ptr_loc[block_owner(i, j)];
            if ((i == nblocks-1) && (j == nblocks-1)) {
                size = edge*edge;
            } else if ((i == nblocks-1) || (j == nblocks-1)) {
                size = edge*block_size;
            } else {
                size = block_size*block_size;
            }
            ptr_loc[block_owner(i, j)] += size;
        }
    }
    /* the cursor table is scratch; a[] now holds every block address */
    free(ptr_loc);
    
    /* initialize the array */
    init_array();
    
    /* barrier to ensure all initialization is done */
    armci_msg_barrier();

    /* to remove cold-start misses, all processors touch their own data */
    touch_array(block_size, me);
    armci_msg_barrier();

    if(doprint) {
        if(me == 0) {
            printf("Matrix before LU decomposition\n");
            print_array(me); 
        }
        armci_msg_barrier();
    }
    
    lu(n, block_size, me); /* cold start */

    /* Starting the timer */

    armci_msg_barrier();
    if(me == 0) start_timer();
    for(i=0; i<nloop; i++) lu(n, block_size, me);    
    armci_msg_barrier();

    /* Timer Stops here */
    if(me == 0) 
        printf("\nRunning time = %f milliseconds.\n\n",  elapsed_time()/nloop);
    printf("%d: (ngets=%d) Communication (get) time = %e milliseconds\n", me, get_cntr, comm_time*1000/nloop);
    
    if(doprint) {        
        if(me == 0) {
            printf("after LU\n");
            print_array(me);
        }
        armci_msg_barrier();
    }
    
    /* done */
#ifdef MPI2_ONESIDED
    MPI_Win_free(&win);
    MPI_Free_mem(ptr[me]);
#else
    ARMCI_Free(ptr[me]);
    ARMCI_Finalize();
#endif
    armci_msg_finalize();

    return 0;
}
Example #12
0
/*
 * Checkpoint/restart test driver.
 *
 * Phase 1 times do_work() for sizes 1, 2, 4, ... SIZE_ without
 * checkpointing and stores the timings in time_array. Phase 2 registers the
 * ARMCI-allocated buffer with the checkpoint layer and, for sizes
 * 128, 256, ... SIZE_, takes a checkpoint, does work, and at FAILURE_SIZE_
 * simulates a failure via ARMCI_Restart_simulate() unless this pass is
 * already the result of a restart. Timings go to time_array1.
 */
int main(int argc, char *argv[])
{
  int rc, i, j = 0, rid, ret;
  armci_ckpt_ds_t ckptds;
  ARMCI_Group grp;

  ARMCI_Init_args(&argc, &argv);
  nproc = armci_msg_nproc();
  me = armci_msg_me();

  if (me == 0) {
    if (nproc > MAXPROCS) {
      ARMCI_Error("nproc > MAXPROCS", nproc);
    }
    else {
      printf("ARMCI test program (%d processes)\n", nproc);
      fflush(stdout);
      sleep(1);
    }

  }
  armci_init_checkpoint2();
  ARMCI_Group_get_world(&grp);
  size = SIZE_;
  rc = ARMCI_Malloc((void **)ptr_arr, size * 8);
  /* the result used to be overwritten by the loop counter below unchecked */
  if (rc != 0) {
    ARMCI_Error("ARMCI_Malloc failed", rc);
  }
  printf("ARMCI test program (%d processes)\n", nproc);
  fflush(stdout);

  /* Phase 1: baseline timings without checkpointing. */
  for (size = 1; size <= SIZE_; size *= 2) {
    t1 = MPI_Wtime();
    for (i = 0; i < 5; i++) {
      for (rc = 0; rc < 15; rc++) {
        do_work(size);
      }
    }
    time_array[j++] = MPI_Wtime() - t1;
    ARMCI_Barrier();
    /* size is cast so the argument always matches %ld */
    printf("%d:done for size %ld\n", me, (long)size);
    fflush(stdout);
  }

  /* Register the single buffer of this process with the checkpoint layer. */
  (void)ARMCI_Ckpt_create_ds(&ckptds, 1);
  ckptds.ptr_arr[0] = ptr_arr[me];
  ckptds.sz[0] = SIZE_ * 8;
  rid = ARMCI_Ckpt_init(NULL, &grp, 1, 0, &ckptds);
  printf("%d: After ARMCI_Ckpt_init(): \n", me);

  /* Phase 2: same kind of work with a checkpoint at the top of each pass. */
  j = 0;
  for (size = 128; size <= SIZE_; size *= 2) {

    int simulate_restart = 1;
    t1 = MPI_Wtime();

    ret = ARMCI_Ckpt(rid);
    if (ret == ARMCI_CKPT) {
      printf("%d: Performed CHECKPOINT @ size=%ld\n", me, (long)size);
    }
    else if (ret == ARMCI_RESTART) {
      /* already restarted once: do not inject the failure again */
      simulate_restart = 0;
      printf("%d: Performed RESTART @ size=%ld\n", me, (long)size);
    }

    for (i = 0; i < 5; i++) {
      /* NOTE(review): this inner loop has an empty body, so do_work() runs
         only 5 times per pass here versus 5*15 times in phase 1. It looks
         as if do_work() was meant to sit inside the inner loop -- confirm
         before comparing the two timing arrays. Behavior kept as is. */
      for (rc = 0; rc < 15; rc++) {
        if (i == 3 && rc == 10) {
        }
      }
      do_work(size);
    }

    time_array1[j++] = MPI_Wtime() - t1;
    sleep(1);

    if (simulate_restart && size == FAILURE_SIZE_) {
      /* was %d, inconsistent with every other print of size */
      printf("%d: Simulating FAILURE @ size = %ld\n", me, (long)size);
      ARMCI_Restart_simulate(rid, 1);
    }

    printf("%d: DONE for size=%ld regular=%f withckpt=%f\n\n",
           me, (long)size, time_array[j-1], time_array1[j-1]);
    fflush(stdout);

  }

  ARMCI_Ckpt_finalize(rid);

  printf("Before Finalize()\n");
  ARMCI_Barrier();
  ARMCI_Finalize();
  armci_msg_finalize();
  return(0);
}
/*
 * Blocked dense LU factorization driver, node-blocked variant.
 *
 * Options: -n matrix size, -b block size, -p processor count (overrides the
 * count reported by the message layer), -h usage.
 *
 * Processors are grouped four to a "node" (2x2) when nproc is a multiple of
 * four; otherwise one extra node is added and the grid is a single row with
 * the remainder as its column count. Each process allocates room for the
 * blocks block_owner() assigns to it, a[] is filled with the address of
 * every block, and a single lu() run is timed.
 */
int main(int argc, char *argv[])
{
    int i, j;
    int ch;
    int edge;
    int size;

    /* ARMCI */
    void **ptr;
    double **ptr_loc;

    armci_msg_init(&argc,&argv);
    nproc = armci_msg_nproc();
    me = armci_msg_me();

    while ((ch = getopt(argc, argv, "n:b:p:h")) != -1) {
        switch(ch) {
        case 'n':
            n = atoi(optarg);
            break;
        case 'b':
            block_size = atoi(optarg);
            break;
        case 'p':
            nproc = atoi(optarg);
            break;
        case 'h': {
            printf("Usage: LU, or \n");
            printf("       LU -nMATRIXSIZE -bBLOCKSIZE -pNPROC\n");
            armci_msg_barrier();
            armci_msg_finalize();
            exit(0);
        }
        default:
            break;
        }
    }

    /* block_size is used as a divisor and nproc sizes allocations below;
       reject values that cannot work. */
    if (block_size <= 0 || nproc <= 0) {
        if (me == 0)
            fprintf(stderr, "Block size and processor count must be positive\n");
        armci_msg_barrier();
        armci_msg_finalize();
        exit(1);
    }

    if(me == 0) {
        printf("\n Blocked Dense LU Factorization\n");
        printf("     %d by %d Matrix\n", n, n);
        printf("     %d Processors\n", nproc);
        printf("     %d by %d Element Blocks\n", block_size, block_size);
        printf("\n");
    }

    /* number of blocks per dimension, rounded up */
    nblocks = n/block_size;
    if (block_size * nblocks != n) {
        nblocks++;
    }

    /* Group processors four to a node; a remainder gets its own node. */
    nnodes = nproc / 4;
    if((nnodes * 4) != nproc) {
        num_cols = nproc - nnodes * 4;
        nnodes++;
        num_rows = 1;
    }
    else {
        num_cols = 2;
        num_rows = 2;
    }

    /* blocks per node, rounded up */
    num = (nblocks * nblocks)/nnodes;
    if((num * nnodes) != (nblocks * nblocks))
        num++;

#ifdef DEBUG
    /* DEBUG builds only print the block-to-owner map and exit. */
    if(me == 0)
        for (i=0; i<nblocks; i++) {
            for (j=0; j<nblocks; j++)
                printf("%d ", block_owner(i, j));
            printf("\n");
        }
    armci_msg_barrier();
    armci_msg_finalize();
    exit(0);
#endif

    /* extent of the last (possibly partial) block in each dimension */
    edge = n%block_size;
    if (edge == 0) {
        edge = block_size;
    }

    /* Add up the bytes needed for the blocks this process owns. */
    for (i=0; i<nblocks; i++) {
        for (j=0; j<nblocks; j++) {
            if(block_owner(i,j) == me) {
                if ((i == nblocks-1) && (j == nblocks-1)) {
                    size = edge*edge;
                }
                else if ((i == nblocks-1) || (j == nblocks-1)) {
                    size = edge*block_size;
                }
                else {
                    size = block_size*block_size;
                }
                proc_bytes += size*sizeof(double);
            }
        }
    }

    /* initialize ARMCI */
    ARMCI_Init_args(&argc, &argv);
    ptr = (void **)malloc(nproc * sizeof(void *));
    if (ptr == NULL) {
        fprintf(stderr, "Could not malloc memory for ptr\n");
        exit(-1);
    }
    if (ARMCI_Malloc(ptr, proc_bytes) != 0) {
        ARMCI_Error("ARMCI_Malloc failed", me);
    }

    a = (double **)malloc(nblocks*nblocks*sizeof(double *));
    if (a == NULL) {
        fprintf(stderr, "Could not malloc memory for a\n");
        exit(-1);
    }
    ptr_loc = (double **)malloc(nproc*sizeof(double *));
    if (ptr_loc == NULL) {
        fprintf(stderr, "Could not malloc memory for ptr_loc\n");
        exit(-1);
    }

    /* Hand out block addresses: ptr_loc[p] is the next free slot in process
       p's segment and advances by the size of each block assigned to p. */
    for(i=0; i<nproc; i++) ptr_loc[i] = (double *)ptr[i];
    for(i=0; i<nblocks; i ++) {
        for(j=0; j<nblocks; j++) {
            a[i+j*nblocks] = ptr_loc[block_owner(i, j)];
            if ((i == nblocks-1) && (j == nblocks-1)) {
                size = edge*edge;
            } else if ((i == nblocks-1) || (j == nblocks-1)) {
                size = edge*block_size;
            } else {
                size = block_size*block_size;
            }
            ptr_loc[block_owner(i, j)] += size;
        }
    }
    /* the cursor table is scratch; a[] now holds every block address */
    free(ptr_loc);

    /* initialize the array */
    init_array();

    /* barrier to ensure all initialization is done */
    armci_msg_barrier();

    /* to remove cold-start misses, all processors touch their own data */
    touch_array(block_size, me);
    armci_msg_barrier();

    if(doprint) {
        if(me == 0) {
            printf("Matrix before LU decomposition\n");
            print_array(me);
        }
        armci_msg_barrier();
    }


    /* Starting the timer */
    if(me == 0) start_timer();

    lu(n, block_size, me);

    armci_msg_barrier();

    /* Timer Stops here */
    if(me == 0)
        printf("\nRunning time = %f milliseconds.\n\n",  elapsed_time());

    if(doprint) {
        if(me == 0) {
            printf("after LU\n");
            print_array(me);
        }
        armci_msg_barrier();
    }

    /* done */
    ARMCI_Free(ptr[me]);
    free(ptr);
    ARMCI_Finalize();
    armci_msg_finalize();

    return 0;
}
Example #14
0
/*
 * Blocked dense LU factorization driver, multi-threaded variant.
 *
 * Options: -n matrix size, -b block size, -p processor count (overrides the
 * count reported by the message layer), -t threads per process (capped at
 * MAX_THREADS), -d seconds to sleep after printing the pid, -h usage.
 *
 * Blocks are distributed over nproc*th_per_p threads. Each process
 * allocates one ARMCI segment holding the blocks of all of its threads,
 * a[] is filled with the address of every block, and lu() is run in
 * th_per_p threads with {n, block_size, local thread index} as argument.
 */
int main(int argc, char *argv[])
{
    int i, j, l;
    int ch;
    extern char *optarg;
    int edge;
    int size;
    int lu_arg[MAX_THREADS][3];
    
    /* ARMCI */
    void **ptr;
    double **ptr_loc;

    THREAD_LOCK_INIT(mutex);
    
    armci_msg_init(&argc,&argv);
    nproc = armci_msg_nproc();
    me = armci_msg_me();
    
    while ((ch = getopt(argc, argv, "n:b:p:t:d:h")) != -1) {
        switch(ch) {
            case 'n': n = atoi(optarg); break;
            case 'b': block_size = atoi(optarg); break;
            case 'p': nproc = atoi(optarg); break;
            case 't': th_per_p = atoi(optarg); break;
            case 'd': d = atoi(optarg); break;
            case 'h': {
                printf("Usage: LU, or \n");
        printf("       LU -nMATRIXSIZE -bBLOCKSIZE -pNPROC -tTH_PER_P\n");
                armci_msg_barrier();
                armci_msg_finalize();
                exit(0);
            } 
            default: break;
        }
    }

    /* block_size and nproc*th_per_p are used as divisors below; reject
       values that would divide by zero or size allocations negatively. */
    if (block_size <= 0 || nproc <= 0 || th_per_p <= 0) {
        if (me == 0)
            fprintf(stderr, "Block size, processor count and threads per processor must be positive\n");
        armci_msg_barrier();
        armci_msg_finalize();
        exit(1);
    }

    if(th_per_p>MAX_THREADS) {
        th_per_p=MAX_THREADS;
        if(me==0)printf("Warning: cannot run more than %d threads, adjust MAX_THREADS",MAX_THREADS);
    }

    if (d) {
        fprintf(stderr, "%d: %d\n", me, getpid());
        sleep(d);
    }

    nthreads = th_per_p * nproc;
    if(me == 0) {
        printf("\n Blocked Dense LU Factorization\n");
        printf("     %d by %d Matrix\n", n, n);
        printf("     %d Processors\n", nproc);
        printf("     %d thread(s) per processor, %d threads total\n", th_per_p, nthreads);
        printf("     %d by %d Element Blocks\n", block_size, block_size);
        printf("\n");
    }
    
    /* Choose the thread grid: the largest num_rows <= sqrt(nthreads) that
       divides nthreads evenly. */
    num_rows = (int) sqrt((double) nthreads);
    for (;;) {
        num_cols = nthreads/num_rows;
        if (num_rows*num_cols == nthreads)
            break;
        num_rows--;
    }
    
    /* number of blocks per dimension, rounded up */
    nblocks = n/block_size;
    if (block_size * nblocks != n) {
        nblocks++;
    }
    
    /* blocks per thread, rounded up */
    num = (nblocks * nblocks)/nthreads;
    if((num * nthreads) != (nblocks * nblocks))
        num++;

    /* extent of the last (possibly partial) block in each dimension */
    edge = n%block_size;
    if (edge == 0) {
        edge = block_size;
    }
#ifdef DEBUG
    if(me == 0)
        for (i=0;i<nblocks;i++) {
            for (j=0;j<nblocks;j++) 
                printf("%d ", block_owner(i, j));
            printf("\n");
        }
    armci_msg_barrier();
/*    armci_msg_finalize(); */
/*    exit(0); */
#endif
    
    /* For each local thread: record its global id and add up the doubles
       needed for the blocks it owns. */
    for (l = 0; l < th_per_p; l++) {
        me_th[l] = me * th_per_p + l;
        for (i=0;i<nblocks;i++) {
            for (j=0;j<nblocks;j++) {
                if(block_owner(i,j) == me_th[l]) {
                    if ((i == nblocks-1) && (j == nblocks-1)) {
                        size = edge*edge;
                    }
                    else if ((i == nblocks-1) || (j == nblocks-1)) {
                        size = edge*block_size;
                    }
                    else {
                        size = block_size*block_size;
                    }
                    thread_doubles[l] += size;
                }
            }
        }
        proc_bytes += thread_doubles[l] * sizeof(double);
    }

    /* initialize ARMCI */
    ARMCI_Init();
    ptr = (void **)malloc(nproc * sizeof(void *));
    if (ptr == NULL) {
        fprintf(stderr, "Could not malloc memory for ptr\n");
        exit(-1);
    }
    if (ARMCI_Malloc(ptr, proc_bytes) != 0) {
        ARMCI_Error("ARMCI_Malloc failed", me);
    }

    a = (double **)malloc(nblocks*nblocks*sizeof(double *));
    if (a == NULL) {
        fprintf(stderr, "Could not malloc memory for a\n");
        exit(-1);
    }
    ptr_loc = (double **)malloc(nthreads*sizeof(double *));
    if (ptr_loc == NULL) {
        fprintf(stderr, "Could not malloc memory for ptr_loc\n");
        exit(-1);
    }

    /* Per-thread cursors into each process's segment.
       NOTE(review): the offsets for every process i are derived from this
       process's own thread_doubles[]; that looks correct only if all
       processes have the same per-thread sizes -- confirm. */
    for (i = 0; i < nproc; i++) {
        ptr_loc[i * th_per_p] = (double *)ptr[i];
        for (j = 1; j < th_per_p; j++)
            ptr_loc[i * th_per_p + j] = ptr_loc[i * th_per_p + j - 1] + thread_doubles[j - 1];
    }
    for(i=0; i<nblocks;i ++) {
        for(j=0; j<nblocks; j++) {
            a[i+j*nblocks] = ptr_loc[block_owner(i, j)];
            if ((i == nblocks-1) && (j == nblocks-1)) {
                size = edge*edge;
            } else if ((i == nblocks-1) || (j == nblocks-1)) {
                size = edge*block_size;
            } else {
                size = block_size*block_size;
            }
            ptr_loc[block_owner(i, j)] += size;
        }
    }
    /* the cursor table is scratch; a[] now holds every block address */
    free(ptr_loc);
#if 0
    for(i=0; i<nblocks*nblocks;i ++) printf("%d: a[%d]=%p\n", me, i, a[i]);
    fflush(stdout);
#endif
    
    /* initialize the array */
    init_array();
    
    /* barrier to ensure all initialization is done */
    armci_msg_barrier();

    /* to remove cold-start misses, all processors touch their own data */
/*    for (l = 0; l < th_per_p; l++) touch_array(block_size, me_th[l]); */
    armci_msg_barrier();

    if(doprint) {
        if(me == 0) {
            printf("Matrix before LU decomposition\n");
            print_array(me); 
        }
        armci_msg_barrier();
    }

#if 1
    for (i = 0; i < nblocks; i++)
        for (j = 0; j < nblocks; j++)
            print_block_dbg(a[i + j * nblocks], "proc %d, a[%d, %d]:\n", me, i, j);
#endif

    TH_INIT(nproc,th_per_p);

    /* Starting the timer */
    if(me == 0) start_timer();

    /* lu_arg rows must stay alive until the threads are joined below. */
    for (l = 0; l < th_per_p; l++) {
        lu_arg[l][0] = n;
        lu_arg[l][1] = block_size;
        lu_arg[l][2] = l;
        THREAD_CREATE(threads + l, lu, lu_arg[l]);
    }
    
    for (l = 0; l < th_per_p; l++) THREAD_JOIN(threads[l], NULL);
    armci_msg_barrier();

    /* Timer Stops here */
    if(me == 0) 
        printf("\nRunning time = %lf milliseconds.\n\n",  elapsed_time());

    if(doprint) {        
        if(me == 0) {
            printf("after LU\n");
            print_array(me);
        }
        armci_msg_barrier();
    }
    
    /* done */
    ARMCI_Free(ptr[me]);
    free(ptr);
    ARMCI_Finalize();
    armci_msg_finalize();

    THREAD_LOCK_DESTROY(mutex);

    return 0;
}