Example #1
0
int main(int argc, char *argv[])
{
    int i, j;
    int ch;
    extern char *optarg;
    int edge;
    int size;
    
    /* ARMCI */
    void **ptr;
    double **ptr_loc;
    
    MP_INIT(argc,argv);
    MP_PROCS(&nproc);
    MP_MYID(&me);
    
    while ((ch = getopt(argc, argv, "n:b:p:h")) != -1) {
        switch(ch) {
            case 'n': n = atoi(optarg); break;
            case 'b': block_size = atoi(optarg); break;
            case 'p': nproc = atoi(optarg); break;
            case 'h': {
                printf("Usage: LU, or \n");
        printf("       LU -nMATRIXSIZE -bBLOCKSIZE -pNPROC\n");
                MP_BARRIER();
                MP_FINALIZE();
                exit(0);
            }            
        }
    }
    
    if(me == 0) {
        printf("\n Blocked Dense LU Factorization\n");
        printf("     %d by %d Matrix\n", n, n);
        printf("     %d Processors\n", nproc);
        printf("     %d by %d Element Blocks\n", block_size, block_size);
        printf("\n");
    }
    
/*      num_rows = (int) sqrt((double) nproc); */
/*      for (;;) { */
/*          num_cols = nproc/num_rows; */
/*          if (num_rows*num_cols == nproc) */
/*              break; */
/*          num_rows--; */
/*      } */
    
    nblocks = n/block_size;
    if (block_size * nblocks != n) {
        nblocks++;
    }

    nnodes = nproc / 4;
    if((nnodes * 4) != nproc) {
        num_cols = nproc - nnodes * 4;
        nnodes++;
        num_rows = 1;
    }
    else {
        num_cols = 2;
        num_rows = 2;
    }    
    
    num = (nblocks * nblocks)/nnodes;
    if((num * nnodes) != (nblocks * nblocks))
        num++;

#ifdef DEBUG
    if(me == 0)
        for (i=0;i<nblocks;i++) {
            for (j=0;j<nblocks;j++) 
                printf("%d ", block_owner(i, j));
            printf("\n");
        }
    MP_BARRIER();
    MP_FINALIZE();
    exit(0);
#endif
    
    edge = n%block_size;
    if (edge == 0) {
        edge = block_size;
    }
    
    for (i=0;i<nblocks;i++) {
        for (j=0;j<nblocks;j++) {
            if(block_owner(i,j) == me) {
                if ((i == nblocks-1) && (j == nblocks-1)) {
                    size = edge*edge;
                }
                else if ((i == nblocks-1) || (j == nblocks-1)) {
                    size = edge*block_size;
                }
                else {
                    size = block_size*block_size;
                }
                proc_bytes += size*sizeof(double);
            }
        }
    }
    
    /* initialize ARMCI */
    ARMCI_Init();
    ptr = (void **)malloc(nproc * sizeof(void *));
    ARMCI_Malloc(ptr, proc_bytes);
    
    a = (double **)malloc(nblocks*nblocks*sizeof(double *));
    if (a == NULL) {
        fprintf(stderr, "Could not malloc memory for a\n");
        exit(-1);
    } 
    ptr_loc = (double **)malloc(nproc*sizeof(double *));
    for(i=0; i<nproc; i++) ptr_loc[i] = (double *)ptr[i];
    for(i=0; i<nblocks;i ++) {
        for(j=0; j<nblocks; j++) {
            a[i+j*nblocks] = ptr_loc[block_owner(i, j)];
            if ((i == nblocks-1) && (j == nblocks-1)) {
                size = edge*edge;
            } else if ((i == nblocks-1) || (j == nblocks-1)) {
                size = edge*block_size;
            } else {
                size = block_size*block_size;
            }
            ptr_loc[block_owner(i, j)] += size;
        }
    }
    
    /* initialize the array */
    init_array();
    
    /* barrier to ensure all initialization is done */
    MP_BARRIER();

    /* to remove cold-start misses, all processors touch their own data */
    touch_array(block_size, me);
    MP_BARRIER();

    if(doprint) {
        if(me == 0) {
            printf("Matrix before LU decomposition\n");
            print_array(me); 
        }
        MP_BARRIER();
    }
    

    /* Starting the timer */
    if(me == 0) start_timer();

    lu(n, block_size, me);
    
    MP_BARRIER();

    /* Timer Stops here */
    if(me == 0) 
        printf("\nRunning time = %lf milliseconds.\n\n",  elapsed_time());

    if(doprint) {        
        if(me == 0) {
            printf("after LU\n");
            print_array(me);
        }
        MP_BARRIER();
    }
    
    /* done */
    ARMCI_Free(ptr[me]);
    ARMCI_Finalize();
    MP_FINALIZE();

    return 0;
}
Example #2
0
main(int argc, char *argv[])
{
    int i, j;
    int ch;
    extern char *optarg;
    int edge;
    int size;
    int nloop=5;
    double **ptr_loc;

    MP_INIT(arc,argv);
    MP_PROCS(&nproc);
    MP_MYID(&me);

    while ((ch = getopt(argc, argv, "n:b:p:h")) != -1) {
        switch(ch) {
        case 'n':
            n = atoi(optarg);
            break;
        case 'b':
            block_size = atoi(optarg);
            break;
        case 'p':
            nproc = atoi(optarg);
            break;
        case 'h': {
            printf("Usage: LU, or \n");
            printf("       LU -nMATRIXSIZE -bBLOCKSIZE -pNPROC\n");
            MP_BARRIER();
            MP_FINALIZE();
            exit(0);
        }
        }
    }

    if(me == 0) {
        printf("\n Blocked Dense LU Factorization\n");
        printf("     %d by %d Matrix\n", n, n);
        printf("     %d Processors\n", nproc);
        printf("     %d by %d Element Blocks\n", block_size, block_size);
        printf("\n");
    }

    num_rows = (int) sqrt((double) nproc);
    for (;;) {
        num_cols = nproc/num_rows;
        if (num_rows*num_cols == nproc)
            break;
        num_rows--;
    }

    nblocks = n/block_size;
    if (block_size * nblocks != n) {
        nblocks++;
    }

    edge = n%block_size;
    if (edge == 0) {
        edge = block_size;
    }

#ifdef DEBUG
    if(me == 0)
        for (i=0; i<nblocks; i++) {
            for (j=0; j<nblocks; j++)
                printf("%d ", block_owner(i, j));
            printf("\n");
        }
    MP_BARRIER();
    MP_FINALIZE();
    exit(0);
#endif

    for (i=0; i<nblocks; i++) {
        for (j=0; j<nblocks; j++) {
            if(block_owner(i,j) == me) {
                if ((i == nblocks-1) && (j == nblocks-1)) {
                    size = edge*edge;
                }
                else if ((i == nblocks-1) || (j == nblocks-1)) {
                    size = edge*block_size;
                }
                else {
                    size = block_size*block_size;
                }
                proc_bytes += size*sizeof(double);
            }
        }
    }

    ptr = (void **)malloc(nproc * sizeof(void *));
#ifdef MPI2_ONESIDED
    MPI_Alloc_mem(proc_bytes, MPI_INFO_NULL, &ptr[me]);
    MPI_Win_create((void*)ptr[me], proc_bytes, 1, MPI_INFO_NULL,
                   MPI_COMM_WORLD, &win);
    for(i=0; i<nproc; i++) ptr[i] = (double *)ptr[me];
    MPI_Barrier(MPI_COMM_WORLD);

#else
    /* initialize ARMCI */
    ARMCI_Init();
    ARMCI_Malloc(ptr, proc_bytes);
#endif

    a = (double **)malloc(nblocks*nblocks*sizeof(double *));
    if (a == NULL) {
        fprintf(stderr, "Could not malloc memory for a\n");
        exit(-1);
    }
    ptr_loc = (double **)malloc(nproc*sizeof(double *));
    for(i=0; i<nproc; i++) ptr_loc[i] = (double *)ptr[i];
    for(i=0; i<nblocks; i ++) {
        for(j=0; j<nblocks; j++) {
            a[i+j*nblocks] = ptr_loc[block_owner(i, j)];
            if ((i == nblocks-1) && (j == nblocks-1)) {
                size = edge*edge;
            } else if ((i == nblocks-1) || (j == nblocks-1)) {
                size = edge*block_size;
            } else {
                size = block_size*block_size;
            }
            ptr_loc[block_owner(i, j)] += size;
        }
    }

    /* initialize the array */
    init_array();

    /* barrier to ensure all initialization is done */
    MP_BARRIER();

    /* to remove cold-start misses, all processors touch their own data */
    touch_array(block_size, me);
    MP_BARRIER();

    if(doprint) {
        if(me == 0) {
            printf("Matrix before LU decomposition\n");
            print_array(me);
        }
        MP_BARRIER();
    }

    lu(n, block_size, me); /* cold start */

    /* Starting the timer */

    MP_BARRIER();
    if(me == 0) start_timer();
    for(i=0; i<nloop; i++) lu(n, block_size, me);
    MP_BARRIER();

    /* Timer Stops here */
    if(me == 0)
        printf("\nRunning time = %lf milliseconds.\n\n",  elapsed_time()/nloop);
    printf("%d: (ngets=%d) Communication (get) time = %e milliseconds\n", me, get_cntr, comm_time*1000/nloop);

    if(doprint) {
        if(me == 0) {
            printf("after LU\n");
            print_array(me);
        }
        MP_BARRIER();
    }

    /* done */
#ifdef MPI2_ONESIDED
    MPI_Win_free(&win);
    MPI_Free_mem(ptr[me]);
#else
    ARMCI_Free(ptr[me]);
    ARMCI_Finalize();
#endif
    MP_FINALIZE();
}
Example #3
0
main(int argc, char *argv[])
{
  int i, j;
  int ch;
  extern char *optarg;
  int edge;
  int size;
    
  /* ARMCI */
  void **ptr;
  double **ptr_loc;
  void **bufr_g, **bufc_g;

  MP_INIT(arc,argv);
  MP_PROCS(&nproc);
  MP_MYID(&me);
    
  while ((ch = getopt(argc, argv, "n:b:p:h")) != -1) {
    switch(ch) {
    case 'n': n = atoi(optarg); break;
    case 'b': block_size = atoi(optarg); break;
    case 'p': nproc = atoi(optarg); break;
    case 'h': {
      printf("Usage: LU, or \n");
      printf("       LU -nMATRIXSIZE -bBLOCKSIZE -pNPROC\n");
      MP_BARRIER();
      MP_FINALIZE();
      exit(0);
    }            
    }
  }
    
  if(me == 0) {
    printf("\nUsing pre-PUTing\n");
    printf("\n Blocked Dense LU Factorization\n");
    printf("     %d by %d Matrix\n", n, n);
    printf("     %d Processors\n", nproc);
    printf("     %d by %d Element Blocks\n", block_size, block_size);
    printf("\n");
  }
    
  num_rows = (int) sqrt((double) nproc);
  for (;;) {
    num_cols = nproc/num_rows;
    if (num_rows*num_cols == nproc)
      break;
    num_rows--;
  }
    
  nblocks = n/block_size;
  if (block_size * nblocks != n) {
    nblocks++;
  }
    
  edge = n%block_size;
  if (edge == 0) {
    edge = block_size;
  }
    
  #ifdef DEBUG
  if(me == 0)
    for (i=0;i<nblocks;i++) {
      for (j=0;j<nblocks;j++) 
	printf("%d ", block_owner(i, j));
      printf("\n");
    }
  MP_BARRIER();
  MP_FINALIZE();
  exit(0);
  #endif
    
  for (i=0;i<nblocks;i++) {
    for (j=0;j<nblocks;j++) {
      if(block_owner(i,j) == me) {
	if ((i == nblocks-1) && (j == nblocks-1)) {
	  size = edge*edge;
	}
	else if ((i == nblocks-1) || (j == nblocks-1)) {
	  size = edge*block_size;
	}
	else {
	  size = block_size*block_size;
	}
	proc_bytes += size*sizeof(double);
      }
    }
  }
    
  /* initialize ARMCI */
  ARMCI_Init();
  ptr = (void **)ARMCI_Malloc_local(nproc * sizeof(void *));
  ARMCI_Malloc(ptr, proc_bytes);
  
  a = (double **)ARMCI_Malloc_local(nblocks*nblocks*sizeof(double *));
  if (a == NULL) {
    fprintf(stderr, "Could not malloc memory for a\n");
    exit(-1);
  } 
  ptr_loc = (double **)ARMCI_Malloc_local(nproc*sizeof(double *));
  for(i=0; i<nproc; i++) ptr_loc[i] = (double *)ptr[i];
  for(i=0; i<nblocks;i ++) {
    for(j=0; j<nblocks; j++) {
      a[i+j*nblocks] = ptr_loc[block_owner(i, j)];
      if ((i == nblocks-1) && (j == nblocks-1)) {
	size = edge*edge;
      } else if ((i == nblocks-1) || (j == nblocks-1)) {
	size = edge*block_size;
      } else {
	size = block_size*block_size;
      }
      ptr_loc[block_owner(i, j)] += size;
    }
  }
    
  /* initialize the array */
  init_array();
  
  bufr = (double **)ARMCI_Malloc_local(nproc*nblocks * sizeof(double *));
  bufc = (double **)ARMCI_Malloc_local(nproc*nblocks * sizeof(double *));

  if (bufr == NULL || bufc == NULL)
    printf("Could not ARMCI_Malloc_local() mem\n");
  /* bufr points to all k-th row blocks */
  /* save all block address in row-major order */
  proc_bytes = nblocks*block_size*block_size * sizeof(double);
  bufr_g = (void **)ARMCI_Malloc_local(nproc * sizeof(void *));
  ARMCI_Malloc(bufr_g, proc_bytes);

  for (i = 0; i < nproc; i++) {
    bufr[i*nblocks] = (double *) bufr_g[i];
    for (j = 1; j < nblocks; j++) {
      bufr[i*nblocks + j]  = bufr[i*nblocks + j-1] + block_size * block_size;
    }
  }

  /* bufc points to all k-th column blocks */
  bufc_g = (void **)ARMCI_Malloc_local(nproc * sizeof(void *));
  ARMCI_Malloc(bufc_g, proc_bytes);

  for (i = 0; i < nproc; i++) {
    bufc[i*nblocks] = (double *) bufc_g[i];
    for (j = 1; j < nblocks; j++) {
      bufc[i*nblocks + j]  = bufc[i*nblocks + j-1] + block_size * block_size;
    }
  }

  /* barrier to ensure all initialization is done */
  MP_BARRIER();

  /* to remove cold-start misses, all processors touch their own data */
  touch_array(block_size, me);
  MP_BARRIER();

  if(doprint) {
    if(me == 0) {
      printf("Matrix before LU decomposition\n");
      print_array(me); 
    }
    MP_BARRIER();
  }  

  /* Starting the timer */
  if(me == 0) start_timer();

  lu(n, block_size, me);
  
  MP_BARRIER();

  /* Timer Stops here */
  if(me == 0) 
  printf("\nRunning time = %lf milliseconds.\n\n",  elapsed_time());

  if(doprint) {        
    if(me == 0) {
      printf("after LU\n");
      print_array(me);
    }
    MP_BARRIER();
  }
    
  /* done */
  ARMCI_Free(ptr[me]);
  ARMCI_Free(bufc_g[me]);
  ARMCI_Free(bufr_g[me]);
  ARMCI_Finalize();
  MP_FINALIZE();
}