Ejemplo n.º 1
0
void print_array(int myid)
{
  int i, j, k;
  double **buf;

  int ii, jj;
  int edge;
  int ibs, jbs, skip;

  buf = (double **)ARMCI_Malloc_local(nblocks*nblocks*sizeof(double *));

  for(i=0; i<nblocks; i++) 
    for(j=0; j<nblocks; j++) 
      if(block_owner(i, j) == myid)
	buf[i+j*nblocks] = a[i+j*nblocks];
      else {
	buf[i+j*nblocks] = (double *)ARMCI_Malloc_local(block_size*block_size*
					    sizeof(double));
	get_remote(buf[i+j*nblocks], i, j);
      }

  /* copied from lu.C */
  edge = n%block_size;
  for (i=0; i<n; i++) {
    for (j=0; j<n; j++) {
      if ((n - i) <= edge) {
	ibs = edge;
	ibs = n-edge;
	skip = edge;
      } else {
	ibs = block_size;
	skip = block_size;
      }
      if ((n - j) <= edge) {
	jbs = edge;
	jbs = n-edge;
      } else {
	jbs = block_size;
      }
      ii = (i/block_size) + (j/block_size)*nblocks;
      jj = (i%ibs)+(j%jbs)*skip;
      printf("%8.1f ", buf[ii][jj]);   
    }
    printf("\n");
  }
  fflush(stdout);      
    
  for(i=0; i<nblocks; i++) 
    for(j=0; j<nblocks; j++) 
      if(block_owner(i, j) != myid) ARMCI_Free_local(buf[i+j*nblocks]);
  
  ARMCI_Free_local(buf);
}
Ejemplo n.º 2
0
int main(int argc, char * argv[]) {
  void *baseAddress[MAX_PROCESSORS];
  char *local;
  int thisImage;

  int iter = 100, size;
  double startTime, endTime;
  int i;

  // initialize
  ARMCI_Init();
  ARMCI_Myid(&thisImage);

  // allocate data (collective operation)
  ARMCI_Malloc(baseAddress, MAX_BUF_SIZE*sizeof(char));
  local = (char *)ARMCI_Malloc_local(MAX_BUF_SIZE*sizeof(char));

  ARMCI_Barrier();
  ARMCI_Migrate();

  if (thisImage == 0) {
    for(size = 1; size <= MAX_BUF_SIZE; size = size<<1){
      startTime = CkWallTimer();
      for(i = 0; i < iter; i++){
        ARMCI_Put(local, baseAddress[1], size, 1);
      }
      ARMCI_Fence(1);
      endTime = CkWallTimer();
      printf("%d: %f us\n", size, (endTime-startTime)*1000);
    }
    ARMCI_Barrier();
  } else if (thisImage == 1) {
    ARMCI_Barrier();
  }

  
  ARMCI_Free(baseAddress[thisImage]);
  ARMCI_Free_local(local);
  // finalize
  ARMCI_Finalize();
  return 0;
}
Ejemplo n.º 3
0
int main(int argc, char **argv) {
    int i, j, rank, nranks, peer, bufsize, errors;
    double **buffer, *src_buf;
    int count[2], src_stride, trg_stride, stride_level;

    MPI_Init(&argc, &argv);
    ARMCI_Init();

    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nranks);

    buffer = (double **) malloc(sizeof(double *) * nranks);

    bufsize = XDIM * YDIM * sizeof(double);
    ARMCI_Malloc((void **) buffer, bufsize);
    src_buf = ARMCI_Malloc_local(bufsize);

    if (rank == 0)
        printf("ARMCI Strided Put Test:\n");

    src_stride = XDIM * sizeof(double);
    trg_stride = XDIM * sizeof(double);
    stride_level = 1;

    count[1] = YDIM;
    count[0] = XDIM * sizeof(double);

    ARMCI_Barrier();

    peer = (rank+1) % nranks;

    for (i = 0; i < ITERATIONS; i++) {

      for (j = 0; j < XDIM*YDIM; j++) {
        *(src_buf + j) = rank + i;
      }

      ARMCI_PutS(
          src_buf,
          &src_stride,
          (void *) buffer[peer],
          &trg_stride,
          count,
          stride_level,
          peer);
    }

    ARMCI_Barrier();

    ARMCI_Access_begin(buffer[rank]);
    for (i = errors = 0; i < XDIM; i++) {
      for (j = 0; j < YDIM; j++) {
        const double actual   = *(buffer[rank] + i + j*XDIM);
        const double expected = (1.0 + rank) + (1.0 + ((rank+nranks-1)%nranks)) + (ITERATIONS);
        if (actual - expected > 1e-10) {
          printf("%d: Data validation failed at [%d, %d] expected=%f actual=%f\n",
              rank, j, i, expected, actual);
          errors++;
          fflush(stdout);
        }
      }
    }
    ARMCI_Access_end(buffer[rank]);

    ARMCI_Free((void *) buffer[rank]);
    ARMCI_Free_local(src_buf);
    free(buffer);

    ARMCI_Finalize();
    MPI_Finalize();

    if (errors == 0) {
      printf("%d: Success\n", rank);
      return 0;
    } else {
      printf("%d: Fail\n", rank);
      return 1;
    }
}
Ejemplo n.º 4
0
int main(int argc, char* argv[])
{
    int provided;
    int i, rank, nranks, msgsize, target;
    long bufsize;
    int **counter;
    int *complete;
    int increment;
    int counter_fetch;
    int counters_received;
    int t_start, t_stop, t_latency;
    int expected;

    MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);

    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nranks);

    ARMCI_Init_args(&argc, &argv);

    complete = (int *) malloc(sizeof(int) * COUNT);

    counter = (int**) ARMCI_Malloc_local( nranks * sizeof(int*) );
    ARMCI_Malloc((void *) counter[rank], sizeof(int));

    if (rank == 0)
    {
        printf("ARMCI_RMW Test - in usec \n");
        fflush(stdout);
    }

    target = 0; 

    for(i=0; i<COUNT; i++)
    {
       complete[i] = 0;
    } 
    if(rank == target) 
    { 
       *(counter[rank]) = 0;
    }
    increment = 1;
    counter_fetch = 0;
    counters_received = 0;

    MPI_Barrier(MPI_COMM_WORLD);
 
    while(counter_fetch < COUNT)
    {  
        ARMCI_Rmw(ARMCI_FETCH_AND_ADD,
                  (void *) &counter_fetch,
                  (void *) counter[target],
                  increment,
                  target);

        /* s/1/rank/ means we will know who got the counter */
        if (counter_fetch < COUNT) complete[counter_fetch] = rank;
        counters_received++;
    }

    MPI_Allreduce(MPI_IN_PLACE,complete,COUNT,MPI_INT,MPI_SUM,MPI_COMM_WORLD);

    for(i=0; i<COUNT; i++)
    {
       if (complete[i] == 0)
       {
           printf("[%d] The RMW update failed at index: %d \n", rank, i);
           fflush(stdout);
           exit(-1);
       }   
    }
    printf("[%d] The RMW update completed successfully \n", rank);
    fflush(stdout);
    MPI_Barrier(MPI_COMM_WORLD);

    if (0==rank)
    {
        printf("Checking for fairness...\n", rank);
        fflush(stdout);
        for(i=0; i<COUNT; i++)
        {
           printf("counter value %d was received by process %d\n", i, complete[i]);
        }
        fflush(stdout);
    }
    MPI_Barrier(MPI_COMM_WORLD);

    printf("process %d received %d counters\n", rank, counters_received);
    fflush(stdout);

    ARMCI_Free(counter[rank]);
    ARMCI_Free_local(counter);

    ARMCI_Finalize();

    MPI_Finalize();

    return 0;
}
Ejemplo n.º 5
0
void test_2D()
{
    int i;
    int src, dst;
    int ierr;
    double *buf;
    void *ptr[MAXPROC], *get_ptr[MAXPROC];

    /* find who I am and the dst process */
    src = me;
    
#ifdef MALLOC_LOC
    if(me == 0) {
        buf = (double *)ARMCI_Malloc_local(SIZE * SIZE * sizeof(double));
        assert(buf != NULL);
    }
#else
    if(me == 0) {
        buf = (double *)malloc(SIZE * SIZE * sizeof(double));
        assert(buf != NULL);
    }
#endif

    ierr = ARMCI_Malloc(ptr, (SIZE * SIZE * sizeof(double)));
    assert(ierr == 0); assert(ptr[me]);
    ierr = ARMCI_Malloc(get_ptr, (SIZE * SIZE * sizeof(double)));
    assert(ierr == 0); assert(get_ptr[me]);
    
    /* ARMCI - initialize the data window */
    fill_array(ptr[me], SIZE*SIZE, me);
    fill_array(get_ptr[me], SIZE*SIZE, me);

    MP_BARRIER();
    
    /* only the proc 0 doest the work */
    /* print the title */
    if(me == 0) {
        if(!CHECK_RESULT){
           printf("  section               get                 put");
           printf("                 acc\n");
           printf("bytes   loop       sec      MB/s       sec      MB/s");
           printf("       sec      MB/s\n");
           printf("------- ------  --------  --------  --------  --------");
           printf("  --------  --------\n");
           fflush(stdout);
        }
        
        for(i=0; i<CHUNK_NUM; i++) {
            int loop;
            int bytes = chunk[i] * chunk[i] * sizeof(double);

            double t_get = 0, t_put = 0, t_acc = 0;
            double latency_get, latency_put, latency_acc;
            double bandwidth_get, bandwidth_put, bandwidth_acc;
            
            loop = SIZE / chunk[i];
            if(loop<2)loop=2;

            for(dst=1; dst<nproc; dst++) {
                /* strided get */
                fill_array(buf, SIZE*SIZE, me*10);
                t_get += time_get((double *)(get_ptr[dst]), (double *)buf,
                                 chunk[i], loop, dst, 1);
 
                /* strided put */
                fill_array(buf, SIZE*SIZE, me*10);
                t_put += time_put((double *)buf, (double *)(ptr[dst]),
                                 chunk[i], loop, dst, 1);
                
                /* strided acc */
                fill_array(buf, SIZE*SIZE, me*10);
                t_acc += time_acc((double *)buf, (double *)(ptr[dst]),
                                 chunk[i], loop, dst, 1);
            }
            
            latency_get = t_get/(nproc - 1);
            latency_put = t_put/(nproc - 1);
            latency_acc = t_acc/(nproc - 1);
            
            bandwidth_get = (bytes * (nproc - 1) * 1e-6)/t_get;
            bandwidth_put = (bytes * (nproc - 1) * 1e-6)/t_put;
            bandwidth_acc = (bytes * (nproc - 1) * 1e-6)/t_acc;

            /* print */
            if(!CHECK_RESULT)printf("%d\t%d\t%.2e  %.2e  %.2e  %.2e  %.2e  %.2e\n",
                       bytes, loop, latency_get, bandwidth_get,
                       latency_put, bandwidth_put, latency_acc, bandwidth_acc);
        }
    }
    else sleep(3);
    
    ARMCI_AllFence();
    MP_BARRIER();

    /* cleanup */
    ARMCI_Free(get_ptr[me]);
    ARMCI_Free(ptr[me]);

#ifdef MALLOC_LOC
    if(me == 0) ARMCI_Free_local(buf);
#else
    if(me == 0) free(buf);
#endif

}
Ejemplo n.º 6
0
int main(int argc, char **argv)
{

    int i, j, rank, nranks, peer;
    size_t xdim, ydim;
    unsigned long bufsize;
    double **buffer, *src_buf;
    double t_start=0.0, t_stop;
    int count[2], src_stride, trg_stride, stride_level;
    double scaling;
    int provided;

    MPI_Init_thread(&argc, &argv, MPI_THREAD_SINGLE, &provided);

    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nranks);

    if (nranks < 2) {
        printf("%s: Must be run with at least 2 processes\n", argv[0]);
        MPI_Abort(MPI_COMM_WORLD, 1);
    }

    ARMCI_Init_args(&argc, &argv);

    buffer = (double **) malloc(sizeof(double *) * nranks);

    bufsize = MAX_XDIM * MAX_YDIM * sizeof(double);
    ARMCI_Malloc((void **) buffer, bufsize);
    src_buf = ARMCI_Malloc_local(bufsize);

    if (rank == 0)
    {
        printf("ARMCI_AccS Latency - local and remote completions - in usec \n");
        printf("%30s %22s %22s\n",
               "Dimensions(array of double)",
               "Local Completion",
               "Remote completion");
        fflush(stdout);
    }

    ARMCI_Access_begin(buffer[rank]);
    for (i = 0; i < bufsize / sizeof(double); i++)
    {
      *(buffer[rank] + i) = 1.0 + rank;
      *(src_buf + i) = 1.0 + rank;
    }
    ARMCI_Access_end(buffer[rank]);

    scaling = 2.0;

    src_stride = MAX_YDIM * sizeof(double);
    trg_stride = MAX_YDIM * sizeof(double);
    stride_level = 1;

    ARMCI_Barrier();

    for (xdim = 1; xdim <= MAX_XDIM; xdim *= 2)
    {

        count[1] = xdim;

        for (ydim = 1; ydim <= MAX_YDIM; ydim *= 2)
        {

            count[0] = ydim * sizeof(double);

            if (rank == 0)
            {

                peer = 1;

                for (i = 0; i < ITERATIONS + SKIP; i++)
                {

                    if (i == SKIP) t_start = MPI_Wtime();

                    ARMCI_AccS(ARMCI_ACC_DBL,
                               (void *) &scaling,
                               /* (void *) buffer[rank] */ src_buf,
                               &src_stride,
                               (void *) buffer[peer],
                               &trg_stride,
                               count,
                               stride_level,
                               1);

                }
                t_stop = MPI_Wtime();
                ARMCI_Fence(1);

                char temp[10];
                sprintf(temp, "%dX%d", (int) xdim, (int) ydim);
                printf("%30s %20.2f ", temp, ((t_stop - t_start) * 1000000)
                        / ITERATIONS);
                fflush(stdout);

                ARMCI_Barrier();

                ARMCI_Barrier();

                for (i = 0; i < ITERATIONS + SKIP; i++)
                {

                    if (i == SKIP) t_start = MPI_Wtime();

                    ARMCI_AccS(ARMCI_ACC_DBL,
                               (void *) &scaling,
                               /* (void *) buffer[rank] */ src_buf,
                               &src_stride,
                               (void *) buffer[peer],
                               &trg_stride,
                               count,
                               stride_level,
                               1);
                    ARMCI_Fence(1);

                }
                t_stop = MPI_Wtime();
                printf("%20.2f \n", ((t_stop - t_start) * 1000000) / ITERATIONS);
                fflush(stdout);

                ARMCI_Barrier();

                ARMCI_Barrier();

            }
            else
            {

                peer = 0;

                ARMCI_Barrier();

                if (rank == 1) 
                {
                  ARMCI_Access_begin(buffer[rank]);
                  for (i = 0; i < xdim; i++)
                  {
                    for (j = 0; j < ydim; j++)
                    {
                      if (*(buffer[rank] + i * MAX_XDIM + j) != ((1.0 + rank)
                            + scaling * (1.0 + peer) * (ITERATIONS + SKIP)))
                      {
                        printf("Data validation failed at X: %d Y: %d Expected : %f Actual : %f \n",
                            i,
                            j,
                            ((1.0 + rank) + scaling * (1.0 + peer)),
                            *(buffer[rank] + i * MAX_YDIM + j));
                        fflush(stdout);
                        ARMCI_Error("Bailing out", 1);
                      }
                    }
                  }

                  for (i = 0; i < bufsize / sizeof(double); i++)
                  {
                    *(buffer[rank] + i) = 1.0 + rank;
                  }
                  ARMCI_Access_end(buffer[rank]);
                }

                ARMCI_Barrier();

                ARMCI_Barrier();

                if (rank == 1) 
                {
                  ARMCI_Access_begin(buffer[rank]);

                  for (i = 0; i < xdim; i++)
                  {
                    for (j = 0; j < ydim; j++)
                    {
                      if (*(buffer[rank] + i * MAX_XDIM + j) != ((1.0 + rank)
                            + scaling * (1.0 + peer) * (ITERATIONS + SKIP)))
                      {
                        printf("Data validation failed at X: %d Y: %d Expected : %f Actual : %f \n",
                            i,
                            j,
                            ((1.0 + rank) + scaling * (1.0 + peer)),
                            *(buffer[rank] + i * MAX_YDIM + j));
                        fflush(stdout);
                        ARMCI_Error("Bailing out", 1);
                      }
                    }
                  }

                  for (i = 0; i < bufsize / sizeof(double); i++)
                  {
                    *(buffer[rank] + i) = 1.0 + rank;
                  }

                  ARMCI_Access_end(buffer[rank]);
                }
                ARMCI_Barrier();

            }

        }

    }

    ARMCI_Barrier();

    ARMCI_Free((void *) buffer[rank]);
    ARMCI_Free_local(src_buf);
    free(buffer);

    ARMCI_Finalize();

    MPI_Finalize();

    return 0;
}
Ejemplo n.º 7
0
int main(int argc, char **argv)
{
	int me,nproc;
    int status;
    int rank;

    /* initialization */
    MPI_Init(&argc, &argv);
    ARMCI_Init();

#ifdef HPC_PROFILING
    HPM_Init();
#endif

    MPI_Comm_rank(MPI_COMM_WORLD,&me);
    MPI_Comm_size(MPI_COMM_WORLD,&nproc);

#ifdef DEBUG
    if(me == 0){
       printf("The result of MPI_Comm_size is %d\n",nproc);
       fflush(stdout);
    }
#endif

    /* get the matrix parameters */
    if (argc > 1){
        rank = atoi(argv[1]);
    } else {
        rank = 8;
    }
    if (me == 0){
        printf("Running matmul.x with rank = %d\n",rank);
        fflush(stdout);
    }

    /* register remote pointers */
    double** addr_A = (double **) ARMCI_Malloc_local(sizeof(double *) * nproc);
    if (addr_A == NULL) ARMCI_Error("malloc A failed at line",0);

    double** addr_B = (double **) ARMCI_Malloc_local(sizeof(double *) * nproc);
    if (addr_B == NULL) ARMCI_Error("malloc B failed at line",0);

    double** addr_C = (double **) ARMCI_Malloc_local(sizeof(double *) * nproc);
    if (addr_C == NULL) ARMCI_Error("malloc C failed at line",0);

#ifdef DEBUG
    if(me == 0) printf("ARMCI_Malloc A requests %lu bytes\n",rank*rank*sizeof(double));
    fflush(stdout);
#endif
    status = ARMCI_Malloc((void **) addr_A, rank*rank*sizeof(double));
    if (status != 0) ARMCI_Error("ARMCI_Malloc A failed",status);

#ifdef DEBUG
    if(me == 0) printf("ARMCI_Malloc B requests %lu bytes\n",rank*rank*sizeof(double));
    fflush(stdout);
#endif
    status = ARMCI_Malloc((void **) addr_B, rank*rank*sizeof(double));
    if (status != 0) ARMCI_Error("ARMCI_Malloc B failed",status);

#ifdef DEBUG
    if(me == 0) printf("ARMCI_Malloc C requests %lu bytes\n",rank*rank*sizeof(double));
    fflush(stdout);
#endif
    status = ARMCI_Malloc((void **) addr_C, rank*rank*sizeof(double));
    if (status != 0) ARMCI_Error("ARMCI_Malloc C failed",status);

    MPI_Barrier(MPI_COMM_WORLD);

    /* free ARMCI pointers */
    ARMCI_Free_local(addr_C);
    ARMCI_Free_local(addr_B);
    ARMCI_Free_local(addr_A);

#ifdef HPC_PROFILING
    HPM_Print();
#endif

    /* the end */
    ARMCI_Finalize();
    MPI_Finalize();

    return(0);
}
Ejemplo n.º 8
0
void TRANSPOSE1D() {
    
    int dims[1];
    int nelem, i, ierr, min, max, cmin, cmax, lmin, lmax, pmin, pmax;    
    int src_offset, dst_offset, length;
    int *buf, *map;
    void *src_ptr, *dst_ptr;
    void **a_ptr, **b_ptr;
    int *a, *b;

    /* Find local processor ID and number of processors */
    int me, nprocs;
    me     = armci_msg_me();
    nprocs = armci_msg_nproc();

    /* Allocate pointers to data on all processors */
    a_ptr = (void**)malloc(nprocs*sizeof(int*));
    b_ptr = (void**)malloc(nprocs*sizeof(int*));
    map = (int*)malloc(nprocs*sizeof(int));

    /* Configure array dimensions. Force an unequal data distribution */
    dims[0]  = nprocs*TOTALELEMS + nprocs/2;
    if (me == 0) printf("Size of array: %d\n\n",dims[0]);
    /* Find first (zero-based) index of chunk owned by each processor and
       store it in map array */
    for (i=0; i<nprocs; i++) {
      map[i] = (int)(((double)i)*(((double)dims[0])/((double)nprocs)));
    }

    /* Figure out what size my portion of array is */
    if (me<nprocs-1) {
      nelem = map[me+1]-map[me];
    } else {
      nelem = dims[0]-map[me];
    }

    /* Allocate memory for array A */
    ierr = ARMCI_Malloc(a_ptr, nelem*sizeof(int));
    assert(ierr == 0);
    assert(a_ptr[me]);

    /* Allocate memory for array B */
    ierr = ARMCI_Malloc(b_ptr, nelem*sizeof(int));
    assert(ierr == 0);
    assert(b_ptr[me]);
    
    /* initialize data in array A and zero data in array B */
    a = (int*)a_ptr[me];
    b = (int*)b_ptr[me];
    for (i=0; i<nelem; i++) {
      a[i] = i + map[me] + 1;
      b[i] = 0;
    }

    /* Synchronize all processors to guarantee that everyone has data
       before proceeding to the next step. */
    armci_msg_barrier();

    /* Create local buffer for performing inversion */
    buf = (int*)ARMCI_Malloc_local(nelem*sizeof(int));

    /* Copy inverted data into local buffer */
    a = (int*)a_ptr[me];
    for (i=0; i<nelem; i++) {
      buf[i] = a[nelem-i-1]; 
    }

    /* Find out which blocks of array B inverted block should be copied to.
       Start by finding min and max indices of data in array B*/
    min = dims[0] - (map[me] + nelem);
    max = dims[0] - map[me] - 1;

    /* Locate processors containing the endpoints */
    pmin = 0;
    for (i=0; i<nprocs; i++) {
      if (min >= map[i]) {
        pmin = i;
      } else {
        break;
      }
    }
    pmax = nprocs-1;
    for (i=nprocs-2; i>=0; i--) {
      if (max < map[i+1]) {
        pmax = i;
      } else {
        break;
      }
    }

    int loop = 4, k;
    int warmup = 2;

    double t_start, t_end;
    for (k = 0; k < loop +warmup; ++k) {
        
            if (warmup == k) {
                t_start = dclock();
            }

        /* Loop over processors that will receive data and copy inverted data to
           processors */
        for (i=pmin; i<=pmax; i++) {
            /* Find min and max indices owned by processor i */
            lmin = map[i];
            if (i<nprocs-1) {
                lmax = map[i+1]-1;
            } else {
                lmax = dims[0]-1;
            }

            /* Find min and max indices that should be sent to processor i */
            if (lmin > min) {
                cmin = lmin;
            } else {
                cmin = min;
            }
            if (lmax < max) {
                cmax = lmax;
            } else {
                cmax = max;
            }

            /* Find offsets on source and destination processors */
            src_offset = cmin - min;
            src_ptr = (void*)(buf + src_offset);
            dst_offset = cmin - lmin;
            dst_ptr = ((char*)b_ptr[i]) + sizeof(int)*dst_offset;

            /* Find length of data (in bytes) to be sent to processor i */
            length = sizeof(int)*(cmax-cmin+1);

            /* Send data to processor */
            ARMCI_Put(src_ptr, dst_ptr, length, i);
        }
        ARMCI_AllFence();
        armci_msg_barrier();
    }

    t_end = dclock();

    if (0 == me)
        printf("Procs = [%d], Time[%6.2f]ms\n", nprocs, (t_end - t_start)  / (loop * 1.0e3));
    ARMCI_Free_local(buf);

    VERIFY(b_ptr, dims, map);

    free(map);
    armci_msg_barrier();
    ARMCI_Free(a_ptr[me]);
    ARMCI_Free(b_ptr[me]);
    free(a_ptr);
    free(b_ptr);
}
Ejemplo n.º 9
0
main(int argc, char *argv[])
{
  int i, j;
  int ch;
  extern char *optarg;
  int edge;
  int size;
    
  /* ARMCI */
  void **ptr;
  double **ptr_loc;
  void **bufr_g, **bufc_g;

  MP_INIT(arc,argv);
  MP_PROCS(&nproc);
  MP_MYID(&me);
    
  while ((ch = getopt(argc, argv, "n:b:p:h")) != -1) {
    switch(ch) {
    case 'n': n = atoi(optarg); break;
    case 'b': block_size = atoi(optarg); break;
    case 'p': nproc = atoi(optarg); break;
    case 'h': {
      printf("Usage: LU, or \n");
      printf("       LU -nMATRIXSIZE -bBLOCKSIZE -pNPROC\n");
      MP_BARRIER();
      MP_FINALIZE();
      exit(0);
    }            
    }
  }
    
  if(me == 0) {
    printf("\nUsing pre-PUTing\n");
    printf("\n Blocked Dense LU Factorization\n");
    printf("     %d by %d Matrix\n", n, n);
    printf("     %d Processors\n", nproc);
    printf("     %d by %d Element Blocks\n", block_size, block_size);
    printf("\n");
  }
    
  num_rows = (int) sqrt((double) nproc);
  for (;;) {
    num_cols = nproc/num_rows;
    if (num_rows*num_cols == nproc)
      break;
    num_rows--;
  }
    
  nblocks = n/block_size;
  if (block_size * nblocks != n) {
    nblocks++;
  }
    
  edge = n%block_size;
  if (edge == 0) {
    edge = block_size;
  }
    
  #ifdef DEBUG
  if(me == 0)
    for (i=0;i<nblocks;i++) {
      for (j=0;j<nblocks;j++) 
	printf("%d ", block_owner(i, j));
      printf("\n");
    }
  MP_BARRIER();
  MP_FINALIZE();
  exit(0);
  #endif
    
  for (i=0;i<nblocks;i++) {
    for (j=0;j<nblocks;j++) {
      if(block_owner(i,j) == me) {
	if ((i == nblocks-1) && (j == nblocks-1)) {
	  size = edge*edge;
	}
	else if ((i == nblocks-1) || (j == nblocks-1)) {
	  size = edge*block_size;
	}
	else {
	  size = block_size*block_size;
	}
	proc_bytes += size*sizeof(double);
      }
    }
  }
    
  /* initialize ARMCI */
  ARMCI_Init();
  ptr = (void **)ARMCI_Malloc_local(nproc * sizeof(void *));
  ARMCI_Malloc(ptr, proc_bytes);
  
  a = (double **)ARMCI_Malloc_local(nblocks*nblocks*sizeof(double *));
  if (a == NULL) {
    fprintf(stderr, "Could not malloc memory for a\n");
    exit(-1);
  } 
  ptr_loc = (double **)ARMCI_Malloc_local(nproc*sizeof(double *));
  for(i=0; i<nproc; i++) ptr_loc[i] = (double *)ptr[i];
  for(i=0; i<nblocks;i ++) {
    for(j=0; j<nblocks; j++) {
      a[i+j*nblocks] = ptr_loc[block_owner(i, j)];
      if ((i == nblocks-1) && (j == nblocks-1)) {
	size = edge*edge;
      } else if ((i == nblocks-1) || (j == nblocks-1)) {
	size = edge*block_size;
      } else {
	size = block_size*block_size;
      }
      ptr_loc[block_owner(i, j)] += size;
    }
  }
    
  /* initialize the array */
  init_array();
  
  bufr = (double **)ARMCI_Malloc_local(nproc*nblocks * sizeof(double *));
  bufc = (double **)ARMCI_Malloc_local(nproc*nblocks * sizeof(double *));

  if (bufr == NULL || bufc == NULL)
    printf("Could not ARMCI_Malloc_local() mem\n");
  /* bufr points to all k-th row blocks */
  /* save all block address in row-major order */
  proc_bytes = nblocks*block_size*block_size * sizeof(double);
  bufr_g = (void **)ARMCI_Malloc_local(nproc * sizeof(void *));
  ARMCI_Malloc(bufr_g, proc_bytes);

  for (i = 0; i < nproc; i++) {
    bufr[i*nblocks] = (double *) bufr_g[i];
    for (j = 1; j < nblocks; j++) {
      bufr[i*nblocks + j]  = bufr[i*nblocks + j-1] + block_size * block_size;
    }
  }

  /* bufc points to all k-th column blocks */
  bufc_g = (void **)ARMCI_Malloc_local(nproc * sizeof(void *));
  ARMCI_Malloc(bufc_g, proc_bytes);

  for (i = 0; i < nproc; i++) {
    bufc[i*nblocks] = (double *) bufc_g[i];
    for (j = 1; j < nblocks; j++) {
      bufc[i*nblocks + j]  = bufc[i*nblocks + j-1] + block_size * block_size;
    }
  }

  /* barrier to ensure all initialization is done */
  MP_BARRIER();

  /* to remove cold-start misses, all processors touch their own data */
  touch_array(block_size, me);
  MP_BARRIER();

  if(doprint) {
    if(me == 0) {
      printf("Matrix before LU decomposition\n");
      print_array(me); 
    }
    MP_BARRIER();
  }  

  /* Starting the timer */
  if(me == 0) start_timer();

  lu(n, block_size, me);
  
  MP_BARRIER();

  /* Timer Stops here */
  if(me == 0) 
  printf("\nRunning time = %lf milliseconds.\n\n",  elapsed_time());

  if(doprint) {        
    if(me == 0) {
      printf("after LU\n");
      print_array(me);
    }
    MP_BARRIER();
  }
    
  /* done */
  ARMCI_Free(ptr[me]);
  ARMCI_Free(bufc_g[me]);
  ARMCI_Free(bufr_g[me]);
  ARMCI_Finalize();
  MP_FINALIZE();
}
Ejemplo n.º 10
0
void lu(int n, int bs, int me)
{
  int i, il, j, jl, k, kl;
  int I, J, K;
  double *A, *B, *C, *D;
  int dimI, dimJ, dimK;
  int strI, strJ, strK;
  unsigned int t1, t2, t3, t4, t11, t22;
  int diagowner, destp, hc, m;
  double *dbuf;
  armci_hdl_t handle[2*MAXPROC];
  int saved[MAXPROC];  
  
  dbuf = (double *)ARMCI_Malloc_local((armci_size_t) block_size*block_size*sizeof(double));

  for (k=0, K=0; k<n; k+=bs, K++) {
    kl = k + bs; 
    if (kl > n) {
      kl = n;
      strK = kl - k;
    } else {
      strK = bs;
    }
    
    /* factor diagonal block */
    diagowner = block_owner(K, K);
    if (diagowner == me) {
      A = a[K+K*nblocks]; 
      lu0(A, strK, strK); /* impl algo on this diag block */
    }
    MP_BARRIER(); 
    
    /* divide column k by diagonal block */
    if(block_owner(K, K) == me)
      D = a[K+K*nblocks];
    else {
      D = dbuf;
      get_remote(D, K, K);
    }
    
    for (i=kl, I=K+1; i<n; i+=bs, I++) {
      if (block_owner(I, K) == me) {  /* parcel out blocks */
	il = i + bs; 
	if (il > n) {
	  il = n;
	  strI = il - i;
	} else {
	  strI = bs;
	}
	A = a[I+K*nblocks]; 
	bdiv(A, D, strI, strK, strI, strK);
	
	/* Pre-put this block to the block-owners of all blocks on the I-th row with a non-blocking put*/
	memset (saved, 0, sizeof(saved));
	for (m = K+1; m < nblocks; m++) {
	    destp = block_owner (I, m);
	    if (destp != me && !saved[destp]) {
	      ARMCI_NbPut(A, bufc[destp*nblocks + I], strI*strK*sizeof(double), destp, NULL);
	      saved[destp] = 1;
	    }
	}
      }
    } /* end of for (i=k1, I=K+1...) */
    
    /* modify row k by diagonal block */
    for (j=kl, J=K+1; j<n; j+=bs, J++) {
      if (block_owner(K, J) == me) {  /* parcel out blocks */
	jl = j+bs; 
	if (jl > n) {
	  jl = n;
	  strJ = jl - j;
	} else {
	  strJ = bs;
	}
	A = a[K+J*nblocks];
	bmodd(D, A, strK, strJ, strK, strK);
     
	/* Pre-put this block to the block-owners of all blocks on the J-th column with a non-blocking put*/
        memset (saved, 0, sizeof(saved));
        for (m = K+1; m < nblocks; m++) {
	  destp = block_owner (m, J);
	  if (destp != me  && !saved[destp]) {
	    ARMCI_NbPut(A, bufr[destp*nblocks + J], strK*strJ*sizeof(double), destp, NULL);
	    saved[destp] = 1;
	  }
	}
      }      
    }
        
    ARMCI_WaitAll();
    ARMCI_AllFence();
    MP_BARRIER();
    /* modify subsequent block columns */
    
    for (i=kl, I=K+1; i<n; i+=bs, I++) {
      il = i+bs; 
      if (il > n) {
	il = n;
	strI = il - i;
      } else {
	strI = bs;
      }

      for (j=kl, J=K+1; j<n; j+=bs, J++) {
	jl = j + bs; 
	if (jl > n) {
	  jl = n;
	  strJ= jl - j;
	} else {
	  strJ = bs;
	  }
	if (block_owner(I, J) == me) {  /* parcel out blocks */
	  if(block_owner(I,K) == me)
	    A = a[I+K*nblocks];
	  else {
	    A = bufc[me*nblocks+I];
          }
	  
	  if(block_owner(K,J) == me)
	    B = a[K+J*nblocks];
	  else
	    B = bufr[me*nblocks + J];
	    
	  C = a[I+J*nblocks];
	  bmod(A, B, C, strI, strJ, strK, strI, strK, strI);
	}
      }
    }
  }
  ARMCI_Free_local(dbuf);
}
Ejemplo n.º 11
0
void test_2D()
{
  int i = 0;
  int dst = 0;
  int g_a = 0;
  int shape[2] = {SIZE*nproc, SIZE};
  int dist[2] = {SIZE, SIZE};
  int lo[2] = {0,0};
  int hi[2] = {0,0};
  double *buf = NULL;

  /* allocate the GA */
#if defined(USE_ELEMENTAL)
  ElGlobalArraysCreate_d( eldga, 2, shape, "2d", dist, &g_a );
  ElGlobalArraysDistribution_d( eldga, g_a, me, lo, hi );
#else
  g_a = NGA_Create(C_DBL, 2, shape, "2d", dist);
  NGA_Distribution(g_a, me, lo, hi);
#endif
  assert(hi[0]-lo[0]+1 == SIZE);
  assert(hi[1]-lo[1]+1 == SIZE);

  /* memory allocation */
  if (me == 0) {
#if MALLOC_LOC
    buf = (double *)ARMCI_Malloc_local(SIZE * SIZE * sizeof(double));
    assert(buf != NULL);
#else
    buf = (double *)malloc(SIZE * SIZE * sizeof(double));
    assert(buf != NULL);
#endif
  }

  /* only the proc 0 does the work */
  if (me == 0) {
    if (!CHECK_RESULT) {
      printf("  section               get                 put                 acc\n");
      printf("bytes   loop      usec      MB/s      usec      MB/s      usec      MB/s\n");
      printf("------- ------  --------  --------  --------  --------  --------  --------\n");
      fflush(stdout);
    }

    for (i=0; i<CHUNK_NUM; ++i) {
      int loop;
      double intcal;
      int bytes = chunk[i] * chunk[i] * sizeof(double);
      double t_get = 0, t_put = 0, t_acc = 0;
      double latency_get, latency_put, latency_acc;
      double bandwidth_get, bandwidth_put, bandwidth_acc;

      intcal = (double)((double)(SIZE * SIZE) / (double)(chunk[i] * chunk[i]));
      loop = (int)(sqrt((double)intcal));
      if (loop < 2) {
        loop = 2;
      }

      for (dst=1; dst<nproc; ++dst) {
        /* strided get */
        fill_array(buf, SIZE * SIZE, me * 10);
        t_get += time_op(g_a, buf, chunk[i], loop, dst, 2, OP_GET);
        /* strided put */
        fill_array(buf, SIZE * SIZE, me * 10);
        t_put += time_op(g_a, buf, chunk[i], loop, dst, 2, OP_PUT);
        /* strided acc */
        fill_array(buf, SIZE * SIZE, me * 10);
        t_acc += time_op(g_a, buf, chunk[i], loop, dst, 2, OP_ACC);
      }

      latency_get = t_get / (nproc - 1);
      latency_put = t_put / (nproc - 1);
      latency_acc = t_acc / (nproc - 1);

      bandwidth_get = (bytes * (nproc - 1) * 1e-6) / t_get;
      bandwidth_put = (bytes * (nproc - 1) * 1e-6) / t_put;
      bandwidth_acc = (bytes * (nproc - 1) * 1e-6) / t_acc;

      /* print */
      if (!CHECK_RESULT) {
        printf("%d\t%d\t%.2e  %.2e  %.2e  %.2e  %.2e  %.2e\n",
               bytes, loop,
               latency_get / 1e-6, bandwidth_get,
               latency_put / 1e-6, bandwidth_put,
               latency_acc / 1e-6, bandwidth_acc);
      }
    }
  }
  else {
    sleep(3);
  }
#if defined(USE_ELEMENTAL)
  ElGlobalArraysSync_d( eldga );
#else
  GA_Sync();
#endif

#if ENABLE_CLEANUP
  /* cleanup */
#if defined(USE_ELEMENTAL)
  ElGlobalArraysDestroy_d( eldga, g_a );
#else
  GA_Destroy(g_a);
#endif
#endif

  if (me == 0) {
#if MALLOC_LOC
    ARMCI_Free_local(buf);
#else
    free(buf);
#endif
  }
}