Example No. 1
int main( int argc, char *argv[] )
{
    int errs = 0;
    int rank, size, source, dest;
    int minsize = 2, count, i; 
    MPI_Comm      comm;
    MPI_Win       win;
    MPI_Datatype  datatype;
    int           *winbuf, *sbuf;

    MTest_Init( &argc, &argv );

    /* The following illustrates the use of the routines to 
       run through a selection of communicators and datatypes.
       Use subsets of these for tests that do not involve combinations 
       of communicators, datatypes, and counts of datatypes */
    while (MTestGetIntracommGeneral( &comm, minsize, 1 )) {
	if (comm == MPI_COMM_NULL) continue;
	/* Determine the sender and receiver */
	MPI_Comm_rank( comm, &rank );
	MPI_Comm_size( comm, &size );
	source = 0;
	dest   = size - 1;
	
	for (count = 1; count < 65000; count = count * 2) {
	    datatype = MPI_INT;
	    /* We compare with an integer value that can be as large as
	       size * (count * count + (1/2)*(size-1))
	       For large machines (size large), this can exceed the 
	       maximum integer for some large values of count.  We check
	       that in advance and break this loop if the above value 
	       would exceed MAX_INT.  Specifically,

	       size*count*count + (1/2)*size*(size-1) > MAX_INT
	       count*count > (MAX_INT/size - (1/2)*(size-1))
	    */
	    if (count * count > (MAX_INT/size - (size-1)/2)) break;
	    winbuf = (int *)malloc( count * sizeof(int) );
	    sbuf   = (int *)malloc( count * sizeof(int) );

	    for (i=0; i<count; i++) winbuf[i] = 0;
	    for (i=0; i<count; i++) sbuf[i] = rank + i * count;
	    MPI_Win_create( winbuf, count * sizeof(int), sizeof(int),
			    MPI_INFO_NULL, comm, &win );
	    MPI_Win_fence( 0, win );
	    MPI_Accumulate( sbuf, count, MPI_INT, source, 0, count, MPI_INT,
				MPI_SUM, win );
	    MPI_Win_fence( 0, win );
	    if (rank == source) {
		/* Check the results */
		for (i=0; i<count; i++) {
		    int result = i * count * size + (size*(size-1))/2;
		    if (winbuf[i] != result) {
			if (errs < 10) {
			    fprintf( stderr, "Winbuf[%d] = %d, expected %d (count = %d, size = %d)\n",
				     i, winbuf[i], result, count, size );
			}
			errs++;
		    }
		}
	    }
	    free( winbuf );
	    free( sbuf );
	    MPI_Win_free( &win );
	}
        MTestFreeComm(&comm);
    }

    MTest_Finalize( errs );
    MPI_Finalize();
    return 0;
}
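The expected value checked above follows directly from the data each rank contributes: with MPI_SUM, every rank r adds sbuf[i] = r + i*count into winbuf[i] on the source, so

    winbuf[i] = sum over r = 0 .. size-1 of (r + i*count) = i*count*size + size*(size-1)/2

which is exactly the `result` computed in the verification loop, and whose largest value motivates the overflow guard on count at the top of the loop.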
Example No. 2
int main(int argc, char *argv[])
{
    int i, j, length, my_rank, left, right, size, test_value, mid;
    double start, finish, transfer_time;
    float snd_buf_left[max_length], snd_buf_right[max_length];
    float *rcv_buf_left, *rcv_buf_right;

    MPI_Win win_rcv_buf_left, win_rcv_buf_right;

    /* Naming conventions                                                                */
    /* Processes:                                                                        */
    /*     my_rank-1                        my_rank                         my_rank+1    */
    /* "left neighbor"                     "myself"                     "right neighbor" */
    /*   ...    rcv_buf_right <--- snd_buf_left snd_buf_right ---> rcv_buf_left    ...   */
    /*   ... snd_buf_right ---> rcv_buf_left       rcv_buf_right <--- snd_buf_left ...   */
    /*                        |                                  |                       */
    /*              halo-communication                 halo-communication                */

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    right = (my_rank+1)      % size;
    left  = (my_rank-1+size) % size;

    MPI_Alloc_mem((MPI_Aint)(max_length*sizeof(float)), MPI_INFO_NULL, &rcv_buf_left );
    MPI_Alloc_mem((MPI_Aint)(max_length*sizeof(float)), MPI_INFO_NULL, &rcv_buf_right);
    MPI_Win_create(rcv_buf_left,  (MPI_Aint)(max_length*sizeof(float)), sizeof(float), MPI_INFO_NULL, MPI_COMM_WORLD, &win_rcv_buf_left );
    MPI_Win_create(rcv_buf_right, (MPI_Aint)(max_length*sizeof(float)), sizeof(float), MPI_INFO_NULL, MPI_COMM_WORLD, &win_rcv_buf_right);

    if (my_rank == 0) printf("    message size      transfer time  duplex bandwidth per process and neighbor\n");

    length = start_length;

    for (j = 1; j <= number_package_sizes; j++)
    {

        for (i = 0; i <= number_of_messages; i++)
        {
            if(i==1) start = MPI_Wtime();

            test_value = j*1000000 + i*10000 + my_rank*10 ;
            mid = (length-1)/number_of_messages*i;

            snd_buf_left[0]=test_value+1  ;
            snd_buf_left[mid]=test_value+2  ;
            snd_buf_left[length-1]=test_value+3;
            snd_buf_right[0]=test_value+6 ;
            snd_buf_right[mid]=test_value+7 ;
            snd_buf_right[length-1]=test_value+8;

            MPI_Win_fence(MPI_MODE_NOSTORE + MPI_MODE_NOPRECEDE, win_rcv_buf_left );
            MPI_Win_fence(MPI_MODE_NOSTORE + MPI_MODE_NOPRECEDE, win_rcv_buf_right);

            MPI_Put(snd_buf_left,  length, MPI_FLOAT, left,  (MPI_Aint)0, length, MPI_FLOAT, win_rcv_buf_right);
            MPI_Put(snd_buf_right, length, MPI_FLOAT, right, (MPI_Aint)0, length, MPI_FLOAT, win_rcv_buf_left );

            MPI_Win_fence(MPI_MODE_NOSTORE + MPI_MODE_NOPUT + MPI_MODE_NOSUCCEED, win_rcv_buf_left );
            MPI_Win_fence(MPI_MODE_NOSTORE + MPI_MODE_NOPUT + MPI_MODE_NOSUCCEED, win_rcv_buf_right);

            /* snd_buf_... is reloaded below with the values that the neighbor process stored in its own snd_buf_..., so the received data can be checked against them */
            test_value = j*1000000 + i*10000 + left*10  ;
            mid = (length-1)/number_of_messages*i;
            snd_buf_right[0]=test_value+6 ;
            snd_buf_right[mid]=test_value+7 ;
            snd_buf_right[length-1]=test_value+8;
            test_value = j*1000000 + i*10000 + right*10 ;
            mid = (length-1)/number_of_messages*i;
            snd_buf_left[0]=test_value+1  ;
            snd_buf_left[mid]=test_value+2  ;
            snd_buf_left[length-1]=test_value+3;
            if ((rcv_buf_left[0] != snd_buf_right[0]) || (rcv_buf_left[mid] != snd_buf_right[mid]) ||
                    (rcv_buf_left[length-1] != snd_buf_right[length-1])) {
                printf("%d: j=%d, i=%d --> snd_buf_right[0,%d,%d]=(%f,%f,%f)\n",
                       my_rank, j, i, mid, length-1, snd_buf_right[0], snd_buf_right[mid], snd_buf_right[length-1]);
                printf("%d:     is not identical to rcv_buf_left[0,%d,%d]=(%f,%f,%f)\n",
                       my_rank,       mid, length-1, rcv_buf_left[0],  rcv_buf_left[mid],  rcv_buf_left[length-1]);
            }
            if ((rcv_buf_right[0] != snd_buf_left[0]) || (rcv_buf_right[mid] != snd_buf_left[mid]) ||
                    (rcv_buf_right[length-1] != snd_buf_left[length-1])) {
                printf("%d: j=%d, i=%d --> snd_buf_left[0,%d,%d]=(%f,%f,%f)\n",
                       my_rank, j, i, mid, length-1, snd_buf_left[0],  snd_buf_left[mid],  snd_buf_left[length-1]);
                printf("%d:     is not identical to rcv_buf_right[0,%d,%d]=(%f,%f,%f)\n",
                       my_rank,       mid, length-1, rcv_buf_right[0], rcv_buf_right[mid], rcv_buf_right[length-1]);
            }

        }
        finish = MPI_Wtime();

        if (my_rank == 0)
        {
            transfer_time = (finish - start) / number_of_messages;
            printf("%10i bytes %12.3f usec %13.3f MB/s\n",
                   length*(int)sizeof(float), transfer_time*1e6, 1.0e-6*2*length*sizeof(float) / transfer_time);
        }

        length = length * length_factor;
    }
    MPI_Win_free(&win_rcv_buf_left );
    MPI_Win_free(&win_rcv_buf_right);
    MPI_Free_mem(rcv_buf_left );
    MPI_Free_mem(rcv_buf_right);

    MPI_Finalize();
}
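The fence assertions used above (MPI_MODE_NOSTORE and MPI_MODE_NOPRECEDE on the opening fence; MPI_MODE_NOSTORE, MPI_MODE_NOPUT and MPI_MODE_NOSUCCEED on the closing fence) are purely optimization hints; the exchange behaves the same without them. A minimal, self-contained sketch of the same put-with-fence ring exchange, with an arbitrary buffer length N chosen only for illustration:

#include <mpi.h>
#include <stdio.h>
#define N 1024                       /* arbitrary buffer length for this sketch */

int main(int argc, char *argv[])
{
    int rank, size, right, left, i;
    float snd[N], *rcv;
    MPI_Win win;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    right = (rank + 1) % size;
    left  = (rank - 1 + size) % size;

    MPI_Alloc_mem((MPI_Aint)(N * sizeof(float)), MPI_INFO_NULL, &rcv);
    MPI_Win_create(rcv, (MPI_Aint)(N * sizeof(float)), sizeof(float),
                   MPI_INFO_NULL, MPI_COMM_WORLD, &win);

    for (i = 0; i < N; i++) snd[i] = (float)rank;

    /* Opening fence: no local stores to the window yet, no preceding RMA epoch. */
    MPI_Win_fence(MPI_MODE_NOSTORE | MPI_MODE_NOPRECEDE, win);
    MPI_Put(snd, N, MPI_FLOAT, right, (MPI_Aint)0, N, MPI_FLOAT, win);
    /* Closing fence: no local stores occurred, the local window will not be
       updated by RMA before the next fence, and no RMA epoch follows. */
    MPI_Win_fence(MPI_MODE_NOSTORE | MPI_MODE_NOPUT | MPI_MODE_NOSUCCEED, win);

    if (rcv[0] != (float)left)
        printf("%d: expected %d, got %f\n", rank, left, rcv[0]);

    MPI_Win_free(&win);
    MPI_Free_mem(rcv);
    MPI_Finalize();
    return 0;
}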
Example No. 3
int main(int argc, char **argv) {
    int           procid, nproc, i;
    MPI_Win       llist_win;
    llist_ptr_t   head_ptr, tail_ptr;

    MPI_Init(&argc, &argv);

    MPI_Comm_rank(MPI_COMM_WORLD, &procid);
    MPI_Comm_size(MPI_COMM_WORLD, &nproc);

    MPI_Win_create_dynamic(MPI_INFO_NULL, MPI_COMM_WORLD, &llist_win);

    /* Process 0 creates the head node */
    if (procid == 0)
        head_ptr.disp = alloc_elem(-1, llist_win);

    /* Broadcast the head pointer to everyone */
    head_ptr.rank = 0;
    MPI_Bcast(&head_ptr.disp, 1, MPI_AINT, 0, MPI_COMM_WORLD);
    tail_ptr = head_ptr;

    /* All processes concurrently append NUM_ELEMS elements to the list */
    for (i = 0; i < NUM_ELEMS; i++) {
        llist_ptr_t new_elem_ptr;
        int success;

        /* Create a new list element and register it with the window */
        new_elem_ptr.rank = procid;
        new_elem_ptr.disp = alloc_elem(procid, llist_win);

        /* Append the new node to the list.  This might take multiple attempts if
           others have already appended and our tail pointer is stale. */
        do {
            llist_ptr_t next_tail_ptr = nil;

            MPI_Win_lock(MPI_LOCK_EXCLUSIVE, tail_ptr.rank, 0, llist_win);

            MPI_Compare_and_swap((void*) &new_elem_ptr.rank, (void*) &nil.rank,
                                  (void*) &next_tail_ptr.rank, MPI_INT, tail_ptr.rank,
                                  (MPI_Aint) &(((llist_elem_t*)tail_ptr.disp)->next.rank), llist_win);

            MPI_Win_unlock(tail_ptr.rank, llist_win);
            success = (next_tail_ptr.rank == nil.rank);

            if (success) {
                int i, flag;

                MPI_Win_lock(MPI_LOCK_EXCLUSIVE, tail_ptr.rank, 0, llist_win);

                MPI_Put(&new_elem_ptr.disp, 1, MPI_AINT, tail_ptr.rank,
                        (MPI_Aint) &(((llist_elem_t*)tail_ptr.disp)->next.disp), 1,
                        MPI_AINT, llist_win);

                MPI_Win_unlock(tail_ptr.rank, llist_win);
                tail_ptr = new_elem_ptr;

                /* For implementations that use pt-to-pt messaging, force progress for other threads'
                   RMA operations. */
                for (i = 0; i < NPROBE; i++)
                    MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &flag, MPI_STATUS_IGNORE);

            } else {
                /* Tail pointer is stale, fetch the displacement.  May take multiple tries
                   if it is being updated. */
                do {
                    MPI_Win_lock(MPI_LOCK_EXCLUSIVE, tail_ptr.rank, 0, llist_win);

                    MPI_Get( &next_tail_ptr.disp, 1, MPI_AINT, tail_ptr.rank,
                             (MPI_Aint) &(((llist_elem_t*)tail_ptr.disp)->next.disp),
                             1, MPI_AINT, llist_win);

                    MPI_Win_unlock(tail_ptr.rank, llist_win);
                } while (next_tail_ptr.disp == nil.disp);
                tail_ptr = next_tail_ptr;
            }
        } while (!success);
    }

    MPI_Barrier(MPI_COMM_WORLD);

    /* Traverse the list and verify that all processes inserted exactly the correct
       number of elements. */
    if (procid == 0) {
        int  have_root = 0;
        int  errors    = 0;
        int *counts, count = 0;

        counts = (int*) malloc(sizeof(int) * nproc);
        assert(counts != NULL);

        for (i = 0; i < nproc; i++)
            counts[i] = 0;

        tail_ptr = head_ptr;

        /* Walk the list and tally up the number of elements inserted by each rank */
        while (tail_ptr.disp != nil.disp) {
            llist_elem_t elem;

            MPI_Win_lock(MPI_LOCK_EXCLUSIVE, tail_ptr.rank, 0, llist_win);

            MPI_Get(&elem, sizeof(llist_elem_t), MPI_BYTE,
                    tail_ptr.rank, tail_ptr.disp, sizeof(llist_elem_t), MPI_BYTE, llist_win);

            MPI_Win_unlock(tail_ptr.rank, llist_win);

            tail_ptr = elem.next;

            /* This is not the root */
            if (have_root) {
                assert(elem.value >= 0 && elem.value < nproc);
                counts[elem.value]++;
                count++;

                if (verbose) {
                    int last_elem = tail_ptr.disp == nil.disp;
                    printf("%2d%s", elem.value, last_elem ? "" : " -> ");
                    if (count % ELEM_PER_ROW == 0 && !last_elem)
                        printf("\n");
                }
            }

            /* This is the root */
            else {
                assert(elem.value == -1);
                have_root = 1;
            }
        }

        if (verbose)
          printf("\n\n");

        /* Verify the counts we collected */
        for (i = 0; i < nproc; i++) {
            int expected = NUM_ELEMS;

            if (counts[i] != expected) {
                printf("Error: Rank %d inserted %d elements, expected %d\n", i, counts[i], expected);
                errors++;
            }
        }

        printf("%s\n", errors == 0 ? " No Errors" : "FAIL");
        free(counts);
    }

    MPI_Win_free(&llist_win);

    /* Free all the elements in the list */
    for ( ; my_elems_count > 0; my_elems_count--)
        MPI_Free_mem(my_elems[my_elems_count-1]);

    MPI_Finalize();
    return 0;
}
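alloc_elem, llist_ptr_t, llist_elem_t, nil, my_elems and my_elems_count are defined outside this excerpt. A plausible sketch of the dynamic-window allocation idiom they suggest; the type layout, list capacity and nil sentinel here are assumptions, not the original definitions (the original may use MPI_BOTTOM for the sentinel displacement):

#include <mpi.h>

/* Assumed element layout for this sketch. */
typedef struct { int rank; MPI_Aint disp; } llist_ptr_t;
typedef struct { int value; llist_ptr_t next; } llist_elem_t;

static const llist_ptr_t nil = { -1, (MPI_Aint) 0 };   /* sentinel "null" pointer */

static llist_elem_t *my_elems[1024];   /* arbitrary capacity for the sketch */
static int my_elems_count = 0;

/* Allocate a list element, attach it to the dynamic window, and return the
   displacement (its address) so other processes can address it remotely. */
static MPI_Aint alloc_elem(int value, MPI_Win win)
{
    MPI_Aint disp;
    llist_elem_t *elem;

    MPI_Alloc_mem(sizeof(llist_elem_t), MPI_INFO_NULL, &elem);
    elem->value = value;
    elem->next  = nil;

    MPI_Win_attach(win, elem, sizeof(llist_elem_t));
    MPI_Get_address(elem, &disp);

    my_elems[my_elems_count++] = elem;
    return disp;
}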
Example No. 4
int main(int argc, char **argv)
{
    int provided;
    MPI_Init_thread(&argc, &argv, MPI_THREAD_SINGLE, &provided);
    assert(provided==MPI_THREAD_SINGLE);

    int me;
    int nproc;
    MPI_Comm_rank(MPI_COMM_WORLD,&me);
    MPI_Comm_size(MPI_COMM_WORLD,&nproc);

    int status;
    double t0,t1,t2,t3,t4,t5;
    double tt0,tt1,tt2,tt3,tt4;

    int bufSize = ( argc>1 ? atoi(argv[1]) : 1000000 );
    if (me==0) printf("%d: bufSize = %d doubles\n",me,bufSize);

    /* allocate RMA buffers for windows */
    double* m1;
    double* m2;
    status = MPI_Alloc_mem(bufSize * sizeof(double), MPI_INFO_NULL, &m1);
    status = MPI_Alloc_mem(bufSize * sizeof(double), MPI_INFO_NULL, &m2);

    /* register remote pointers */
    MPI_Win w1;
    MPI_Win w2;
    status = MPI_Win_create(m1, bufSize * sizeof(double), sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &w1);
    status = MPI_Win_create(m2, bufSize * sizeof(double), sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &w2);
    MPI_Barrier(MPI_COMM_WORLD);

    /* allocate RMA buffers */
    double* b1;
    double* b2;
    status = MPI_Alloc_mem(bufSize * sizeof(double), MPI_INFO_NULL, &b1);
    status = MPI_Alloc_mem(bufSize * sizeof(double), MPI_INFO_NULL, &b2);

    /* initialize buffers */
    int i;
    for (i=0;i<bufSize;i++) b1[i]=1.0*me;
    for (i=0;i<bufSize;i++) b2[i]=-1.0;

    status = MPI_Win_fence( MPI_MODE_NOPRECEDE | MPI_MODE_NOSTORE , w1 );
    status = MPI_Win_fence( MPI_MODE_NOPRECEDE | MPI_MODE_NOSTORE , w2);
    status = MPI_Put(b1, bufSize, MPI_DOUBLE, me, 0, bufSize, MPI_DOUBLE, w1);
    status = MPI_Put(b2, bufSize, MPI_DOUBLE, me, 0, bufSize, MPI_DOUBLE, w2);
    status = MPI_Win_fence( MPI_MODE_NOSTORE , w1);
    status = MPI_Win_fence( MPI_MODE_NOSTORE , w2);

    int target;
    int j;
    double dt,bw;
    MPI_Barrier(MPI_COMM_WORLD);
    if (me==0){
        printf("MPI_Get performance test for buffer size = %d doubles\n",bufSize);
        printf("  jump    host   target       get (s)       BW (MB/s)\n");
        printf("===========================================================\n");
        fflush(stdout);
    }
    MPI_Barrier(MPI_COMM_WORLD);
    for (j=0;j<nproc;j++){
        target = (me+j) % nproc;
        MPI_Barrier(MPI_COMM_WORLD);
        t0 = MPI_Wtime();
        status = MPI_Win_lock(MPI_LOCK_EXCLUSIVE, target, MPI_MODE_NOCHECK, w1);
        t1 = MPI_Wtime();
        status = MPI_Get(b2, bufSize, MPI_DOUBLE, target, 0, bufSize, MPI_DOUBLE, w1);
        t2 = MPI_Wtime();
        status = MPI_Win_unlock(target, w1);
        t3 = MPI_Wtime();
        for (i=0;i<bufSize;i++) assert( b2[i]==(1.0*target) );
        dt = t3 - t0;
        bw = (double)bufSize*sizeof(double)*(1e-6)/dt;
        printf("%4d     %4d     %4d       %9.6f     %9.3f\n",j,me,target,dt,bw);
        fflush(stdout);
    }
    MPI_Barrier(MPI_COMM_WORLD);

    status = MPI_Win_free(&w2);
    status = MPI_Win_free(&w1);

    status = MPI_Free_mem(b2);
    status = MPI_Free_mem(b1);

    status = MPI_Free_mem(m2);
    status = MPI_Free_mem(m1);

    MPI_Barrier(MPI_COMM_WORLD);

    if (me==0) printf("%d: MPI_Finalize\n",me);
    MPI_Finalize();

    return(0);
}
Example No. 5
int main(int argc, char *argv[])
{
    int i, j;
    int ch;
    extern char *optarg;
    int edge;
    int size;
    int nloop=5;
    double **ptr_loc;
    
    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &nproc);
    MPI_Comm_rank(MPI_COMM_WORLD, &me);
    
    while ((ch = getopt(argc, argv, "n:b:p:h")) != -1) {
        switch(ch) {
            case 'n': n = atoi(optarg); break;
            case 'b': block_size = atoi(optarg); break;
            case 'p': nproc = atoi(optarg); break;
            case 'h': {
                printf("Usage: LU, or \n");
        printf("       LU -nMATRIXSIZE -bBLOCKSIZE -pNPROC\n");
                MPI_Barrier(MPI_COMM_WORLD);
                MPI_Finalize();
                exit(0);
            }            
        }
    }
    
    if(me == 0) {
        printf("\n Blocked Dense LU Factorization\n");
        printf("     %d by %d Matrix\n", n, n);
        printf("     %d Processors\n", nproc);
        printf("     %d by %d Element Blocks\n", block_size, block_size);
        printf("\n");
    }

    if (n % block_size != 0) {
      if (me == 0) printf("Error n is not a multiple of block_size\n");
      MPI_Abort(MPI_COMM_WORLD, 1);
    }

    num_rows = (int) sqrt((double) nproc);
    for (;;) {
        num_cols = nproc/num_rows;
        if (num_rows*num_cols == nproc)
            break;
        num_rows--;
    }
    
    nblocks = n/block_size;
    if (block_size * nblocks != n) {
        nblocks++;
    }
    
    edge = n%block_size;
    if (edge == 0) {
        edge = block_size;
    }
    
#ifdef DEBUG
    if(me == 0)
        for (i=0;i<nblocks;i++) {
            for (j=0;j<nblocks;j++) 
                printf("%d ", block_owner(i, j));
            printf("\n");
        }
    MPI_Barrier(MPI_COMM_WORLD);
    MPI_Finalize();
    exit(0);
#endif
    
    for (i=0;i<nblocks;i++) {
        for (j=0;j<nblocks;j++) {
            if(block_owner(i,j) == me) {
                if ((i == nblocks-1) && (j == nblocks-1)) {
                    size = edge*edge;
                }
                else if ((i == nblocks-1) || (j == nblocks-1)) {
                    size = edge*block_size;
                }
                else {
                    size = block_size*block_size;
                }
                proc_bytes += size*sizeof(double);
            }
        }
    }
    
    ptr = (void **)malloc(nproc * sizeof(void *));
#ifdef MPI2_ONESIDED
    MPI_Alloc_mem(proc_bytes, MPI_INFO_NULL, &ptr[me]);
    MPI_Win_create((void*)ptr[me], proc_bytes, 1, MPI_INFO_NULL,
                   MPI_COMM_WORLD, &win);
    for(i=0; i<nproc; i++) ptr[i] = (double *)ptr[me];
    MPI_Barrier(MPI_COMM_WORLD);
    
#else
    /* initialize ARMCI */
    ARMCI_Init();
    ARMCI_Malloc(ptr, proc_bytes);
#endif
    
    a = (double **)malloc(nblocks*nblocks*sizeof(double *));
    if (a == NULL) {
        fprintf(stderr, "Could not malloc memory for a\n");
        exit(-1);
    } 
    ptr_loc = (double **)malloc(nproc*sizeof(double *));
    for(i=0; i<nproc; i++) ptr_loc[i] = (double *)ptr[i];
    for(i=0; i<nblocks; i++) {
        for(j=0; j<nblocks; j++) {
            a[i+j*nblocks] = ptr_loc[block_owner(i, j)];
            if ((i == nblocks-1) && (j == nblocks-1)) {
                size = edge*edge;
            } else if ((i == nblocks-1) || (j == nblocks-1)) {
                size = edge*block_size;
            } else {
                size = block_size*block_size;
            }
            ptr_loc[block_owner(i, j)] += size;
        }
    }
    
    /* initialize the array */
    init_array();
    
    /* barrier to ensure all initialization is done */
    MPI_Barrier(MPI_COMM_WORLD);

    /* to remove cold-start misses, all processors touch their own data */
    touch_array(block_size, me);
    MPI_Barrier(MPI_COMM_WORLD);

    if(doprint) {
        if(me == 0) {
            printf("Matrix before LU decomposition\n");
            print_array(me); 
        }
        MPI_Barrier(MPI_COMM_WORLD);
    }
    
    lu(n, block_size, me); /* cold start */

    /* Starting the timer */

    MPI_Barrier(MPI_COMM_WORLD);
    if(me == 0) start_timer();
    for(i=0; i<nloop; i++) lu(n, block_size, me);    
    MPI_Barrier(MPI_COMM_WORLD);

    /* Timer Stops here */
    if(me == 0) 
        printf("\nRunning time = %lf milliseconds.\n\n",  elapsed_time()/nloop);
    printf("%d: (ngets=%d) Communication (get) time = %e milliseconds\n", me, get_cntr, comm_time*1000/nloop);
    
    if(doprint) {        
        if(me == 0) {
            printf("after LU\n");
            print_array(me);
        }
        MPI_Barrier(MPI_COMM_WORLD);
    }
    
    /* done */
#ifdef MPI2_ONESIDED
    MPI_Win_free(&win);
    MPI_Free_mem(ptr[me]);
#else
    ARMCI_Free(ptr[me]);
    ARMCI_Finalize();
#endif
    MPI_Finalize();

    return 0;
}
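block_owner, together with globals such as nproc, me, n, block_size, nblocks, num_rows, num_cols, proc_bytes, ptr, a and win, is defined outside this excerpt. One common 2D-cyclic mapping that is consistent with the num_rows x num_cols factorization computed above would be the following; this is an illustrative assumption, not necessarily the mapping used here:

extern int num_rows, num_cols;   /* process-grid dimensions computed in main() */

/* Sketch: map block (i, j) onto the num_rows x num_cols process grid cyclically. */
static int block_owner(int i, int j)
{
    return (i % num_rows) * num_cols + (j % num_cols);
}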
Example No. 6
/* This BFS represents its queues as bitmaps and uses some data representation
 * tricks to fit with the use of MPI one-sided operations.  It is not much
 * faster than the standard version on the machines I have tested it on, but
 * systems that have good RDMA hardware and good MPI one-sided implementations
 * might get better performance from it.  This code might also be good to
 * translate to UPC, Co-array Fortran, SHMEM, or GASNet since those systems are
 * more designed for one-sided remote memory operations. */
void run_mpi_bfs(const csr_graph* const g, int64_t root, int64_t* pred, int64_t* nvisited) {
  const size_t nlocalverts = g->nlocalverts;
  const int64_t nglobalverts = g->nglobalverts;
  int64_t nvisited_local = 0;

  /* Set up a second predecessor map so we can read from one and modify the
   * other. */
  int64_t* orig_pred = pred;
  int64_t* pred2 = (int64_t*)xMPI_Alloc_mem(nlocalverts * sizeof(int64_t));

  /* The queues (old and new) are represented as bitmaps.  Each bit in the
   * queue bitmap says to check elts_per_queue_bit elements in the predecessor
   * map for vertices that need to be visited.  In other words, the queue
   * bitmap is an overapproximation of the actual queue; because MPI_Accumulate
   * does not get any information on the result of the update, sometimes
   * elements are also added to the bitmap when they were actually already
   * black.  Because of this, the predecessor map needs to be checked to be
   * sure a given vertex actually needs to be processed. */
  const int elts_per_queue_bit = 4;
  const int ulong_bits = sizeof(unsigned long) * CHAR_BIT;
  int64_t queue_nbits = (nlocalverts + elts_per_queue_bit - 1) / elts_per_queue_bit;
  int64_t queue_nwords = (queue_nbits + ulong_bits - 1) / ulong_bits;
  unsigned long* queue_bitmap1 = (unsigned long*)xMPI_Alloc_mem(queue_nwords * sizeof(unsigned long));
  unsigned long* queue_bitmap2 = (unsigned long*)xMPI_Alloc_mem(queue_nwords * sizeof(unsigned long));
  memset(queue_bitmap1, 0, queue_nwords * sizeof(unsigned long));

  /* List of local vertices (used as sources in MPI_Accumulate). */
  int64_t* local_vertices = (int64_t*)xMPI_Alloc_mem(nlocalverts * sizeof(int64_t));
  {size_t i; for (i = 0; i < nlocalverts; ++i) local_vertices[i] = VERTEX_TO_GLOBAL(i);}

  /* List of all bit masks for an unsigned long (used as sources in
   * MPI_Accumulate). */
  unsigned long masks[ulong_bits];
  {int i; for (i = 0; i < ulong_bits; ++i) masks[i] = (1UL << i);}

  /* Coding of predecessor map: */
  /* - White (not visited): INT64_MAX */
  /* - Grey (in queue): 0 .. nglobalverts-1 */
  /* - Black (done): -nglobalverts .. -1 */

  /* Set initial predecessor map. */
  {size_t i; for (i = 0; i < nlocalverts; ++i) pred[i] = INT64_MAX;}

  /* Mark root as grey and add it to the queue. */
  if (VERTEX_OWNER(root) == rank) {
    pred[VERTEX_LOCAL(root)] = root;
    queue_bitmap1[VERTEX_LOCAL(root) / elts_per_queue_bit / ulong_bits] |= (1UL << ((VERTEX_LOCAL(root) / elts_per_queue_bit) % ulong_bits));
  }

  /* Create MPI windows on the two predecessor arrays and the two queues. */
  MPI_Win pred_win, pred2_win, queue1_win, queue2_win;
  MPI_Win_create(pred, nlocalverts * sizeof(int64_t), sizeof(int64_t), MPI_INFO_NULL, MPI_COMM_WORLD, &pred_win);
  MPI_Win_create(pred2, nlocalverts * sizeof(int64_t), sizeof(int64_t), MPI_INFO_NULL, MPI_COMM_WORLD, &pred2_win);
  MPI_Win_create(queue_bitmap1, queue_nwords * sizeof(unsigned long), sizeof(unsigned long), MPI_INFO_NULL, MPI_COMM_WORLD, &queue1_win);
  MPI_Win_create(queue_bitmap2, queue_nwords * sizeof(unsigned long), sizeof(unsigned long), MPI_INFO_NULL, MPI_COMM_WORLD, &queue2_win);

  while (1) {
    int64_t i;
    /* Clear the next-level queue. */
    memset(queue_bitmap2, 0, queue_nwords * sizeof(unsigned long));

    /* The pred2 array is pred with all grey vertices changed to black. */
    memcpy(pred2, pred, nlocalverts * sizeof(int64_t));
    for (i = 0; i < (int64_t)nlocalverts; ++i) {
      if (pred2[i] >= 0 && pred2[i] < nglobalverts) pred2[i] -= nglobalverts;
    }

    /* Start one-sided operations for this level. */
    MPI_Win_fence(MPI_MODE_NOPRECEDE, pred2_win);
    MPI_Win_fence(MPI_MODE_NOPRECEDE, queue2_win);

    /* Step through the words of the queue bitmap. */
    for (i = 0; i < queue_nwords; ++i) {
      unsigned long val = queue_bitmap1[i];
      int bitnum;
      /* Skip any that are all zero. */
      if (!val) continue;
      /* Scan the bits in the word. */
      for (bitnum = 0; bitnum < ulong_bits; ++bitnum) {
        size_t first_v_local = (size_t)((i * ulong_bits + bitnum) * elts_per_queue_bit);
        if (first_v_local >= nlocalverts) break;
        int bit = (int)((val >> bitnum) & 1);
        /* Skip any that are zero. */
        if (!bit) continue;
        /* Scan the queue elements corresponding to this bit. */
        int qelem_idx;
        for (qelem_idx = 0; qelem_idx < elts_per_queue_bit; ++qelem_idx) {
          size_t v_local = first_v_local + qelem_idx;
          if (v_local >= nlocalverts) continue;
          /* Since the queue is an overapproximation, check the predecessor map
           * to be sure this vertex is grey. */
          if (pred[v_local] >= 0 && pred[v_local] < nglobalverts) {
            ++nvisited_local;
            size_t ei, ei_end = g->rowstarts[v_local + 1];
            /* Walk the incident edges. */
            for (ei = g->rowstarts[v_local]; ei < ei_end; ++ei) {
              int64_t w = g->column[ei];
              if (w == VERTEX_TO_GLOBAL(v_local)) continue; /* Self-loop */
              /* Set the predecessor of the other edge endpoint (note use of
               * MPI_MIN and the coding of the predecessor map). */
              MPI_Accumulate(&local_vertices[v_local], 1, INT64_T_MPI_TYPE, VERTEX_OWNER(w), VERTEX_LOCAL(w), 1, INT64_T_MPI_TYPE, MPI_MIN, pred2_win);
              /* Mark the endpoint in the remote queue (note that the min may
               * not do an update, so the queue is an overapproximation in this
               * way as well). */
              MPI_Accumulate(&masks[((VERTEX_LOCAL(w) / elts_per_queue_bit) % ulong_bits)], 1, MPI_UNSIGNED_LONG, VERTEX_OWNER(w), VERTEX_LOCAL(w) / elts_per_queue_bit / ulong_bits, 1, MPI_UNSIGNED_LONG, MPI_BOR, queue2_win);
            }
          }
        }
      }
    }
    /* End one-sided operations. */
    MPI_Win_fence(MPI_MODE_NOSUCCEED, queue2_win);
    MPI_Win_fence(MPI_MODE_NOSUCCEED, pred2_win);

    /* Test if there are any elements in the next-level queue (globally); stop
     * if none. */
    int any_set = 0;
    for (i = 0; i < queue_nwords; ++i) {
      if (queue_bitmap2[i] != 0) {any_set = 1; break;}
    }
    MPI_Allreduce(MPI_IN_PLACE, &any_set, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);
    if (!any_set) break;

    /* Swap queues and predecessor maps. */
    {MPI_Win temp = queue1_win; queue1_win = queue2_win; queue2_win = temp;}
    {unsigned long* temp = queue_bitmap1; queue_bitmap1 = queue_bitmap2; queue_bitmap2 = temp;}
    {MPI_Win temp = pred_win; pred_win = pred2_win; pred2_win = temp;}
    {int64_t* temp = pred; pred = pred2; pred2 = temp;}
  }
  MPI_Win_free(&pred_win);
  MPI_Win_free(&pred2_win);
  MPI_Win_free(&queue1_win);
  MPI_Win_free(&queue2_win);
  MPI_Free_mem(local_vertices);
  MPI_Free_mem(queue_bitmap1);
  MPI_Free_mem(queue_bitmap2);

  /* Clean up the predecessor map swapping since the surrounding code does not
   * allow the BFS to change the predecessor map pointer. */
  if (pred2 != orig_pred) {
    memcpy(orig_pred, pred2, nlocalverts * sizeof(int64_t));
    MPI_Free_mem(pred2);
  } else {
    MPI_Free_mem(pred);
  }

  /* Change from special coding of predecessor map to the one the benchmark
   * requires. */
  size_t i;
  for (i = 0; i < nlocalverts; ++i) {
    if (orig_pred[i] < 0) {
      orig_pred[i] += nglobalverts;
    } else if (orig_pred[i] == INT64_MAX) {
      orig_pred[i] = -1;
    }
  }

  /* Count visited vertices. */
  MPI_Allreduce(MPI_IN_PLACE, &nvisited_local, 1, INT64_T_MPI_TYPE, MPI_SUM, MPI_COMM_WORLD);
  *nvisited = nvisited_local;
}
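Small helpers that simply spell out the predecessor-map colour coding described in the comments above (white = INT64_MAX, grey = 0 .. nglobalverts-1, black = -nglobalverts .. -1); they are illustrative only and not part of the original code:

#include <stdint.h>

/* Colour tests matching the coding used by run_mpi_bfs. */
static inline int pred_is_white(int64_t p)                       { return p == INT64_MAX; }           /* not yet visited */
static inline int pred_is_grey (int64_t p, int64_t nglobalverts) { return p >= 0 && p < nglobalverts; } /* in the queue   */
static inline int pred_is_black(int64_t p, int64_t nglobalverts) { return p >= -nglobalverts && p < 0; } /* already done  */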
Example No. 7
int main(int argc, char **argv)
{
    FILE    *fp, *fp2;
    char    testName[32] = "MPI_Rget", file1[64], file2[64];
    int     dblSize, proc, nprocs, npairs, partner;
    unsigned int i, j, k, size, localSize, NLOOP = NLOOP_MAX;
    unsigned int smin = MIN_P2P_SIZE, smed = MED_P2P_SIZE, smax = MAX_P2P_SIZE;
    double  tScale = USEC, bwScale = MB_8;
    double  tStart, timeMin, timeMinGlobal, overhead, threshold_lo, threshold_hi;
    double  msgBytes, sizeBytes, localMax, UsedMem;
    double  tElapsed[NREPS], tElapsedGlobal[NREPS];
    double  *A, *B;
    MPI_Win     win;
    MPI_Status  stat;
    MPI_Request req;

    // Initialize parallel environment
    MPI_Init(&argc, &argv);
    MPI_Comm_size( MPI_COMM_WORLD, &nprocs );
    MPI_Comm_rank( MPI_COMM_WORLD, &proc );

    // Test input parameters
    if( nprocs%2 != 0 && proc == 0 )
        fatalError( "P2P test requires an even number of processors" );

    // Check for user defined limits
    checkEnvP2P( proc, &NLOOP, &smin, &smed, &smax );

    // Initialize local variables
    localMax = 0.0;
    npairs   = nprocs/2;
    if( proc < npairs  ) partner = proc + npairs;
    if( proc >= npairs ) partner = proc - npairs;
    UsedMem = (double)smax*(double)sizeof(double)*2.0;

    // Allocate and initialize arrays
    srand( SEED );
    A = doubleVector( smax );
    B = doubleVector( smax );

    // Open output file and write header
    if( proc == 0 ){
        // Check timer overhead in seconds
        timerTest( &overhead, &threshold_lo, &threshold_hi );
        // Open output files and write headers
        sprintf( file1, "rget_time-np_%.4d.dat", nprocs );
        sprintf( file2, "rget_bw-np_%.4d.dat",   nprocs );
        fp  = fopen( file1, "a" );
        fp2 = fopen( file2, "a" );
        printHeaders( fp, fp2, testName, UsedMem, overhead, threshold_lo );
    }

    // Get type size
    MPI_Type_size( MPI_DOUBLE, &dblSize );
    // Set up a window for RMA
    MPI_Win_create( A, smax*dblSize, dblSize, MPI_INFO_NULL, MPI_COMM_WORLD, &win );
    MPI_Win_lock_all( 0, win );
    
    //================================================================
    // Single loop with minimum size to verify that inner loop length  
    // is long enough for the timings to be accurate                     
    //================================================================
    // Warmup with a medium size message
    if( proc < npairs ){
        MPI_Rget( B, smed, MPI_DOUBLE, partner, 0, smed, MPI_DOUBLE, win, &req );
        MPI_Wait( &req, &stat );
        MPI_Win_flush_all( win );
    }
    
    // Test if current NLOOP is enough to capture fastest test cases
    MPI_Barrier( MPI_COMM_WORLD );
    tStart = benchTimer();
    if( proc < npairs ){
        for(j = 0; j < NLOOP; j++){
        	MPI_Rget( B, smin, MPI_DOUBLE, partner, 0, smin, MPI_DOUBLE, win, &req );
        	MPI_Wait( &req, &stat );
        	MPI_Win_flush_all( win );
        }
    }
    timeMin = benchTimer() - tStart;
    MPI_Reduce( &timeMin, &timeMinGlobal, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD );
    if( proc == 0 ) resetInnerLoop( timeMinGlobal, threshold_lo, &NLOOP );
    MPI_Bcast( &NLOOP, 1, MPI_INT, 0, MPI_COMM_WORLD );


    //================================================================
    // Execute test for each requested size                  
    //================================================================
    for( size = smin; size <= smax; size = size*2 ){

        // Warmup with a medium size message
        if( proc < npairs ){
            MPI_Rget( B, smed, MPI_DOUBLE, partner, 0, smed, MPI_DOUBLE, win, &req );
            MPI_Wait( &req, &stat );
            MPI_Win_flush_all( win );
        }

        // Repeat NREPS to collect statistics
        for(i = 0; i < NREPS; i++){
            MPI_Barrier( MPI_COMM_WORLD );
            tStart = benchTimer();
            if( proc < npairs ){
                for(j = 0; j < NLOOP; j++){
        	        MPI_Rget( B, size, MPI_DOUBLE, partner, 0, size, MPI_DOUBLE, win, &req );
        	        MPI_Wait( &req, &stat );
        	        MPI_Win_flush_all( win );
                }
            }
        	tElapsed[i] = benchTimer() - tStart;
        }
        MPI_Reduce( tElapsed, tElapsedGlobal, NREPS, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD );
        // Only task 0 needs to do the analysis of the collected data
        if( proc == 0 ){
            // sizeBytes is size to write to file
            // msgBytes is actual data exchanged on the wire
            msgBytes  = (double)size*(double)npairs*(double)dblSize;
            sizeBytes = (double)size*(double)dblSize;
            post_process( fp, fp2, threshold_hi, tElapsedGlobal, tScale, 
                          bwScale, size*dblSize, sizeBytes, msgBytes, &NLOOP, 
                          &localMax, &localSize );
        }
        MPI_Bcast( &NLOOP, 1, MPI_INT, 0, MPI_COMM_WORLD );

    }
    MPI_Win_unlock_all( win );
    
    MPI_Win_free( &win );
    MPI_Barrier( MPI_COMM_WORLD );
    free( A );
    free( B );

    //================================================================
    // Print completion message, free memory and exit                  
    //================================================================
    if( proc == 0 ){
        printSummary( fp2, testName, localMax, localSize );
        fclose( fp2 ); 
        fclose( fp );
    }

    MPI_Finalize();
    return 0;
}
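benchTimer, timerTest, resetInnerLoop, post_process, doubleVector and the *_P2P_SIZE / NLOOP_MAX / NREPS constants belong to the surrounding benchmark harness and are not shown. The calibration step above simply grows NLOOP until the timed inner loop stays above the reliable timing threshold; a plausible sketch of such a helper, purely an assumption about its behaviour:

/* Sketch: if the measured time for NLOOP iterations is below the reliable
   timing threshold, scale NLOOP up so the extrapolated time exceeds it. */
static void reset_inner_loop_sketch(double measured, double threshold_lo, unsigned int *NLOOP)
{
    if (measured > 0.0 && measured < threshold_lo)
        *NLOOP = (unsigned int)((double)(*NLOOP) * threshold_lo / measured) + 1;
}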
Example No. 8
int main(int argc, char *argv[])
{
    int errs = 0;
    MPI_Win win;
    int *rmabuffer = 0, *getbuf = 0;
    MPI_Aint bufsize = 0, getbufsize = 0;
    int master, partner, next, wrank, wsize, i;
    int ntest = LAST_TEST;
    int *srcbuf;

    MTest_Init(&argc, &argv);

    /* Determine who is responsible for each part of the test */
    MPI_Comm_rank(MPI_COMM_WORLD, &wrank);
    MPI_Comm_size(MPI_COMM_WORLD, &wsize);
    if (wsize < 3) {
        fprintf(stderr, "This test requires at least 3 processes\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
    }

    master = 0;
    partner = 1;
    next = wrank + 1;
    if (next == partner)
        next++;
    if (next >= wsize) {
        next = 0;
        if (next == partner)
            next++;
    }

    /* Determine the last test to run (by default, run them all) */
    for (i = 1; i < argc; i++) {
        if (strcmp("-ntest", argv[i]) == 0) {
            i++;
            if (i < argc) {
                ntest = atoi(argv[i]);
            } else {
                fprintf(stderr, "Missing value for -ntest\n");
                MPI_Abort(MPI_COMM_WORLD, 1);
            }
        }
    }

    MPI_Type_vector(veccount, 1, stride, MPI_INT, &vectype);
    MPI_Type_commit(&vectype);

    /* Create the RMA window */
    bufsize = 0;
    if (wrank == master) {
        bufsize = RMA_SIZE;
        MPI_Alloc_mem(bufsize * sizeof(int), MPI_INFO_NULL, &rmabuffer);
    } else if (wrank == partner) {
        getbufsize = RMA_SIZE;
        getbuf = (int *) malloc(getbufsize * sizeof(int));
        if (!getbuf) {
            fprintf(stderr, "Unable to allocated %d bytes for getbuf\n", (int) getbufsize);
            MPI_Abort(MPI_COMM_WORLD, 1);
        }
    }
    srcbuf = malloc(RMA_SIZE * sizeof(*srcbuf));
    assert(srcbuf);

    MPI_Win_create(rmabuffer, bufsize * sizeof(int), sizeof(int), MPI_INFO_NULL,
                   MPI_COMM_WORLD, &win);

    /* Run a sequence of tests */
    for (i = 0; i <= ntest; i++) {
        if (wrank == master) {
            MTestPrintfMsg(0, "Test %d\n", i);
            /* Because this lock is local, it must return only when the
             * lock is acquired */
            MPI_Win_lock(MPI_LOCK_EXCLUSIVE, master, 0, win);
            RMATestInit(i, rmabuffer, bufsize);
            MPI_Send(MPI_BOTTOM, 0, MPI_INT, partner, i, MPI_COMM_WORLD);
            MPI_Send(MPI_BOTTOM, 0, MPI_INT, next, i, MPI_COMM_WORLD);
            MPI_Recv(MPI_BOTTOM, 0, MPI_INT, MPI_ANY_SOURCE, i, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
            MPI_Win_unlock(master, win);
            MPI_Recv(MPI_BOTTOM, 0, MPI_INT, partner, i, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
            errs += RMACheck(i, rmabuffer, bufsize);
        } else if (wrank == partner) {
            MPI_Recv(MPI_BOTTOM, 0, MPI_INT, master, i, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
            MPI_Win_lock(MPI_LOCK_EXCLUSIVE, master, 0, win);
            RMATest(i, win, master, srcbuf, RMA_SIZE, getbuf, getbufsize);
            MPI_Win_unlock(master, win);
            errs += RMACheckGet(i, win, getbuf, getbufsize);
            MPI_Send(MPI_BOTTOM, 0, MPI_INT, master, i, MPI_COMM_WORLD);
        } else {
            MPI_Recv(MPI_BOTTOM, 0, MPI_INT, MPI_ANY_SOURCE, i, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
            MPI_Send(MPI_BOTTOM, 0, MPI_INT, next, i, MPI_COMM_WORLD);
        }
    }

    if (rmabuffer) {
        MPI_Free_mem(rmabuffer);
    }
    if (getbuf) {
        free(getbuf);
    }
    free(srcbuf);
    MPI_Win_free(&win);
    MPI_Type_free(&vectype);

    MTest_Finalize(errs);
    return MTestReturnValue(errs);
}
Example No. 9
static inline int test(MPI_Comm comm, int rank, int source, int dest,
                       MTestDatatype * sendtype, MTestDatatype * recvtype)
{
    int errs = 0, err;
    MPI_Aint extent, lb;
    MPI_Win win;

    MTestPrintfMsg(1,
                   "Putting count = %ld of sendtype %s - count = %ld receive type %s\n",
                   sendtype->count, MTestGetDatatypeName(sendtype), recvtype->count,
                   MTestGetDatatypeName(recvtype));

    /* Make sure that everyone has a recv buffer */
    recvtype->InitBuf(recvtype);
    MPI_Type_extent(recvtype->datatype, &extent);
    MPI_Type_lb(recvtype->datatype, &lb);
    MPI_Win_create(recvtype->buf, recvtype->count * extent + lb, extent, MPI_INFO_NULL, comm, &win);
    MPI_Win_fence(0, win);
    if (rank == source) {
        /* To improve reporting of problems about operations, we
         * change the error handler to errors return */
        MPI_Win_set_errhandler(win, MPI_ERRORS_RETURN);

        sendtype->InitBuf(sendtype);

        err = MPI_Put(sendtype->buf, sendtype->count,
                      sendtype->datatype, dest, 0, recvtype->count, recvtype->datatype, win);
        if (err) {
            errs++;
            if (errs < 10) {
                MTestPrintError(err);
            }
        }
        err = MPI_Win_fence(0, win);
        if (err) {
            errs++;
            if (errs < 10) {
                MTestPrintError(err);
            }
        }
    }
    else if (rank == dest) {
        MPI_Win_fence(0, win);
        /* This should have the same effect, in terms of
         * transfering data, as a send/recv pair */
        err = MTestCheckRecv(0, recvtype);
        if (err) {
            if (errs < 10) {
                printf
                    ("Data in target buffer did not match for destination datatype %s (put with source datatype %s)\n",
                     MTestGetDatatypeName(recvtype), MTestGetDatatypeName(sendtype));
                /* Redo the test, with the errors printed */
                recvtype->printErrors = 1;
                (void) MTestCheckRecv(0, recvtype);
            }
            errs += err;
        }
    }
    else {
        MPI_Win_fence(0, win);
    }
    MPI_Win_free(&win);

    return errs;
}
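MPI_Type_extent and MPI_Type_lb are MPI-1 calls that were removed in MPI-3.0; on current implementations the equivalent of the two calls above is obtained with MPI_Type_get_extent, e.g.:

MPI_Aint lb, extent;
MPI_Type_get_extent(recvtype->datatype, &lb, &extent);
/* recvtype->count * extent + lb is then used as the window size, as above. */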
Example No. 10
int main(int argc, char **argv)
{
    int provided;
    int rank;
    int size;
    int status;

    double t0, t1, t2, t3, t4, t5;

    int i, j, k;

    int bufPow, bufSize;
    int msgPow, msgSize;

    double* m1;
    double* b1;
    double* b2;
    MPI_Win w1;

    int target;
    double dt, bw;

    MPI_Init_thread(&argc, &argv, MPI_THREAD_SINGLE, &provided);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    bufPow = (argc > 1 ? atoi(argv[1]) : 25);
    bufSize = pow(2,bufPow);
    if (rank == 0) printf("%d: bufSize = %d doubles\n", rank, bufSize);

    /* allocate RMA buffers for windows */

    status = MPI_Alloc_mem(bufSize * sizeof(double), MPI_INFO_NULL, &m1);
    assert(status==MPI_SUCCESS);

    for (i = 0; i < bufSize; i++)
    {
        m1[i] = (double)0;
    }

    MPI_Barrier(MPI_COMM_WORLD);

    /* register remote pointers */

    status = MPI_Win_create(m1,
                            bufSize * sizeof(double),
                            sizeof(double),
                            MPI_INFO_NULL,
                            MPI_COMM_WORLD,
                            &w1);
    assert(status==MPI_SUCCESS);

    MPI_Barrier(MPI_COMM_WORLD);

    /* allocate RMA buffers */
    status = MPI_Alloc_mem(bufSize * sizeof(double), MPI_INFO_NULL, &b1);
    assert(status==MPI_SUCCESS);

    status = MPI_Alloc_mem(bufSize * sizeof(double), MPI_INFO_NULL, &b2);
    assert(status==MPI_SUCCESS);

    for (k = 0; k < bufSize; k++)
    {
        b2[k] = (double)rank;
    }

    MPI_Barrier(MPI_COMM_WORLD);

    /* begin test */

    if (rank == 0)
    {
        printf("MPI_Get performance test for buffer size = %d doubles\n",
               bufSize);
        printf("host      target       msg. size (doubles)     get (sec)     BW (MB/s)\n");
        printf("======================================================================\n");
        fflush(stdout);

        for (i = 1; i < bufPow; i++)
        {
            msgPow = i;
            msgSize = pow(2,msgPow);

            for (j = 1; j < size; j++)
            {
                target = j;

                for (k = 0; k < msgSize; k++)
                {
                    b1[k] = -1.0*rank;
                }

                /* this communication is just to initialize the remote buffer */

                status = MPI_Win_lock(MPI_LOCK_EXCLUSIVE,
                                      target,
                                      MPI_MODE_NOCHECK,
                                      w1);
                assert(status==MPI_SUCCESS);

                status = MPI_Put(b1,
                                 msgSize,
                                 MPI_DOUBLE,
                                 target,
                                 0,
                                 msgSize,
                                 MPI_DOUBLE,
                                 w1);
                assert(status==MPI_SUCCESS);

                status = MPI_Win_unlock(target, w1);
                assert(status==MPI_SUCCESS);

                /* this is the real communication to time */

                t0 = MPI_Wtime();

                status = MPI_Win_lock(MPI_LOCK_EXCLUSIVE,
                                      target,
                                      MPI_MODE_NOCHECK,
                                      w1);
                assert(status==MPI_SUCCESS);

                t1 = MPI_Wtime();

                status = MPI_Accumulate(b2,
                                 msgSize,
                                 MPI_DOUBLE,
                                 target,
                                 0,
                                 msgSize,
                                 MPI_DOUBLE,
                                 MPI_SUM,
                                 w1);
                assert(status==MPI_SUCCESS);

                t2 = MPI_Wtime();

                status = MPI_Win_unlock(target, w1);
                assert(status==MPI_SUCCESS);

                t3 = MPI_Wtime();

                /* this communication is just to verify the remote buffer */

                status = MPI_Win_lock(MPI_LOCK_EXCLUSIVE,
                                      target,
                                      MPI_MODE_NOCHECK,
                                      w1);
                assert(status==MPI_SUCCESS);

                status = MPI_Get(b1,
                                 msgSize,
                                 MPI_DOUBLE,
                                 target,
                                 0,
                                 msgSize,
                                 MPI_DOUBLE,
                                 w1);
                assert(status==MPI_SUCCESS);

                status = MPI_Win_unlock(target, w1);
                assert(status==MPI_SUCCESS);

                for (k = 0; k < msgSize; k++)
                {
                    assert( b1[k]==0.0 );
                }

                dt = t3 - t0;
                bw = (double) msgSize * sizeof(double) * (1e-6) / dt;

                printf("%4d     %4d     %4d       %9.6f     %9.3f\n", rank, target, msgSize, dt, bw);
                fflush(stdout);

            }
            printf("======================================================================\n");
            fflush(stdout);
        }
    }
    MPI_Barrier(MPI_COMM_WORLD);

    status = MPI_Win_free(&w1);
    assert(status==MPI_SUCCESS);

    status = MPI_Free_mem(b2);
    assert(status==MPI_SUCCESS);

    status = MPI_Free_mem(b1);
    assert(status==MPI_SUCCESS);

    status = MPI_Free_mem(m1);
    assert(status==MPI_SUCCESS);

    MPI_Barrier(MPI_COMM_WORLD);

    if (rank == 0) printf("%d: MPI_Finalize\n", rank);
    MPI_Finalize();

    return (0);
}
Example No. 11
int main(int argc, char **argv) {
    int i, j, rank, nranks, peer, bufsize, errors;
    double *win_buf, *loc_buf;
    MPI_Win buf_win;

    MPI_Aint idx_loc[SUB_YDIM];
    int idx_rem[SUB_YDIM];
    int blk_len[SUB_YDIM];
    MPI_Datatype loc_type, rem_type;

    MPI_Init(&argc, &argv);

    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nranks);

    bufsize = XDIM * YDIM * sizeof(double);
    MPI_Alloc_mem(bufsize, MPI_INFO_NULL, &win_buf);
    MPI_Alloc_mem(bufsize, MPI_INFO_NULL, &loc_buf);

    if (rank == 0)
        if (verbose) printf("MPI RMA Strided Get Test:\n");

    for (i = 0; i < XDIM*YDIM; i++)
        *(win_buf + i) = 1.0 + rank;

    MPI_Win_create(win_buf, bufsize, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &buf_win);

    peer = (rank+1) % nranks;

    /* Build the datatype */

    for (i = 0; i < SUB_YDIM; i++) {
      MPI_Get_address(&loc_buf[i*XDIM], &idx_loc[i]);
      idx_rem[i] = i*XDIM;
      blk_len[i] = SUB_XDIM;
    }

    MPI_Type_indexed(SUB_YDIM, blk_len, idx_rem, MPI_DOUBLE, &loc_type);
    MPI_Type_indexed(SUB_YDIM, blk_len, idx_rem, MPI_DOUBLE, &rem_type);

    MPI_Type_commit(&loc_type);
    MPI_Type_commit(&rem_type);

    /* Perform get operation */

    MPI_Win_lock(MPI_LOCK_EXCLUSIVE, peer, 0, buf_win);

    MPI_Get(loc_buf, 1, loc_type, peer, 0, 1, rem_type, buf_win);

    /* Use the datatype only on the remote side (must have SUB_XDIM == XDIM) */
    /* MPI_Get(loc_buf, SUB_XDIM*SUB_YDIM, MPI_DOUBLE, peer, 0, 1, rem_type, buf_win); */

    MPI_Win_unlock(peer, buf_win);

    MPI_Type_free(&loc_type);
    MPI_Type_free(&rem_type);

    MPI_Barrier(MPI_COMM_WORLD);

    /* Verify that the results are correct */

    errors = 0;
    for (i = 0; i < SUB_XDIM; i++) {
      for (j = 0; j < SUB_YDIM; j++) {
        const double actual   = *(loc_buf + i + j*XDIM);
        const double expected = (1.0 + peer);
        if (actual - expected > 1e-10) {
          printf("%d: Data validation failed at [%d, %d] expected=%f actual=%f\n",
              rank, j, i, expected, actual);
          errors++;
          fflush(stdout);
        }
      }
    }
    for (i = SUB_XDIM; i < XDIM; i++) {
      for (j = 0; j < SUB_YDIM; j++) {
        const double actual   = *(loc_buf + i + j*XDIM);
        const double expected = 1.0 + rank;
        if (actual - expected > 1e-10) {
          printf("%d: Data validation failed at [%d, %d] expected=%f actual=%f\n",
              rank, j, i, expected, actual);
          errors++;
          fflush(stdout);
        }
      }
    }
    for (i = 0; i < XDIM; i++) {
      for (j = SUB_YDIM; j < YDIM; j++) {
        const double actual   = *(loc_buf + i + j*XDIM);
        const double expected = 1.0 + rank;
        if (actual - expected > 1e-10) {
          printf("%d: Data validation failed at [%d, %d] expected=%f actual=%f\n",
              rank, j, i, expected, actual);
          errors++;
          fflush(stdout);
        }
      }
    }

    MPI_Win_free(&buf_win);
    MPI_Free_mem(win_buf);
    MPI_Free_mem(loc_buf);

    MPI_Finalize();

    if (errors == 0) {
      if (rank == 0) 
        printf(" No Errors\n");
      return 0;
    } else {
      printf("%d: Fail\n", rank);
      return 1;
    }
}
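Because the fetched sub-block has a constant block length (SUB_XDIM) and stride (XDIM), the same layout could equally be described with MPI_Type_vector instead of MPI_Type_indexed; an equivalent sketch, not part of the original test:

MPI_Datatype sub_type;
/* SUB_YDIM blocks of SUB_XDIM doubles, separated by a stride of XDIM doubles. */
MPI_Type_vector(SUB_YDIM, SUB_XDIM, XDIM, MPI_DOUBLE, &sub_type);
MPI_Type_commit(&sub_type);
/* ... could serve as both loc_type and rem_type in the MPI_Get above ... */
MPI_Type_free(&sub_type);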
Example No. 12
int main(int argc, char *argv[])
{
    int rank, nprocs, i, j, k;
    int errs = 0;
    MPI_Datatype types[4];

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    if (nprocs < 2) {
        printf("Run this program with 2 or more processes\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
    }

    /* types[0] is of zero size.  Everything else is non-zero size */
    MPI_Type_contiguous(0, MPI_INT, &types[0]);
    MPI_Type_commit(&types[0]);

    MPI_Type_contiguous(1, MPI_INT, &types[1]);
    MPI_Type_commit(&types[1]);

    types[2] = MPI_INT;
    types[3] = MPI_DOUBLE;

    MPI_Win_create(win_buf, SIZE * sizeof(int), sizeof(int), MPI_INFO_NULL, MPI_COMM_WORLD, &win);

    MPI_Win_fence(0, win);

    if (rank == 0) {
        /* zero-count */
        for (i = 0; i < 4; i++)
            for (j = 0; j < 4; j++)
                for (k = 0; k < 4; k++)
                    do_test(0, types[i], 0, types[j], 0, types[k]);

        /* single zero-size datatype, but non-zero count */
        for (i = 1; i < 4; i++) {
            for (j = 1; j < 4; j++) {
                do_test(1, types[0], 0, types[i], 0, types[j]);
                do_test(0, types[i], 1, types[0], 0, types[j]);
                do_test(0, types[i], 0, types[j], 1, types[0]);
            }
        }

        /* two zero-size datatypes, but non-zero count */
        for (i = 1; i < 4; i++) {
            do_test(1, types[0], 1, types[0], 0, types[i]);
            do_test(1, types[0], 0, types[i], 1, types[0]);
            do_test(0, types[i], 1, types[0], 1, types[0]);

            do_test(1, types[0], 2, types[0], 0, types[i]);
            do_test(2, types[0], 1, types[0], 0, types[i]);

            do_test(1, types[0], 0, types[i], 2, types[0]);
            do_test(2, types[0], 0, types[i], 1, types[0]);

            do_test(0, types[i], 1, types[0], 2, types[0]);
            do_test(0, types[i], 2, types[0], 1, types[0]);
        }

        /* three zero-size datatypes, but non-zero count */
        do_test(1, types[0], 1, types[0], 1, types[0]);
        do_test(1, types[0], 1, types[0], 2, types[0]);
        do_test(1, types[0], 2, types[0], 1, types[0]);
        do_test(1, types[0], 2, types[0], 2, types[0]);
        do_test(2, types[0], 1, types[0], 1, types[0]);
        do_test(2, types[0], 1, types[0], 2, types[0]);
        do_test(2, types[0], 2, types[0], 1, types[0]);
    }
    MPI_Win_fence(0, win);

    MPI_Win_free(&win);
    MPI_Type_free(&types[0]);
    MPI_Type_free(&types[1]);

    if (!errs && !rank)
        printf(" No Errors\n");

    MPI_Finalize();

    return 0;
}
Example No. 13
int main(int argc, char ** argv) {
 
  int    Num_procs;       /* number of ranks                                     */
  int    Num_procsx, 
         Num_procsy;      /* number of ranks in each coord direction             */
  int    Num_groupsx, 
         Num_groupsy;     /* number of blocks in each coord direction            */
  int    my_group;        /* sequence number of shared memory block              */
  int    my_group_IDx,
         my_group_IDy;    /* coordinates of block within block grid              */
  int    group_size;      /* number of ranks in shared memory group              */
  int    group_sizex,
         group_sizey;     /* number of ranks in block in each coord direction    */
  int    my_ID;           /* MPI rank                                            */
  int    my_global_IDx, 
         my_global_IDy;   /* coordinates of rank in overall rank grid            */
  int    my_local_IDx, 
         my_local_IDy;    /* coordinates of rank within shared memory block      */
  int    right_nbr;       /* global rank of right neighboring tile               */
  int    left_nbr;        /* global rank of left neighboring tile                */
  int    top_nbr;         /* global rank of top neighboring tile                 */
  int    bottom_nbr;      /* global rank of bottom neighboring tile              */
  int    local_nbr[4];    /* list of synchronizing local neighbors               */
  int    num_local_nbrs;  /* number of synchronizing local neighbors             */
  int    dummy;
  DTYPE *top_buf_out;     /* communication buffer                                */
  DTYPE *top_buf_in;      /*       "         "                                   */
  DTYPE *bottom_buf_out;  /*       "         "                                   */
  DTYPE *bottom_buf_in;   /*       "         "                                   */
  DTYPE *right_buf_out;   /*       "         "                                   */
  DTYPE *right_buf_in;    /*       "         "                                   */
  DTYPE *left_buf_out;    /*       "         "                                   */
  DTYPE *left_buf_in;     /*       "         "                                   */
  int    root = 0;
  long   n, width, height;/* linear global and block grid dimension              */
  int    width_rank, 
         height_rank;     /* linear local dimension                              */
  int    i, j, ii, jj, kk, it, jt, iter, leftover;  /* dummies                   */
  int    istart_rank, 
         iend_rank;       /* bounds of grid tile assigned to calling rank        */
  int    jstart_rank, 
         jend_rank;       /* bounds of grid tile assigned to calling rank        */
  int    istart, iend;    /* bounds of grid block containing tile                */
  int    jstart, jend;    /* bounds of grid block containing tile                */
  DTYPE  norm,            /* L1 norm of solution                                 */
         local_norm,      /* contribution of calling rank to L1 norm             */
         reference_norm;  /* value to be matched by computed norm                */
  DTYPE  f_active_points; /* interior of grid with respect to stencil            */
  DTYPE  flops;           /* floating point ops per iteration                    */
  int    iterations;      /* number of times to run the algorithm                */
  double local_stencil_time,/* timing parameters                                 */
         stencil_time,
         avgtime; 
  int    stencil_size;    /* number of points in stencil                         */
  DTYPE  * RESTRICT in;   /* input grid values                                   */
  DTYPE  * RESTRICT out;  /* output grid values                                  */
  long   total_length_in; /* total required length to store input array          */
  long   total_length_out;/* total required length to store output array         */
  int    error=0;         /* error flag                                          */
  DTYPE  weight[2*RADIUS+1][2*RADIUS+1]; /* weights of points in the stencil     */
  MPI_Request request[8]; /* requests for sends & receives in 4 coord directions */
  MPI_Status  status[8];  /* corresponding statuses                              */
  MPI_Win shm_win_in;     /* shared memory window object for IN array            */
  MPI_Win shm_win_out;    /* shared memory window object for OUT array           */
  MPI_Comm shm_comm_prep; /* preparatory shared memory communicator              */
  MPI_Comm shm_comm;      /* Shared Memory Communicator                          */
  int shm_procs;          /* # of ranks in shared domain                         */
  int shm_ID;             /* MPI rank in shared memory domain                    */
  MPI_Aint size_in;       /* size of the IN array in shared memory window        */
  MPI_Aint size_out;      /* size of the OUT array in shared memory window       */
  int size_mul;           /* one for shm_comm root, zero for the other ranks     */
  int disp_unit;          /* ignored                                             */
 
  /*******************************************************************************
  ** Initialize the MPI environment
  ********************************************************************************/
  MPI_Init(&argc,&argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &my_ID);
  MPI_Comm_size(MPI_COMM_WORLD, &Num_procs);
 
  /*******************************************************************************
  ** process, test, and broadcast input parameters    
  ********************************************************************************/
 
  if (my_ID == root) {
    printf("Parallel Research Kernels version %s\n", PRKVERSION);
    printf("MPI+SHM stencil execution on 2D grid\n");

#ifndef STAR
      printf("ERROR: Compact stencil not supported\n");
      error = 1;
      goto ENDOFTESTS;
#endif
    
    if (argc != 4){
      printf("Usage: %s  <#ranks per coherence domain><# iterations> <array dimension> \n", 
             *argv);
      error = 1;
      goto ENDOFTESTS;
    }
 
    group_size = atoi(*++argv);
    if (group_size < 1) {
      printf("ERROR: # ranks per coherence domain must be >= 1 : %d \n",group_size);
      error = 1;
      goto ENDOFTESTS;
    } 
    if (Num_procs%group_size) {
      printf("ERROR: total # %d ranks not divisible by ranks per coherence domain %d\n",
	     Num_procs, group_size);
      error = 1;
      goto ENDOFTESTS;
    } 

    iterations  = atoi(*++argv); 
    if (iterations < 1){
      printf("ERROR: iterations must be >= 1 : %d \n",iterations);
      error = 1;
      goto ENDOFTESTS;  
    }
 
    n  = atol(*++argv);
    long nsquare = n * n;
    if (nsquare < Num_procs){ 
      printf("ERROR: grid size must be at least # ranks: %ld\n", nsquare);
      error = 1;
      goto ENDOFTESTS;
    }
 
    if (RADIUS < 0) {
      printf("ERROR: Stencil radius %d should be non-negative\n", RADIUS);
      error = 1;
      goto ENDOFTESTS;  
    }
 
    if (2*RADIUS +1 > n) {
      printf("ERROR: Stencil radius %d exceeds grid size %ld\n", RADIUS, n);
      error = 1;
      goto ENDOFTESTS;  
    }
 
    ENDOFTESTS:;  
  }
  bail_out(error);

  MPI_Bcast(&n,          1, MPI_LONG, root, MPI_COMM_WORLD);
  MPI_Bcast(&iterations, 1, MPI_INT, root, MPI_COMM_WORLD);
  MPI_Bcast(&group_size, 1, MPI_INT, root, MPI_COMM_WORLD);
 
  /* determine best way to create a 2D grid of ranks (closest to square, for 
     best surface/volume ratio); we do this brute force for now. The 
     decomposition needs to be such that shared memory groups can evenly
     tessellate the rank grid
  */
  for (Num_procsx=(int) (sqrt(Num_procs+1)); Num_procsx>0; Num_procsx--) {
    if (!(Num_procs%Num_procsx)) {
      Num_procsy = Num_procs/Num_procsx;
      for (group_sizex=(int)(sqrt(group_size+1)); group_sizex>0; group_sizex--) {
        if (!(group_size%group_sizex) && !(Num_procsx%group_sizex)) {
          group_sizey=group_size/group_sizex;
          break;
        }
      }
      if (!(Num_procsy%group_sizey)) break;
    }
  }      
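
  /* Example of the factorization above (illustration only): with Num_procs=16 and
     group_size=4, the outer loop settles on a 4x4 rank grid (Num_procsx=Num_procsy=4)
     and the inner loop on 2x2 shared memory groups (group_sizex=group_sizey=2),
     so the 2x2 groups tessellate the 4x4 rank grid exactly                          */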


  if (my_ID == root) {
    printf("Number of ranks                 = %d\n", Num_procs);
    printf("Grid size                       = %ld\n", n);
    printf("Radius of stencil               = %d\n", RADIUS);
    printf("Tiles in x/y-direction          = %d/%d\n", Num_procsx, Num_procsy);
    printf("Tiles per shared memory domain  = %d\n", group_size);
    printf("Tiles in x/y-direction in group = %d/%d\n", group_sizex,  group_sizey);
    printf("Type of stencil                 = star\n");
#ifdef LOCAL_BARRIER_SYNCH
    printf("Local synchronization           = barrier\n");
#else
    printf("Local synchronization           = point to point\n");
#endif
#ifdef DOUBLE
    printf("Data type                       = double precision\n");
#else
    printf("Data type                       = single precision\n");
#endif
#if LOOPGEN
    printf("Script used to expand stencil loop body\n");
#else
    printf("Compact representation of stencil loop body\n");
#endif
    printf("Number of iterations            = %d\n", iterations);
  }

  /* Setup for Shared memory regions */

  /* first divide WORLD in groups of size group_size */
  MPI_Comm_split(MPI_COMM_WORLD, my_ID/group_size, my_ID%group_size, &shm_comm_prep);
  /* derive from that an SHM communicator */
  MPI_Comm_split_type(shm_comm_prep, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shm_comm);
  MPI_Comm_rank(shm_comm, &shm_ID);
  MPI_Comm_size(shm_comm, &shm_procs);
  /* do sanity check, making sure groups did not shrink in second comm split */
  if (shm_procs != group_size) MPI_Abort(MPI_COMM_WORLD, 666);
  
  Num_groupsx = Num_procsx/group_sizex;
  Num_groupsy = Num_procsy/group_sizey;

  my_group = my_ID/group_size;
  my_group_IDx = my_group%Num_groupsx;
  my_group_IDy = my_group/Num_groupsx;
  my_local_IDx = my_ID%group_sizex;
  my_local_IDy = (my_ID%group_size)/group_sizex;
  my_global_IDx = my_group_IDx*group_sizex+my_local_IDx;
  my_global_IDy = my_group_IDy*group_sizey+my_local_IDy;
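
  /* Example of the index mapping above (illustration only): with Num_procs=16,
     group_size=4 and group_sizex=group_sizey=2 (so Num_groupsx=Num_groupsy=2),
     rank my_ID=6 gets my_group=1, (my_group_IDx,my_group_IDy)=(1,0),
     (my_local_IDx,my_local_IDy)=(0,1), and global tile coordinates
     (my_global_IDx,my_global_IDy)=(2,1)                                             */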

  /* set all neighboring ranks to -1 (no communication with those ranks) */
  left_nbr = right_nbr = top_nbr = bottom_nbr = -1;
  /* keep track of local neighbors for local synchronization             */
  num_local_nbrs = 0;

  if (my_local_IDx == group_sizex-1 && my_group_IDx != (Num_groupsx-1)) {
    right_nbr = (my_group+1)*group_size+shm_ID-group_sizex+1;
  }
  if (my_local_IDx != group_sizex-1) {
    local_nbr[num_local_nbrs++] = shm_ID + 1;
  }

  if (my_local_IDx == 0 && my_group_IDx != 0) {
    left_nbr = (my_group-1)*group_size+shm_ID+group_sizex-1;
  }
  if (my_local_IDx != 0) {
    local_nbr[num_local_nbrs++] = shm_ID - 1;
  }

  if (my_local_IDy == group_sizey-1 && my_group_IDy != (Num_groupsy-1)) {
    top_nbr = (my_group+Num_groupsx)*group_size + my_local_IDx;
  }
  if (my_local_IDy != group_sizey-1) {
    local_nbr[num_local_nbrs++] = shm_ID + group_sizex;
  }

  if (my_local_IDy == 0 && my_group_IDy != 0) {
    bottom_nbr = (my_group-Num_groupsx)*group_size + group_sizex*(group_sizey-1)+my_local_IDx;
  }
  if (my_local_IDy != 0) {
    local_nbr[num_local_nbrs++] = shm_ID - group_sizex;
  }
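
  /* Summary of the neighbor setup above: a neighbor inside the same shared memory
     group needs no message passing (its part of IN is directly addressable), so only
     its shm_comm rank is recorded in local_nbr[] for synchronization; a neighbor in
     an adjacent group is reached by MPI messages via its global rank, e.g. the right
     neighbor is the rank at local column 0 of the same local row in group my_group+1:
     (my_group+1)*group_size + shm_ID - group_sizex + 1                              */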

  /* compute amount of space required for input and solution arrays for the block,
     and also compute index sets                                                  */
  
  width = n/Num_groupsx;
  leftover = n%Num_groupsx;
  if (my_group_IDx<leftover) {
    istart = (width+1) * my_group_IDx; 
    iend = istart + width;
  }
  else {
    istart = (width+1) * leftover + width * (my_group_IDx-leftover);
    iend = istart + width - 1;
  }
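
  /* Example of the block decomposition above (illustration only): n=10 and
     Num_groupsx=3 gives width=3, leftover=1; group 0 owns columns 0..3 (width 4),
     groups 1 and 2 own columns 4..6 and 7..9 (width 3 each), so the leftover
     column goes to the lowest-numbered group                                        */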
  
  width = iend - istart + 1;
  if (width == 0) {
    printf("ERROR: rank %d has no work to do\n", my_ID);
    error = 1;
  }
  bail_out(error);
 
  height = n/Num_groupsy;
  leftover = n%Num_groupsy;
  if (my_group_IDy<leftover) {
    jstart = (height+1) * my_group_IDy; 
    jend = jstart + height;
  }
  else {
    jstart = (height+1) * leftover + height * (my_group_IDy-leftover);
    jend = jstart + height - 1;
  }
  
  height = jend - jstart + 1;
  if (height == 0) {
    printf("ERROR: rank %d has no work to do\n", my_ID);
    error = 1;
  }
  bail_out(error);
 
  if (width < RADIUS || height < RADIUS) {
    printf("ERROR: rank %d has work tile smaller then stencil radius; w=%ld,h=%ld\n",
           my_ID, width, height);
    error = 1;
  }
  bail_out(error);
 
  total_length_in = (width+2*RADIUS)*(height+2*RADIUS)*sizeof(DTYPE);
  total_length_out = width*height*sizeof(DTYPE);

  /* only the root of each SHM domain specifies window of nonzero size */
  size_mul = (shm_ID==0);  
  size_in= total_length_in*size_mul; 
  MPI_Win_allocate_shared(size_in, sizeof(double), MPI_INFO_NULL, shm_comm, 
                          (void *) &in, &shm_win_in);
  MPI_Win_lock_all(MPI_MODE_NOCHECK, shm_win_in);
  MPI_Win_shared_query(shm_win_in, MPI_PROC_NULL, &size_in, &disp_unit, (void *)&in);
  if (in == NULL){
    printf("Error allocating space for input array by group %d\n",my_group);
    error = 1;
  }
  bail_out(error);

  size_out= total_length_out*size_mul;
  MPI_Win_allocate_shared(size_out, sizeof(double), MPI_INFO_NULL, shm_comm, 
                          (void *) &out, &shm_win_out);
  MPI_Win_lock_all(MPI_MODE_NOCHECK, shm_win_out);
  MPI_Win_shared_query(shm_win_out, MPI_PROC_NULL, &size_out, &disp_unit, (void *)&out);
  if (out == NULL){
    printf("Error allocating space for output array by group %d\n", my_group);
    error = 1;
  }
  bail_out(error);
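
  /* Note on the shared allocations above: only shm_ID==0 passes a nonzero size to
     MPI_Win_allocate_shared; MPI_Win_shared_query with rank MPI_PROC_NULL returns
     the base address of the lowest rank that allocated a nonzero segment, so every
     rank in shm_comm ends up with a pointer to the same block. MPI_Win_lock_all
     opens a passive-target epoch during which the block is accessed with ordinary
     loads and stores, ordered by MPI_Win_sync                                       */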

  /* determine index set assigned to each rank                         */

  width_rank = width/group_sizex;
  leftover = width%group_sizex;
  if (my_local_IDx<leftover) {
    istart_rank = (width_rank+1) * my_local_IDx; 
    iend_rank = istart_rank + width_rank;
  }
  else {
    istart_rank = (width_rank+1) * leftover + width_rank * (my_local_IDx-leftover);
    iend_rank = istart_rank + width_rank - 1;
  }
  istart_rank += istart;
  iend_rank += istart;
  width_rank = iend_rank - istart_rank + 1;   

  height_rank = height/group_sizey;
  leftover = height%group_sizey;
  if (my_local_IDy<leftover) {
    jstart_rank = (height_rank+1) * my_local_IDy; 
    jend_rank = jstart_rank + height_rank;
  }
  else {
    jstart_rank = (height_rank+1) * leftover + height_rank * (my_local_IDy-leftover);
    jend_rank = jstart_rank + height_rank - 1;
  }
  jstart_rank+=jstart;
  jend_rank+=jstart;
  height_rank = jend_rank - jstart_rank + 1;

  if (height_rank*width_rank==0) {
    error = 1;
    printf("Rank %d has no work to do\n", my_ID);
  }
  bail_out(error);

  /* allocate communication buffers for halo values                            */
  top_buf_out = (DTYPE *) malloc(4*sizeof(DTYPE)*RADIUS*width_rank);
  if (!top_buf_out) {
    printf("ERROR: Rank %d could not allocated comm buffers for y-direction\n", my_ID);
    error = 1;
  }
  bail_out(error);
  top_buf_in     = top_buf_out +   RADIUS*width_rank;
  bottom_buf_out = top_buf_out + 2*RADIUS*width_rank;
  bottom_buf_in  = top_buf_out + 3*RADIUS*width_rank;
 
  right_buf_out = (DTYPE *) malloc(4*sizeof(DTYPE)*RADIUS*height_rank);
  if (!right_buf_out) { 
    printf("ERROR: Rank %d could not allocated comm buffers for x-direction\n", my_ID);
    error = 1;
  }
  bail_out(error);
  right_buf_in   = right_buf_out +   RADIUS*height_rank;
  left_buf_out   = right_buf_out + 2*RADIUS*height_rank;
  left_buf_in    = right_buf_out + 3*RADIUS*height_rank;

  /* fill the stencil weights to reflect a discrete divergence operator           */
  for (jj=-RADIUS; jj<=RADIUS; jj++) for (ii=-RADIUS; ii<=RADIUS; ii++)
    WEIGHT(ii,jj) = (DTYPE) 0.0;
  stencil_size = 4*RADIUS+1;
  for (ii=1; ii<=RADIUS; ii++) {
    WEIGHT(0, ii) = WEIGHT( ii,0) =  (DTYPE) (1.0/(2.0*ii*RADIUS));
    WEIGHT(0,-ii) = WEIGHT(-ii,0) = -(DTYPE) (1.0/(2.0*ii*RADIUS));
  }
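
  /* Example of the star weights above (illustration only): for RADIUS=2 the stencil
     has 4*2+1 = 9 points; WEIGHT(0,1)=WEIGHT(1,0)=1/4, WEIGHT(0,2)=WEIGHT(2,0)=1/8,
     the mirrored offsets carry the negated values, and the center weight stays zero */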

  norm = (DTYPE) 0.0;
  f_active_points = (DTYPE) (n-2*RADIUS)*(DTYPE) (n-2*RADIUS);
  /* initialize the input and output arrays                                    */
  for (j=jstart_rank; j<=jend_rank; j++) for (i=istart_rank; i<=iend_rank; i++) {
    IN(i,j)  = COEFX*i+COEFY*j;
    OUT(i,j) = (DTYPE)0.0;
  }

  /* LOAD/STORE FENCE */
  MPI_Win_sync(shm_win_in);
  MPI_Win_sync(shm_win_out);
  MPI_Barrier(shm_comm); 
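
  /* The sync/barrier pair above is the usual publication pattern for MPI-3 shared
     memory: MPI_Win_sync acts as a memory barrier that completes this rank's stores
     to the window, and the barrier on shm_comm guarantees every rank has done so
     before any rank starts reading its neighbors' portions of IN                    */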

  for (iter = 0; iter<=iterations; iter++){

    /* start timer after a warmup iteration */
    if (iter == 1) { 
      MPI_Barrier(MPI_COMM_WORLD);
      local_stencil_time = wtime();
    }

    /* need to fetch ghost point data from neighbors in y-direction                 */
    if (top_nbr != -1) {
      MPI_Irecv(top_buf_in, RADIUS*width_rank, MPI_DTYPE, top_nbr, 101,
                MPI_COMM_WORLD, &(request[1]));
      for (kk=0,j=jend_rank-RADIUS+1; j<=jend_rank; j++) 
      for (i=istart_rank; i<=iend_rank; i++) {
        top_buf_out[kk++]= IN(i,j);
      }
      MPI_Isend(top_buf_out, RADIUS*width_rank,MPI_DTYPE, top_nbr, 99, 
                MPI_COMM_WORLD, &(request[0]));
    }

    if (bottom_nbr != -1) {
      MPI_Irecv(bottom_buf_in,RADIUS*width_rank, MPI_DTYPE, bottom_nbr, 99, 
                MPI_COMM_WORLD, &(request[3]));
      for (kk=0,j=jstart_rank; j<=jstart_rank+RADIUS-1; j++) 
      for (i=istart_rank; i<=iend_rank; i++) {
        bottom_buf_out[kk++]= IN(i,j);
      }
      MPI_Isend(bottom_buf_out, RADIUS*width_rank, MPI_DTYPE, bottom_nbr, 101,
                MPI_COMM_WORLD, &(request[2]));
    }

    if (top_nbr != -1) {
      MPI_Wait(&(request[0]), &(status[0]));
      MPI_Wait(&(request[1]), &(status[1]));
      for (kk=0,j=jend_rank+1; j<=jend_rank+RADIUS; j++) 
      for (i=istart_rank; i<=iend_rank; i++) {
        IN(i,j) = top_buf_in[kk++];
      }
    }

    if (bottom_nbr != -1) {    
      MPI_Wait(&(request[2]), &(status[2]));
      MPI_Wait(&(request[3]), &(status[3]));
      for (kk=0,j=jstart_rank-RADIUS; j<=jstart_rank-1; j++) 
      for (i=istart_rank; i<=iend_rank; i++) {
        IN(i,j) = bottom_buf_in[kk++];
      }
    }
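
    /* Notes on the y-direction exchange above: halo rows are packed into contiguous
       buffers because a rank's tile rows are only a slice of the group-wide IN array;
       tag 99 travels upward (matched by the neighbor's bottom receive) and tag 101
       travels downward (matched by the neighbor's top receive), and each rank waits
       on both its send and its receive before unpacking                             */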

    /* LOAD/STORE FENCE */
    MPI_Win_sync(shm_win_in);

    /* need to fetch ghost point data from neighbors in x-direction                 */
    if (right_nbr != -1) {
      MPI_Irecv(right_buf_in, RADIUS*height_rank, MPI_DTYPE, right_nbr, 1010,
                MPI_COMM_WORLD, &(request[1+4]));
      for (kk=0,j=jstart_rank; j<=jend_rank; j++) 
      for (i=iend_rank-RADIUS+1; i<=iend_rank; i++) {
        right_buf_out[kk++]= IN(i,j);
      }
      MPI_Isend(right_buf_out, RADIUS*height_rank, MPI_DTYPE, right_nbr, 990, 
                MPI_COMM_WORLD, &(request[0+4]));
    }

    if (left_nbr != -1) {
      MPI_Irecv(left_buf_in, RADIUS*height_rank, MPI_DTYPE, left_nbr, 990, 
                MPI_COMM_WORLD, &(request[3+4]));
      for (kk=0,j=jstart_rank; j<=jend_rank; j++) 
      for (i=istart_rank; i<=istart_rank+RADIUS-1; i++) {
        left_buf_out[kk++]= IN(i,j);
      }
      MPI_Isend(left_buf_out, RADIUS*height_rank, MPI_DTYPE, left_nbr, 1010,
                MPI_COMM_WORLD, &(request[2+4]));
    }

    if (right_nbr != -1) {
      MPI_Wait(&(request[0+4]), &(status[0+4]));
      MPI_Wait(&(request[1+4]), &(status[1+4]));
      for (kk=0,j=jstart_rank; j<=jend_rank; j++) 
      for (i=iend_rank+1; i<=iend_rank+RADIUS; i++) {
        IN(i,j) = right_buf_in[kk++];
      }
    }

    if (left_nbr != -1) {
      MPI_Wait(&(request[2+4]), &(status[2+4]));
      MPI_Wait(&(request[3+4]), &(status[3+4]));
      for (kk=0,j=jstart_rank; j<=jend_rank; j++) 
      for (i=istart_rank-RADIUS; i<=istart_rank-1; i++) {
        IN(i,j) = left_buf_in[kk++];
      }
    }

    /* LOAD/STORE FENCE */
    MPI_Win_sync(shm_win_in);

    /* Apply the stencil operator */
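    /* the MAX/MIN bounds clip each rank's index range to the global interior,
       skipping the RADIUS-wide border where the stencil would reach outside the grid */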
    for (j=MAX(jstart_rank,RADIUS); j<=MIN(n-RADIUS-1,jend_rank); j++) {
      for (i=MAX(istart_rank,RADIUS); i<=MIN(n-RADIUS-1,iend_rank); i++) {
        #if LOOPGEN
          #include "loop_body_star.incl"
        #else
          for (jj=-RADIUS; jj<=RADIUS; jj++) OUT(i,j) += WEIGHT(0,jj)*IN(i,j+jj);
          for (ii=-RADIUS; ii<0; ii++)       OUT(i,j) += WEIGHT(ii,0)*IN(i+ii,j);
          for (ii=1; ii<=RADIUS; ii++)       OUT(i,j) += WEIGHT(ii,0)*IN(i+ii,j);
        #endif
      }
    }

    /* LOAD/STORE FENCE */
    MPI_Win_sync(shm_win_out);

#ifdef LOCAL_BARRIER_SYNCH
    MPI_Barrier(shm_comm); // needed to avoid writing IN while other ranks are reading it
#else
    for (i=0; i<num_local_nbrs; i++) {
      MPI_Irecv(&dummy, 0, MPI_INT, local_nbr[i], 666, shm_comm, &(request[i]));
      MPI_Send(&dummy, 0, MPI_INT, local_nbr[i], 666, shm_comm);
    }
    MPI_Waitall(num_local_nbrs, request, status);
#endif
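
    /* In the point-to-point variant above, each rank exchanges a zero-byte message
       with every local neighbor and waits for all of them; this acts as a pairwise
       barrier that synchronizes only ranks that actually share halo data, instead
       of a full barrier over the shared memory communicator                         */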

    /* add constant to solution to force refresh of neighbor data, if any */
    for (j=jstart_rank; j<=jend_rank; j++) 
    for (i=istart_rank; i<=iend_rank; i++) IN(i,j)+= 1.0;

    /* LOAD/STORE FENCE */
    MPI_Win_sync(shm_win_in);

#ifdef LOCAL_BARRIER_SYNCH
    MPI_Barrier(shm_comm); // needed to avoid reading IN while other ranks are writing it
#else
    for (i=0; i<num_local_nbrs; i++) {
      MPI_Irecv(&dummy, 0, MPI_INT, local_nbr[i], 666, shm_comm, &(request[i]));
      MPI_Send(&dummy, 0, MPI_INT, local_nbr[i], 666, shm_comm);
    }
    MPI_Waitall(num_local_nbrs, request, status);
#endif
 
  } /* end of iterations                                                   */
 
  local_stencil_time = wtime() - local_stencil_time;
  MPI_Reduce(&local_stencil_time, &stencil_time, 1, MPI_DOUBLE, MPI_MAX, root,
             MPI_COMM_WORLD);
  
  /* compute L1 norm in parallel                                                */
  local_norm = (DTYPE) 0.0;
  for (j=MAX(jstart_rank,RADIUS); j<=MIN(n-RADIUS-1,jend_rank); j++) {
    for (i=MAX(istart_rank,RADIUS); i<=MIN(n-RADIUS-1,iend_rank); i++) {
      local_norm += (DTYPE)ABS(OUT(i,j));
    }
  }
 
  MPI_Reduce(&local_norm, &norm, 1, MPI_DTYPE, MPI_SUM, root, MPI_COMM_WORLD);
 
  /*******************************************************************************
  ** Analyze and output results.
  ********************************************************************************/
 
/* verify correctness                                                            */
  if (my_ID == root) {
    norm /= f_active_points;
    if (RADIUS > 0) {
      reference_norm = (DTYPE) (iterations+1) * (COEFX + COEFY);
    }
    else {
      reference_norm = (DTYPE) 0.0;
    }
    if (ABS(norm-reference_norm) > EPSILON) {
      printf("ERROR: L1 norm = "FSTR", Reference L1 norm = "FSTR"\n",
             norm, reference_norm);
      error = 1;
    }
    else {
      printf("Solution validates\n");
#ifdef VERBOSE
      printf("Reference L1 norm = "FSTR", L1 norm = "FSTR"\n", 
             reference_norm, norm);
#endif
    }
  }
  bail_out(error);
 
  MPI_Win_unlock_all(shm_win_in);
  MPI_Win_unlock_all(shm_win_out);
  MPI_Win_free(&shm_win_in);
  MPI_Win_free(&shm_win_out);

  if (my_ID == root) {
    /* flops/stencil: 2 flops (fma) for each point in the stencil, 
       plus one flop for the update of the input array                   */
    flops = (DTYPE) (2*stencil_size+1) * f_active_points;
    avgtime = stencil_time/iterations;
    printf("Rate (MFlops/s): "FSTR"  Avg time (s): %lf\n",
           1.0E-06 * flops/avgtime, avgtime);
  }
 
  MPI_Finalize();
  exit(EXIT_SUCCESS);
}
Example no. 14
/* Run Get_accumulate with Post/Start/Complete/Wait */
void run_get_acc_with_pscw(int rank, WINDOW type)
{
    int destrank, size, i;
    MPI_Aint disp = 0;
    MPI_Win     win;

    MPI_Group       comm_group, group;
    MPI_CHECK(MPI_Comm_group(MPI_COMM_WORLD, &comm_group));
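
    /* Sketch of the benchmark structure (the helpers allocate_memory and MPI_CHECK,
       the buffers sbuf/rbuf/cbuf, and the counters loop/skip/t_start/t_end are
       assumed to come from the surrounding OSU micro-benchmark code): for each
       message size, rank 0 alternates access epochs (MPI_Win_start/complete) in
       which it issues MPI_Get_accumulate with exposure epochs (MPI_Win_post/wait);
       rank 1 runs the mirror sequence, and the reported latency is half the
       averaged round-trip time */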

    for (size = 0; size <= MAX_SIZE; size = (size ? size * 2 : 1)) {
        allocate_memory(rank, rbuf, size, type, &win);

        if (type == WIN_DYNAMIC) {
            disp = sdisp_remote;
        }

        if (size > LARGE_MESSAGE_SIZE) {
            loop = LOOP_LARGE;
            skip = SKIP_LARGE;
        }
        
        MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));

        if (rank == 0) {
            destrank = 1;

            MPI_CHECK(MPI_Group_incl(comm_group, 1, &destrank, &group));
            MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));

            for (i = 0; i < skip + loop; i++) {
                MPI_CHECK(MPI_Win_start (group, 0, win));
                if (i == skip) {
                    t_start = MPI_Wtime ();
                }
                MPI_CHECK(MPI_Get_accumulate(sbuf, size, MPI_CHAR, cbuf, size, MPI_CHAR, 1, disp, size,
                    MPI_CHAR, MPI_SUM, win));
                MPI_CHECK(MPI_Win_complete(win));
                MPI_CHECK(MPI_Win_post(group, 0, win));
                MPI_CHECK(MPI_Win_wait(win));
            }

            t_end = MPI_Wtime ();
        } else {
            /* rank=1 */
            destrank = 0;

            MPI_CHECK(MPI_Group_incl(comm_group, 1, &destrank, &group));
            MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));

            for (i = 0; i < skip + loop; i++) {
                MPI_CHECK(MPI_Win_post(group, 0, win));
                MPI_CHECK(MPI_Win_wait(win));
                MPI_CHECK(MPI_Win_start(group, 0, win));
                MPI_CHECK(MPI_Get_accumulate(sbuf, size, MPI_CHAR, cbuf, size, MPI_CHAR, 0, disp, size,
                    MPI_CHAR, MPI_SUM, win));
                MPI_CHECK(MPI_Win_complete(win));
            }
        }

        MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));

        if (rank == 0) {
            fprintf(stdout, "%-*d%*.*f\n", 10, size, FIELD_WIDTH,
                    FLOAT_PRECISION, (t_end - t_start) * 1.0e6 / loop / 2);
            fflush(stdout);
        }

        MPI_CHECK(MPI_Group_free(&group));

        MPI_Win_free(&win);
    }

    MPI_CHECK(MPI_Group_free(&comm_group));
}
Example no. 15
int main(int argc, char *argv[])
{
    int errs = 0, err;
    int rank, size, source, dest;
    int minsize = 2, count;
    MPI_Comm comm;
    MPI_Win win;
    MPI_Aint extent;
    MTestDatatype sendtype, recvtype;

    MTest_Init(&argc, &argv);

    /* The following illustrates the use of the routines to
     * run through a selection of communicators and datatypes.
     * Use subsets of these for tests that do not involve combinations
     * of communicators, datatypes, and counts of datatypes */
    while (MTestGetIntracommGeneral(&comm, minsize, 1)) {
        if (comm == MPI_COMM_NULL)
            continue;
        /* Determine the sender and receiver */
        MPI_Comm_rank(comm, &rank);
        MPI_Comm_size(comm, &size);
        source = 0;
        dest = size - 1;

        MTEST_DATATYPE_FOR_EACH_COUNT(count) {
            while (MTestGetDatatypes(&sendtype, &recvtype, count)) {
                /* Make sure that everyone has a recv buffer */
                recvtype.InitBuf(&recvtype);

                MPI_Type_extent(recvtype.datatype, &extent);
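                /* note: MPI_Type_extent is the deprecated MPI-1 call (removed in
                   MPI-3.0); MPI_Type_get_extent, used in the next example, is the
                   current equivalent */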
                MPI_Win_create(recvtype.buf, recvtype.count * extent,
                               (int) extent, MPI_INFO_NULL, comm, &win);
                MPI_Win_fence(0, win);
                if (rank == source) {
                    sendtype.InitBuf(&sendtype);

                    /* To improve reporting of problems about operations, we
                     * change the error handler to errors return */
                    MPI_Win_set_errhandler(win, MPI_ERRORS_RETURN);

                    /* MPI_REPLACE on accumulate is almost the same
                     * as MPI_Put; the only difference is in the
                     * handling of overlapping accumulate operations,
                     * which are not tested here */
                    err = MPI_Accumulate(sendtype.buf, sendtype.count,
                                         sendtype.datatype, dest, 0,
                                         recvtype.count, recvtype.datatype, MPI_REPLACE, win);
                    if (err) {
                        errs++;
                        if (errs < 10) {
                            printf("Accumulate types: send %s, recv %s\n",
                                   MTestGetDatatypeName(&sendtype),
                                   MTestGetDatatypeName(&recvtype));
                            MTestPrintError(err);
                        }
                    }
                    err = MPI_Win_fence(0, win);
                    if (err) {
                        errs++;
                        if (errs < 10) {
                            MTestPrintError(err);
                        }
                    }
                }
                else if (rank == dest) {
                    MPI_Win_fence(0, win);
                    /* This should have the same effect, in terms of
                     * transfering data, as a send/recv pair */
                    err = MTestCheckRecv(0, &recvtype);
                    if (err) {
                        errs += err;
                    }
                }
                else {
                    MPI_Win_fence(0, win);
                }
                MPI_Win_free(&win);
                MTestFreeDatatype(&sendtype);
                MTestFreeDatatype(&recvtype);
            }
        }
        MTestFreeComm(&comm);
    }

    MTest_Finalize(errs);
    MPI_Finalize();
    return 0;
}
Example no. 16
int main(int argc, char *argv[])
{
    int errs = 0;
    int rank, size;
    int minsize = 2, count;
    MPI_Comm comm;
    MPI_Win win;
    MPI_Aint lb, extent;
    MTestDatatype sendtype, recvtype;

    MTest_Init(&argc, &argv);

    while (MTestGetIntracommGeneral(&comm, minsize, 1)) {
        if (comm == MPI_COMM_NULL)
            continue;

        MPI_Comm_rank(comm, &rank);
        MPI_Comm_size(comm, &size);
        int source = 0;

        MTEST_DATATYPE_FOR_EACH_COUNT(count) {
            while (MTestGetDatatypes(&sendtype, &recvtype, count)) {
                recvtype.printErrors = 1;
                recvtype.InitBuf(&recvtype);
                MPI_Type_get_extent(recvtype.datatype, &lb, &extent);

                MPI_Win_create(recvtype.buf, lb + recvtype.count * extent,
                               (int) extent, MPI_INFO_NULL, comm, &win);
                if (rank == source) {
                    int dest;
                    sendtype.InitBuf(&sendtype);

                    MPI_Win_lock_all(0, win);
                    for (dest = 0; dest < size; dest++)
                        if (dest != source) {
                            MPI_Accumulate(sendtype.buf, sendtype.count,
                                           sendtype.datatype, dest, 0,
                                           recvtype.count, recvtype.datatype, MPI_REPLACE, win);
                        }
                    MPI_Win_unlock_all(win);
                    MPI_Barrier(comm);

                    char *resbuf = (char *) calloc(lb + extent * recvtype.count, sizeof(char));

                    /*wait for the destinations to finish checking and reinitializing the buffers */
                    MPI_Barrier(comm);

                    MPI_Win_lock_all(0, win);
                    for (dest = 0; dest < size; dest++)
                        if (dest != source) {
                            MPI_Get_accumulate(sendtype.buf, sendtype.count,
                                               sendtype.datatype, resbuf, recvtype.count,
                                               recvtype.datatype, dest, 0, recvtype.count,
                                               recvtype.datatype, MPI_REPLACE, win);

                        }
                    MPI_Win_unlock_all(win);
                    MPI_Barrier(comm);
                    free(resbuf);
                }
                else {
                    int err;
                    MPI_Barrier(comm);
                    MPI_Win_lock(MPI_LOCK_SHARED, rank, 0, win);
                    err = MTestCheckRecv(0, &recvtype);
                    if (err)
                        errs++;
                    recvtype.InitBuf(&recvtype);
                    MPI_Win_unlock(rank, win);

                    /*signal the source that checking and reinitialization is done */
                    MPI_Barrier(comm);

                    MPI_Barrier(comm);
                    MPI_Win_lock(MPI_LOCK_SHARED, rank, 0, win);
                    err = MTestCheckRecv(0, &recvtype);
                    if (err)
                        errs++;
                    MPI_Win_unlock(rank, win);
                }

                MPI_Win_free(&win);
                MTestFreeDatatype(&sendtype);
                MTestFreeDatatype(&recvtype);
            }
        }
        MTestFreeComm(&comm);
    }
    MTest_Finalize(errs);
    MPI_Finalize();
    return 0;
}
Example no. 17
int main( int argc, char *argv[] )
{

    int rank, destrank, nprocs, i;
    MPI_Group comm_group, group;
    MPI_Win win;

    int loop;
    int size;
    double t_start, t_end;

    int count, align_size;
    int * s_buf;
    int * r_buf;
 
    MPI_Init(&argc,&argv);
    MPI_Comm_size(MPI_COMM_WORLD,&nprocs);
    MPI_Comm_rank(MPI_COMM_WORLD,&rank);
    MPI_Comm_group(MPI_COMM_WORLD, &comm_group);

    loop        = LOOP;
    align_size  = MESSAGE_ALIGNMENT;

    s_buf =
        (int *) (((unsigned long) s_buf_original + (align_size - 1)) /
                  align_size * align_size);
    r_buf =
        (int *) (((unsigned long) r_buf_original + (align_size - 1)) /
                  align_size * align_size);
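
    /* the expressions above round the buffer addresses up to the next multiple of
       align_size; s_buf_original and r_buf_original are assumed to be file-scope
       buffers (defined elsewhere in the benchmark) large enough to hold MAX_SIZE
       bytes plus the alignment slack */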

    for (i=0; i < MAX_SIZE/sizeof(int); i++)
    {
      r_buf[i] = i;
      s_buf[i] = 2*i;

    }

    if (rank == 0) {
        fprintf(stdout, "# OSU MPI2 Accumulate Latency Test (Version 1.0)\n");
        fprintf(stdout, "# Size\t\tLatency (us) \n");
    }

    for (count = 0; count <= MAX_SIZE/sizeof(int);
         count = (count ? count * 2 : count + 1)) {
      size = count * sizeof(int);
      if (rank == 0)
      {
            MPI_Win_create(r_buf, size, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &win);
            destrank = 1;
            MPI_Group_incl(comm_group, 1, &destrank, &group);
            MPI_Barrier( MPI_COMM_WORLD);
      
            for (i=0;i<SKIP+loop;i++)
            {
                MPI_Win_start(group, 0, win);
                if (i==SKIP) t_start=MPI_Wtime();
                MPI_Accumulate(s_buf, count, MPI_INT, 1, 0, count, MPI_INT,MPI_SUM, win);
                MPI_Win_complete(win);
                MPI_Win_post(group, 0, win);
                MPI_Win_wait(win);
            }
            t_end=MPI_Wtime();
            MPI_Win_free(&win);

      }
      else
      {
            MPI_Win_create(r_buf, size, 1, MPI_INFO_NULL,MPI_COMM_WORLD, &win);
            destrank = 0;
            MPI_Group_incl(comm_group, 1, &destrank, &group);
            MPI_Barrier( MPI_COMM_WORLD);

            for (i=0;i<SKIP+loop;i++)
            {
                MPI_Win_post(group, 0, win);
                MPI_Win_wait(win);
                MPI_Win_start(group, 0, win);
                MPI_Accumulate(s_buf, count, MPI_INT, 0, 0, count, MPI_INT,MPI_SUM, win);
                MPI_Win_complete(win);

            }
            MPI_Win_free(&win);

      }

      if(rank == 0)
      {
        printf("%d\t\t%f\n",size, (t_end-t_start)*1.0e6/loop/2);
        fflush(stdout);  
      }

   } //end of for loop

  MPI_Finalize();
  return 0; 
}
Example no. 18
int main(int argc, char **argv) {
    int itr, i, j, rank, nranks, peer, bufsize, errors = 0;
    double *buffer, *src_buf;
    MPI_Win buf_win;

    MPI_Init(&argc, &argv);

    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nranks);

    bufsize = XDIM * YDIM * sizeof(double);
    MPI_Alloc_mem(bufsize, MPI_INFO_NULL, &buffer);
    MPI_Alloc_mem(bufsize, MPI_INFO_NULL, &src_buf);

    if (rank == 0)
        printf("MPI RMA Strided Accumulate Test:\n");

    for (i = 0; i < XDIM*YDIM; i++) {
        *(buffer  + i) = 1.0 + rank;
        *(src_buf + i) = 1.0 + rank;
    }

    MPI_Win_create(buffer, bufsize, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &buf_win);

    peer = (rank+1) % nranks;

    for (itr = 0; itr < ITERATIONS; itr++) {

      MPI_Win_lock(MPI_LOCK_EXCLUSIVE, peer, 0, buf_win);

      for (j = 0; j < YDIM; j++) {
        MPI_Accumulate(src_buf + j*XDIM, XDIM, MPI_DOUBLE, peer,
                       j*XDIM*sizeof(double), XDIM, MPI_DOUBLE, MPI_SUM, buf_win);
      }

      MPI_Win_unlock(peer, buf_win);
    }
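
    /* each iteration above takes an exclusive lock on the peer's window and issues
       one MPI_Accumulate per row; since the window's displacement unit is 1, the
       target displacement j*XDIM*sizeof(double) is a byte offset, so row j of the
       peer's buffer accumulates (MPI_SUM) row j of src_buf */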

    MPI_Barrier(MPI_COMM_WORLD);

    MPI_Win_lock(MPI_LOCK_EXCLUSIVE, rank, 0, buf_win);
    for (i = 0; i < XDIM; i++) {
      for (j = 0; j < YDIM; j++) {
        const double actual   = *(buffer + i + j*XDIM);
        const double expected = (1.0 + rank) + (1.0 + ((rank+nranks-1)%nranks)) * (ITERATIONS);
        if (actual - expected > 1e-10 || expected - actual > 1e-10) {
          printf("%d: Data validation failed at [%d, %d] expected=%f actual=%f\n",
              rank, j, i, expected, actual);
          errors++;
          fflush(stdout);
        }
      }
    }
    MPI_Win_unlock(rank, buf_win);

    MPI_Win_free(&buf_win);
    MPI_Free_mem(buffer);
    MPI_Free_mem(src_buf);

    MPI_Finalize();

    if (errors == 0) {
      printf("%d: Success\n", rank);
      return 0;
    } else {
      printf("%d: Fail\n", rank);
      return 1;
    }
}
Example no. 19
void run_test(int lock_mode, int lock_assert)
{
    int nproc, test_iter, target_rank, data_size;
    int *buf, *win_buf;
    MPI_Win win;

    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nproc);

    if (rank == 0 && verbose) {
        printf("Starting one-sided contiguous performance test with %d processes\n", nproc);

        printf("Synchronization mode: ");

        switch (lock_mode) {
        case MPI_LOCK_EXCLUSIVE:
            printf("Exclusive lock");
            break;
        case MPI_LOCK_SHARED:
            printf("Shared lock");
            break;
        default:
            printf("Unknown lock");
            break;
        }

        if (lock_assert & MPI_MODE_NOCHECK)
            printf(", MPI_MODE_NOCHECK");

        printf("\n");
    }

    MPI_Alloc_mem(MAX_DATA_SIZE, MPI_INFO_NULL, &buf);
    MPI_Alloc_mem(MAX_DATA_SIZE, MPI_INFO_NULL, &win_buf);
    memset(buf, rank, MAX_DATA_SIZE);
    memset(win_buf, rank, MAX_DATA_SIZE);
    MPI_Win_create(win_buf, MAX_DATA_SIZE, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &win);

    if (rank == 0 && verbose)
        printf("%12s %12s %12s %12s %12s %12s %12s %12s\n", "Trg. Rank", "Xfer Size",
               "Get (usec)", "Put (usec)", "Acc (usec)",
               "Get (MiB/s)", "Put (MiB/s)", "Acc (MiB/s)");

    for (target_rank = 0; rank == 0 && target_rank < nproc; target_rank++) {
        for (data_size = sizeof(double); data_size <= MAX_DATA_SIZE; data_size *= 2) {
            double t_get, t_put, t_acc;
            int num_iter = MAX_NUM_ITERATIONS;

            /* Scale the number of iterations by log_2 of the data size, so
             * that we run each test for a reasonable amount of time. */
            {
                int t = data_size, my_log2 = 0;
                while (t >>= 1)
                    my_log2++;
                if (my_log2)
                    num_iter = (num_iter / my_log2 < MIN_NUM_ITERATIONS) ?
                        MIN_NUM_ITERATIONS : num_iter / my_log2;
            }

            for (test_iter = 0; test_iter < num_iter + NUM_WARMUP_ITER; test_iter++) {
                if (test_iter == NUM_WARMUP_ITER)
                    t_get = MPI_Wtime();

                MPI_Win_lock(lock_mode, target_rank, lock_assert, win);
                MPI_Get(buf, data_size, MPI_BYTE, target_rank, 0, data_size, MPI_BYTE, win);
                MPI_Win_unlock(target_rank, win);
            }
            t_get = (MPI_Wtime() - t_get) / num_iter;

            for (test_iter = 0; test_iter < num_iter + NUM_WARMUP_ITER; test_iter++) {
                if (test_iter == NUM_WARMUP_ITER)
                    t_put = MPI_Wtime();

                MPI_Win_lock(lock_mode, target_rank, lock_assert, win);
                MPI_Put(buf, data_size, MPI_BYTE, target_rank, 0, data_size, MPI_BYTE, win);
                MPI_Win_unlock(target_rank, win);
            }
            t_put = (MPI_Wtime() - t_put) / num_iter;

            for (test_iter = 0; test_iter < num_iter + NUM_WARMUP_ITER; test_iter++) {
                if (test_iter == NUM_WARMUP_ITER)
                    t_acc = MPI_Wtime();

                MPI_Win_lock(lock_mode, target_rank, lock_assert, win);
                MPI_Accumulate(buf, data_size / sizeof(int), MPI_INT, target_rank,
                               0, data_size / sizeof(int), MPI_INT, MPI_SUM, win);
                MPI_Win_unlock(target_rank, win);
            }
            t_acc = (MPI_Wtime() - t_acc) / num_iter;
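
            /* each sample above is a full lock/RMA-op/unlock epoch, so the reported
               per-operation times include lock acquisition and completion, not just
               the data transfer; the NUM_WARMUP_ITER leading iterations are excluded
               by restarting the timer */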

            if (rank == 0 && verbose)
                printf("%12d %12d %12.3f %12.3f %12.3f %12.3f %12.3f %12.3f\n", target_rank,
                       data_size, t_get * 1.0e6, t_put * 1.0e6, t_acc * 1.0e6,
                       data_size / (1024.0 * 1024.0) / t_get, data_size / (1024.0 * 1024.0) / t_put,
                       data_size / (1024.0 * 1024.0) / t_acc);
        }
    }

    MPI_Barrier(MPI_COMM_WORLD);

    MPI_Win_free(&win);
    MPI_Free_mem(win_buf);
    MPI_Free_mem(buf);
}
Example no. 20
int put_one_to_one(Test_time_result_type *times,int mes_length,int num_repeats)
{
    int i;
    int pair[2];

    int confirmation_flag;

    int send_proc,recv_proc;

    char *data_window=NULL;

    MPI_Win win;
    MPI_Status status;


    data_window=(char *)malloc(mes_length*sizeof(char));
    if(data_window==NULL)
    {
        printf("proc %d from %d: Can not allocate memory %d*sizeof(char)\n",comm_rank,comm_size,mes_length);
        MPI_Abort(MPI_COMM_WORLD,-1);
        return -1;
    }

    /*
     * MPI2 Window creation then all processes will
     * use this window in memory.
     */
    MPI_Win_create(data_window, mes_length*sizeof(char), sizeof(char), MPI_INFO_NULL, MPI_COMM_WORLD, &win);
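
    /* Scheduling protocol used below (comm_rank/comm_size, get_send_processor,
       get_recv_processor and real_put_one_to_one are defined elsewhere in this
       benchmark): rank 0 enumerates all ordered (sender, receiver) pairs, sends the
       pair to each nonzero participant, performs the measurement itself when it is
       the sender, and collects one confirmation integer per nonzero participant;
       a pair with send_proc == -1 tells the other ranks to terminate */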

    if(comm_rank==0)
    {
        for(i=0; i<comm_size*comm_size; i++)
        {
            send_proc=get_send_processor(i,comm_size);
            recv_proc=get_recv_processor(i,comm_size);

            pair[0]=send_proc;
            pair[1]=recv_proc;

            if(send_proc)
                MPI_Send(pair,2,MPI_INT,send_proc,1,MPI_COMM_WORLD);
            if(recv_proc)
                MPI_Send(pair,2,MPI_INT,recv_proc,1,MPI_COMM_WORLD);

            if(send_proc==0)
            {
                times[recv_proc]=real_put_one_to_one(mes_length,num_repeats,send_proc,recv_proc,&win);
            }

            if(send_proc)
            {
                MPI_Recv(&confirmation_flag,1,MPI_INT,send_proc,1,MPI_COMM_WORLD,&status);
            }

            if(recv_proc)
            {
                MPI_Recv(&confirmation_flag,1,MPI_INT,recv_proc,1,MPI_COMM_WORLD,&status);
            }

        } /* End for */
        pair[0]=-1;
        for(i=1; i<comm_size; i++)
            MPI_Send(pair,2,MPI_INT,i,1,MPI_COMM_WORLD);
    } /* end if comm_rank==0 */
    else
    {
        for( ; ; )
        {
            MPI_Recv(pair,2,MPI_INT,0,1,MPI_COMM_WORLD,&status);
            send_proc=pair[0];
            recv_proc=pair[1];

            if(send_proc==-1)
                break;
            if(send_proc==comm_rank)
                times[recv_proc]=real_put_one_to_one(mes_length,num_repeats,send_proc,recv_proc,&win);

            confirmation_flag=1;
            MPI_Send(&confirmation_flag,1,MPI_INT,0,1,MPI_COMM_WORLD);
        }
    } /* end else comm_rank==0 */

    MPI_Win_free(&win);
    free(data_window);

    return 0;
}
Example no. 21
int main(int argc, char *argv[]) 
{ 
    int rank, destrank, nprocs, A[SIZE2], B[SIZE2], i;
    MPI_Comm CommDeuce;
    MPI_Group comm_group, group;
    MPI_Win win;
    int errs = 0;

    MTest_Init(&argc,&argv); 
    MPI_Comm_size(MPI_COMM_WORLD,&nprocs); 
    MPI_Comm_rank(MPI_COMM_WORLD,&rank); 

    if (nprocs < 2) {
        printf("Run this program with 2 or more processes\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
    }

    MPI_Comm_split(MPI_COMM_WORLD, (rank < 2), rank, &CommDeuce);

    if (rank < 2)
    {
        MPI_Comm_group(CommDeuce, &comm_group);
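
        /* Roles in this test: rank 0 is origin only and creates a zero-size window,
           while rank 1 exposes B; within one start/complete access epoch rank 0 puts
           A[0..SIZE1-1] into B[0..SIZE1-1] and gets B[SIZE1..2*SIZE1-1], and rank 1's
           matching exposure epoch is post/wait; both sides then validate locally */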

        if (rank == 0) {
            for (i=0; i<SIZE2; i++) A[i] = B[i] = i;
            MPI_Win_create(NULL, 0, 1, MPI_INFO_NULL, CommDeuce, &win);
            destrank = 1;
            MPI_Group_incl(comm_group, 1, &destrank, &group);
            MPI_Win_start(group, 0, win);
            for (i=0; i<SIZE1; i++)
                MPI_Put(A+i, 1, MPI_INT, 1, i, 1, MPI_INT, win);
            for (i=0; i<SIZE1; i++)
                MPI_Get(B+i, 1, MPI_INT, 1, SIZE1+i, 1, MPI_INT, win);

            MPI_Win_complete(win);

            for (i=0; i<SIZE1; i++)
                if (B[i] != (-4)*(i+SIZE1)) {
                    printf("Get Error: B[i] is %d, should be %d\n", B[i], (-4)*(i+SIZE1));
                    errs++;
                }
        }
        else if (rank == 1) {
            for (i=0; i<SIZE2; i++) B[i] = (-4)*i;
            MPI_Win_create(B, SIZE2*sizeof(int), sizeof(int), MPI_INFO_NULL, CommDeuce, &win);
            destrank = 0;
            MPI_Group_incl(comm_group, 1, &destrank, &group);
            MPI_Win_post(group, 0, win);
            MPI_Win_wait(win);

            for (i=0; i<SIZE1; i++) {
                if (B[i] != i) {
                    printf("Put Error: B[i] is %d, should be %d\n", B[i], i);
                    errs++;
                }
            }
        }

        MPI_Group_free(&group);
        MPI_Group_free(&comm_group);
        MPI_Win_free(&win);
    }
    MPI_Comm_free(&CommDeuce);
    MTest_Finalize(errs);
    MPI_Finalize();
    return 0; 
} 
Example no. 22
int main(int argc, char **argv) {
    int           procid, nproc, i, j, my_nelem;
    int           pollint = 0;
    double        time;
    MPI_Win       llist_win;
    llist_ptr_t   head_ptr, tail_ptr;

    MPI_Init(&argc, &argv);

    MPI_Comm_rank(MPI_COMM_WORLD, &procid);
    MPI_Comm_size(MPI_COMM_WORLD, &nproc);

    MPI_Win_create_dynamic(MPI_INFO_NULL, MPI_COMM_WORLD, &llist_win);

    /* Process 0 creates the head node */
    if (procid == 0)
        head_ptr.disp = alloc_elem(procid, llist_win);

    /* Broadcast the head pointer to everyone */
    head_ptr.rank = 0;
    MPI_Bcast(&head_ptr.disp, 1, MPI_AINT, 0, MPI_COMM_WORLD);
    tail_ptr = head_ptr;

    /* All processes append NUM_ELEMS elements to the list; rank 0 has already
     * appended an element. */
    if (procid == 0)
        i = 1;
    else
        i = 0;
    my_nelem = NUM_ELEMS/nproc;
    if (procid < NUM_ELEMS % nproc)
        my_nelem++;

    MPI_Barrier(MPI_COMM_WORLD);
    time = MPI_Wtime();

    for ( ; i < my_nelem; i++) {
        llist_ptr_t new_elem_ptr;
        int success = 0;

        /* Create a new list element and register it with the window */
        new_elem_ptr.rank = procid;
        new_elem_ptr.disp = alloc_elem(procid, llist_win);

        /* Append the new node to the list.  This might take multiple attempts if
           others have already appended and our tail pointer is stale. */
        do {
            int flag;

            /* The tail is at my left neighbor, append my element. */
            if (tail_ptr.rank == (procid + nproc-1) % nproc)
            {
                if (verbose)
                    printf("%d: Appending to <%d, %p>\n", procid, tail_ptr.rank, (void*) tail_ptr.disp);

                MPI_Win_lock(MPI_LOCK_EXCLUSIVE, tail_ptr.rank, 0, llist_win);
#if USE_ACC
                MPI_Accumulate(&new_elem_ptr, sizeof(llist_ptr_t), MPI_BYTE, tail_ptr.rank,
                               (MPI_Aint) &(((llist_elem_t*)tail_ptr.disp)->next), sizeof(llist_ptr_t),
                               MPI_BYTE, MPI_REPLACE, llist_win);
#else
                MPI_Put(&new_elem_ptr, sizeof(llist_ptr_t), MPI_BYTE, tail_ptr.rank,
                        (MPI_Aint) &(((llist_elem_t*)tail_ptr.disp)->next), sizeof(llist_ptr_t),
                        MPI_BYTE, llist_win);
#endif
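                /* with USE_ACC the pointer update uses MPI_Accumulate with
                   MPI_REPLACE, which is element-wise atomic with respect to other
                   accumulate operations on the same location; under the exclusive
                   lock used here a plain MPI_Put works too, but the accumulate path
                   is the one that remains correct if shared locks were used instead */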
                MPI_Win_unlock(tail_ptr.rank, llist_win);

                success = 1;
                tail_ptr = new_elem_ptr;
            }

            /* Otherwise, chase the tail. */
            else
            {
                llist_ptr_t next_tail_ptr;

                MPI_Win_lock(MPI_LOCK_EXCLUSIVE, tail_ptr.rank, 0, llist_win);
#if USE_ACC
                MPI_Get_accumulate( NULL, 0, MPI_DATATYPE_NULL, &next_tail_ptr,
                                    sizeof(llist_ptr_t), MPI_BYTE, tail_ptr.rank,
                                    (MPI_Aint) &(((llist_elem_t*)tail_ptr.disp)->next),
                                    sizeof(llist_ptr_t), MPI_BYTE, MPI_NO_OP, llist_win);
#else
                MPI_Get(&next_tail_ptr, sizeof(llist_ptr_t), MPI_BYTE, tail_ptr.rank,
                        (MPI_Aint) &(((llist_elem_t*)tail_ptr.disp)->next),
                        sizeof(llist_ptr_t), MPI_BYTE, llist_win);
#endif
                MPI_Win_unlock(tail_ptr.rank, llist_win);

                if (next_tail_ptr.rank != nil.rank) {
                    if (verbose)
                        printf("%d: Chasing to <%d, %p>\n", procid, next_tail_ptr.rank, (void*) next_tail_ptr.disp);
                    tail_ptr = next_tail_ptr;
                    pollint = MAX(MIN_NPROBE, pollint/2);
                }
                else {
                    for (j = 0; j < pollint; j++)
                        MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &flag, MPI_STATUS_IGNORE);

                    pollint = MIN(MAX_NPROBE, pollint*2);
                }
            }
        } while (!success);
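
        /* Notes on the append loop above: a rank appends only when the current tail
           element was written by its left neighbor; otherwise it chases the tail by
           reading the tail's next pointer (atomically via MPI_Get_accumulate with
           MPI_NO_OP when USE_ACC) and either advances to it or, if it is still nil,
           backs off by polling MPI_Iprobe, doubling the poll interval up to
           MAX_NPROBE so MPI progress is still driven while waiting */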
    }

    MPI_Barrier(MPI_COMM_WORLD);
    time = MPI_Wtime() - time;

    /* Traverse the list and verify that all processes inserted exactly the correct
       number of elements. */
    if (procid == 0) {
        int  errors    = 0;
        int *counts, count = 0;

        counts = (int*) malloc(sizeof(int) * nproc);
        assert(counts != NULL);

        for (i = 0; i < nproc; i++)
            counts[i] = 0;

        tail_ptr = head_ptr;

        MPI_Win_lock_all(0, llist_win);

        /* Walk the list and tally up the number of elements inserted by each rank */
        while (tail_ptr.disp != nil.disp) {
            llist_elem_t elem;

            MPI_Get(&elem, sizeof(llist_elem_t), MPI_BYTE,
                    tail_ptr.rank, tail_ptr.disp, sizeof(llist_elem_t), MPI_BYTE, llist_win);

            MPI_Win_flush(tail_ptr.rank, llist_win);

            tail_ptr = elem.next;

            assert(elem.value >= 0 && elem.value < nproc);
            counts[elem.value]++;
            count++;

            if (verbose) {
                int last_elem = tail_ptr.disp == nil.disp;
                printf("%2d%s", elem.value, last_elem ? "" : " -> ");
                if (count % ELEM_PER_ROW == 0 && !last_elem)
                    printf("\n");
            }
        }

        MPI_Win_unlock_all(llist_win);

        if (verbose)
          printf("\n\n");

        /* Verify the counts we collected */
        for (i = 0; i < nproc; i++) {
            int expected;

            expected = NUM_ELEMS/nproc;
            if (i < NUM_ELEMS % nproc)
                expected++;

            if (counts[i] != expected) {
                printf("Error: Rank %d inserted %d elements, expected %d\n", i, counts[i], expected);
                errors++;
            }
        }

        printf("%s\n", errors == 0 ? " No Errors" : "FAIL");
        free(counts);
    }

    if (print_perf) {
        double max_time;

        MPI_Reduce(&time, &max_time, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);

        if (procid == 0) {
            printf("Total time = %0.2f sec, elem/sec = %0.2f, sec/elem = %0.2f usec\n", max_time, NUM_ELEMS/max_time, max_time/NUM_ELEMS*1.0e6);
        }
    }

    MPI_Win_free(&llist_win);

    /* Free all the elements in the list */
    for ( ; my_elems_count > 0; my_elems_count--)
        MPI_Free_mem(my_elems[my_elems_count-1]);

    MPI_Finalize();
    return 0;
}