Beispiel #1
0
/*
 * Collectively allocate a buffer and exchange its address with all peers.
 *
 * ptrs  : output vector; filled by A1D_Allgather with one pointer-sized
 *         entry per participant (presumably indexed by rank - confirm
 *         against A1D_Allgather).
 * bytes : size requested by the caller. On Cray XE builds the value handed
 *         to the allocator is the result of A1D_Allreduce_max32 over this
 *         argument (by its name, the maximum across ranks - confirm).
 *
 * When __CRAYXE is not defined nothing is allocated and the address that
 * gets exchanged is NULL.
 *
 * Always returns 0; an allocation failure trips an assert instead.
 */
int A1D_Allocate_shared(void * ptrs[], int bytes)
{
    int    largest_request = 0;
    void * local_base      = NULL;

#ifdef DEBUG_FUNCTION_ENTER_EXIT
    fprintf(stderr,"entering A1D_Allocate_shared(void* ptrs[], int bytes) \n");
#endif

    A1D_Barrier();

#ifdef __CRAYXE
    /* agree on one size, then carve it out of the symmetric heap */
    A1D_Allreduce_max32( bytes, &largest_request );

    local_base = dmapp_sheap_malloc( (size_t)largest_request );
    assert(NULL != local_base);
#endif

    /* publish this participant's base address and collect everyone else's */
    A1D_Allgather( &local_base, ptrs, sizeof(void*) );

#ifdef DEBUG_FUNCTION_ENTER_EXIT
    fprintf(stderr,"exiting A1D_Allocate_shared(void* ptrs[], int bytes) \n");
#endif

    return 0;
}
Beispiel #2
0
/*
 * Communicator-scoped variant of the collective allocation: synchronise on
 * `comm`, allocate locally, then exchange the local address over `comm`.
 *
 * comm  : MPI communicator used for the barrier and passed through to the
 *         A1D collectives.
 * ptrs  : output vector; filled by A1D_Allgather with one pointer-sized
 *         entry per participant (presumably indexed by rank in `comm` -
 *         confirm against A1D_Allgather).
 * bytes : size requested by the caller. On Cray XE builds the value handed
 *         to the allocator is the result of A1D_Allreduce_max32 over this
 *         argument (by its name, the maximum across `comm` - confirm).
 *
 * When __CRAYXE is not defined nothing is allocated and the address that
 * gets exchanged is NULL.
 *
 * Always returns 0; barrier or allocation failures trip an assert instead.
 */
int A1D_Allocate_comm(MPI_Comm comm, void * ptrs[], int bytes)
{
    int    barrier_rc      = MPI_SUCCESS;
    int    largest_request = 0;
    void * local_base      = NULL;

#ifdef DEBUG_FUNCTION_ENTER_EXIT
    fprintf(stderr,"entering A1D_Allocate_comm(MPI_Comm comm, void * ptrs[], int bytes)\n");
#endif

    barrier_rc = MPI_Barrier(comm);
    assert(0==barrier_rc);

#ifdef __CRAYXE
    /* agree on one size, then carve it out of the symmetric heap */
    A1D_Allreduce_max32(comm, bytes, &largest_request );

    local_base = dmapp_sheap_malloc( (size_t)largest_request );
    assert(NULL != local_base);
#endif

    /* publish this participant's base address and collect everyone else's */
    A1D_Allgather(comm, &local_base, ptrs, sizeof(void*) );

#ifdef DEBUG_FUNCTION_ENTER_EXIT
    fprintf(stderr,"exiting A1D_Allocate_comm(MPI_Comm comm, void * ptrs[], int bytes)\n");
#endif

    return 0;
}
Beispiel #3
0
#ifdef __CRAYXE
/*
 * Time a sweep of dmapp_put calls from this PE to PE 1 and print one line
 * per transfer size to stderr.
 *
 * pe        : rank of the caller, used only as a prefix in the output.
 * max       : size in bytes of the source/target buffers.
 * target    : destination address handed to dmapp_put.
 * source    : local source buffer.
 * seg       : segment descriptor handed to dmapp_put for the target.
 * type      : DMAPP element type to transfer.
 * type_name : spelling of `type` for the banner line.
 * width     : size in bytes of one element of `type`.
 *
 * The element count doubles each iteration while count*width stays below
 * `max`, so no transfer exceeds the buffers.
 */
static void put_sweep(int pe, int max, char * target, char * source,
                      dmapp_seg_desc_t * seg, dmapp_type_t type,
                      const char * type_name, int width)
{
    int i;

    fprintf(stderr,"%d: max = %d bytes, dmapp_put using %s \n", pe, max, type_name);
    for (i=1; i<(max/width); i*=2)
    {
        double         t0, t1, dt, bw;
        dmapp_return_t status;

        t0 = MPI_Wtime();
        status = dmapp_put(target, seg, 1, source, i, type);
        t1 = MPI_Wtime();
        assert(status==DMAPP_RC_SUCCESS);
        dt = t1-t0;
        bw = width * 1e-6 * (double)i / dt;
        fprintf(stderr,"%d: %12d bytes %12lf seconds = %lf MB/s \n", pe, width*i, dt, bw);
    }
}
#endif

/*
 * dmapp_put bandwidth test: PE 0 puts increasingly large messages to PE 1
 * using each DMAPP element width (DQW, QW, DW, BYTE) and reports the time
 * and bandwidth of every transfer on stderr.
 *
 * argv[1] (optional) is the buffer size in units of 16 bytes; the default
 * is 1000000. The whole body is compiled only when __CRAYXE is defined.
 *
 * Returns 0 on success, 1 if argv[1] is not a usable size.
 */
int main(int argc,char **argv)
{
#ifdef __CRAYXE
    int                max = 1000000;
    int                pe = -1;
    int                npes = -1;
    int                run_sweeps = 0;
    char *             source = NULL;
    char *             target = NULL;
    dmapp_return_t     status;
    /* zero-initialised so that any attribute not assigned below is not
       handed to dmapp_init_ext with an indeterminate value */
    dmapp_rma_attrs_ext_t dmapp_config_in = { 0 }, dmapp_config_out = { 0 };
    dmapp_jobinfo_t    job;
    dmapp_seg_desc_t * seg = NULL;

    MPI_Init(&argc, &argv);

    if (argc>1)
    {
        /* INT_MAX/32: keeps max (= 16*count) at or below INT_MAX/2, so
           neither `max *= 16` nor the doubling loop counter can overflow */
        const long limit     = (long)(((unsigned int)-1) >> 1) / 32;
        char *     end       = NULL;
        long       requested = strtol(argv[1], &end, 10);

        if (end==argv[1] || *end!='\0' || requested<1 || requested>limit)
        {
            fprintf(stderr,"invalid size \"%s\": expected an integer between 1 and %ld \n",
                    argv[1], limit);
            MPI_Abort(MPI_COMM_WORLD, 1);
            return(1);
        }
        max = (int)requested;
    }
    max *= 16; /* max must be a multiple of 16 for the test to work */

    /* Initialize DMAPP resources before executing any other DMAPP calls. */
    dmapp_config_in.max_outstanding_nb   = DMAPP_DEF_OUTSTANDING_NB; /*  512 */
    dmapp_config_in.offload_threshold    = DMAPP_OFFLOAD_THRESHOLD;  /* 4096 */

    dmapp_config_in.put_relaxed_ordering = DMAPP_ROUTING_ADAPTIVE;
    dmapp_config_in.get_relaxed_ordering = DMAPP_ROUTING_ADAPTIVE;

    dmapp_config_in.max_concurrency      = 1; /* not thread-safe */

    dmapp_config_in.PI_ordering          = DMAPP_PI_ORDERING_RELAXED;

    status = dmapp_init_ext( &dmapp_config_in, &dmapp_config_out );
    assert(status==DMAPP_RC_SUCCESS);

    /* Allocate remotely accessible memory for source and target buffers.
           Only memory in the data segment or the sheap is remotely accessible.
           Here we allocate from the sheap. */
    source = (char *)dmapp_sheap_malloc( max*sizeof(char) );
    target = (char *)dmapp_sheap_malloc( max*sizeof(char) );
    assert( (source!=NULL) && (target!=NULL));

    memset (source,'S',max);
    memset (target,'T',max);

    /* Retrieve information about job details, such as PE id and number of PEs. */
    status = dmapp_get_jobinfo(&job);
    assert(status==DMAPP_RC_SUCCESS);
    pe = job.pe;
    npes = job.npes;

    /* Retrieve information about RMA attributes, such as offload_threshold
           and routing modes. */
    status = dmapp_get_rma_attrs_ext(&dmapp_config_out);
    assert(status==DMAPP_RC_SUCCESS);

    /* Specify in which segment the remote memory region (the source) lies.
           In this case, it is the sheap (see above). */
    seg = &(job.sheap_seg);

    if (pe == 0) fprintf(stderr," Hello from PE %d of %d, using seg start %p, seg size 0x%lx, offload_threshold %d \n",
            pe, npes, seg->addr, (unsigned long)seg->len, dmapp_config_out.offload_threshold);

    /* Every put targets PE 1, so the sweeps only make sense with >= 2 PEs.
       The barriers below stay unconditional so all PEs remain in step. */
    run_sweeps = (pe == 0) && (npes > 1);
    if (pe == 0 && !run_sweeps)
        fprintf(stderr,"%d: need at least 2 PEs to run the dmapp_put test, have %d \n", pe, npes);

    fflush(stderr);
    PMI_Barrier();

    if (run_sweeps) put_sweep(pe, max, target, source, seg, DMAPP_DQW, "DMAPP_DQW", 16);
    fflush(stderr);
    PMI_Barrier();

    if (run_sweeps) put_sweep(pe, max, target, source, seg, DMAPP_QW, "DMAPP_QW", 8);
    fflush(stderr);
    PMI_Barrier();

    if (run_sweeps) put_sweep(pe, max, target, source, seg, DMAPP_DW, "DMAPP_DW", 4);
    fflush(stderr);
    PMI_Barrier();

    if (run_sweeps) put_sweep(pe, max, target, source, seg, DMAPP_BYTE, "DMAPP_BYTE", 1);
    fflush(stderr);
    PMI_Barrier();

    /* Free buffers allocated from sheap. */
    dmapp_sheap_free(target);
    dmapp_sheap_free(source);

    /* Release DMAPP resources. This is a mandatory call. */
    status = dmapp_finalize();
    assert(status==DMAPP_RC_SUCCESS);

    MPI_Finalize();
#endif
    return(0);
}
Beispiel #4
0
/*
 * Initialise the A1D device layer.
 *
 * Depending on the build flags this
 *   - DMAPPD_USES_MPI : checks that MPI is already initialised, duplicates
 *                       MPI_COMM_WORLD into A1D_COMM_WORLD, and records the
 *                       rank/size in mpi_rank/mpi_size;
 *   - __CRAYXE        : queries the node id and mesh coordinates, initialises
 *                       DMAPP, stores the symmetric-heap segment descriptor
 *                       in A1D_Sheap_desc and allocates A1D_Acc_lock from
 *                       the symmetric heap;
 *   - FLUSH_IMPLEMENTED : allocates A1D_Put_flush_list (contents are left
 *                       uninitialised here).
 *
 * A1D_COMM_WORLD, mpi_rank, mpi_size, A1D_Sheap_desc, A1D_Acc_lock and
 * A1D_Put_flush_list are not declared in this function (presumably
 * file-scope state of the device layer). mpi_rank/mpi_size are read by the
 * __CRAYXE sections even when DMAPPD_USES_MPI is not defined, so they must
 * be set elsewhere in that configuration.
 *
 * Returns 0. Every failure is reported through assert.
 */
int A1D_Initialize()
{

#ifdef DMAPPD_USES_MPI
    int mpi_initialized, mpi_provided;
    int mpi_status = MPI_SUCCESS;

    int namelen;
    char procname[MPI_MAX_PROCESSOR_NAME];
#endif

#ifdef __CRAYXE
    int                                 pmi_status  = PMI_SUCCESS;
    int                                 nodeid = -1;
    rca_mesh_coord_t                    rca_xyz;

    dmapp_return_t                      dmapp_status = DMAPP_RC_SUCCESS;

    /* zero-initialised so that any attribute not assigned below is not
       handed to dmapp_init_ext with an indeterminate value */
    dmapp_rma_attrs_ext_t               dmapp_config_in = { 0 }, dmapp_config_out = { 0 };

    dmapp_jobinfo_t                     dmapp_info;
    dmapp_pe_t                          dmapp_rank = -1;
    int                                 dmapp_size = -1;
#endif
    int                                 sheapflag = 0;

#ifdef DEBUG_FUNCTION_ENTER_EXIT
    fprintf(stderr,"entering A1D_Initialize() \n");
#endif

#ifdef DMAPPD_USES_MPI

    /***************************************************
     *
     * configure MPI
     *
     ***************************************************/

    /* MPI has to be Initialized for this implementation to work */
    MPI_Initialized(&mpi_initialized);
    assert(mpi_initialized==1);

    /* MPI has to tolerate threads because A1 supports them */
    MPI_Query_thread(&mpi_provided);
    //assert(mpi_provided>MPI_THREAD_SINGLE);

    /* have to use our own communicator for collectives to be proper */
    mpi_status = MPI_Comm_dup(MPI_COMM_WORLD,&A1D_COMM_WORLD);
    assert(mpi_status==MPI_SUCCESS);

    /* get my MPI rank */
    mpi_status = MPI_Comm_rank(A1D_COMM_WORLD,&mpi_rank);
    assert(mpi_status==MPI_SUCCESS);

    /* get MPI world size */
    mpi_status = MPI_Comm_size(A1D_COMM_WORLD,&mpi_size);
    assert(mpi_status==MPI_SUCCESS);

    /* in a perfect world, this would provide topology information like BG */
    MPI_Get_processor_name( procname, &namelen );
    printf( "%d: MPI_Get_processor_name = %s\n" , mpi_rank, procname );
    fflush( stdout );

    /* barrier to make sure MPI is ready everywhere */
    mpi_status = MPI_Barrier(A1D_COMM_WORLD);
    assert(mpi_status==MPI_SUCCESS);

#endif

#ifdef __CRAYXE

    /***************************************************
     *
     * query topology
     *
     ***************************************************/

    /* capture the return code: the assert below is meaningless otherwise */
    pmi_status = PMI_Get_nid( mpi_rank, &nodeid );
    assert(pmi_status==PMI_SUCCESS);

    rca_get_meshcoord((uint16_t)nodeid, &rca_xyz);
    printf("%d: rca_get_meshcoord returns (%2u,%2u,%2u)\n", mpi_rank, rca_xyz.mesh_x, rca_xyz.mesh_y, rca_xyz.mesh_z );

#endif

#ifdef __CRAYXE

    /***************************************************
     *
     * configure DMAPP
     *
     ***************************************************/

    dmapp_config_in.max_outstanding_nb   = DMAPP_DEF_OUTSTANDING_NB; /*  512 */
    dmapp_config_in.offload_threshold    = DMAPP_OFFLOAD_THRESHOLD;  /* 4096 */
#ifdef DETERMINISTIC_ROUTING
    dmapp_config_in.put_relaxed_ordering = DMAPP_ROUTING_DETERMINISTIC;
    dmapp_config_in.get_relaxed_ordering = DMAPP_ROUTING_DETERMINISTIC;
#else
    dmapp_config_in.put_relaxed_ordering = DMAPP_ROUTING_ADAPTIVE;
    dmapp_config_in.get_relaxed_ordering = DMAPP_ROUTING_ADAPTIVE;
#endif
    dmapp_config_in.max_concurrency      = 1; /* not thread-safe */
#ifdef FLUSH_IMPLEMENTED
    dmapp_config_in.PI_ordering          = DMAPP_PI_ORDERING_RELAXED;
#else
    dmapp_config_in.PI_ordering          = DMAPP_PI_ORDERING_STRICT;
#endif

    dmapp_status = dmapp_init_ext( &dmapp_config_in, &dmapp_config_out );
    assert(dmapp_status==DMAPP_RC_SUCCESS);

#ifndef FLUSH_IMPLEMENTED
    /* without strict PI ordering, we have to flush remote stores with a get packet to force global visibility */
    assert( dmapp_config_out.PI_ordering == DMAPP_PI_ORDERING_STRICT);
#endif

    dmapp_status = dmapp_get_jobinfo(&dmapp_info);
    assert(dmapp_status==DMAPP_RC_SUCCESS);

    dmapp_rank     = dmapp_info.pe;
    dmapp_size     = dmapp_info.npes;
    A1D_Sheap_desc = dmapp_info.sheap_seg;

    /* make sure PMI and DMAPP agree */
    assert(mpi_rank==dmapp_rank);
    assert(mpi_size==dmapp_size);

#endif

    /***************************************************
     *
     * setup protocols
     *
     ***************************************************/

#ifdef FLUSH_IMPLEMENTED
    /* allocate Put list */
    A1D_Put_flush_list = malloc( mpi_size * sizeof(int32_t) );
    assert(A1D_Put_flush_list != NULL);
#endif

#ifdef __CRAYXE
    A1D_Acc_lock = dmapp_sheap_malloc( sizeof(int64_t) );
    /* a NULL on every rank would otherwise satisfy the "is same" test below */
    assert(A1D_Acc_lock != NULL);
#endif

    /* the lock address is expected to be identical everywhere; going by its
       name A1D_Allreduce_issame64 sets the flag to 1 in that case (confirm) */
    A1D_Allreduce_issame64((size_t)A1D_Acc_lock, &sheapflag);
    assert(sheapflag==1);

#ifdef DEBUG_FUNCTION_ENTER_EXIT
    fprintf(stderr,"exiting A1D_Initialize() \n");
#endif

    return(0);
}
Beispiel #5
0
/*
 * DMAPP atomic compare-and-swap test.
 *
 * Every PE allocates two arrays of `size` longs (one slot per PE) from the
 * symmetric heap and zeroes them. Each PE then issues, for every other
 * PE i, a compare-and-swap on target[i] located on PE i, replacing 0 with
 * its own rank. After a barrier each PE prints its whole target array in
 * rank order.
 *
 * The whole body is compiled only when __CRAYXE is defined. Returns 0;
 * failures trip an assert.
 */
int main(int argc, char **argv)
{
#ifdef __CRAYXE
    int i,j;
    int me = -1;
    int size = -1;

    dmapp_return_t status;
    dmapp_rma_attrs_t actual_args = { 0 }, rma_args = { 0 };
    dmapp_jobinfo_t job;
    dmapp_seg_desc_t *seg = NULL;

    /* Set the RMA parameters. */
    rma_args.put_relaxed_ordering = DMAPP_ROUTING_ADAPTIVE;
    rma_args.max_outstanding_nb = DMAPP_DEF_OUTSTANDING_NB;
    rma_args.offload_threshold = DMAPP_OFFLOAD_THRESHOLD;
    rma_args.max_concurrency = 1;

    /* Initialize DMAPP. */
    status = dmapp_init(&rma_args, &actual_args);
    assert(status==DMAPP_RC_SUCCESS);

    /* Get job related information. */
    status = dmapp_get_jobinfo(&job);
    assert(status==DMAPP_RC_SUCCESS);

    me = job.pe;
    size = job.npes;
    seg = &(job.sheap_seg);

    /* Allocate and initialize the source and target arrays. */
    long * source = (long *) dmapp_sheap_malloc( size * sizeof(long) );
    assert(source!=NULL);
    long * target = (long *) dmapp_sheap_malloc( size * sizeof(long) );
    assert(target!=NULL);

    for (i = 0; i < size; i++) source[i] = 0;
    for (i = 0; i < size; i++) target[i] = 0;

    /* Wait for all PEs to complete array initialization. */
    PMI_Barrier();

    /* compare-and-swap */
    //
    // dmapp_return_t dmapp_acswap_qw(
    //   IN void             *target_addr /* local memory */,
    //   IN void             *source_addr /* remote memory */,
    //   IN dmapp_seg_desc_t *source_seg  /* remote segment */,
    //   IN dmapp_pe_t        source_pe   /* remote rank */,
    //   IN int64_t           comperand,
    //   IN int64_t           swaperand);
    //
    for (i = 0; i < size; i++)
        if (i != me)
        {
            status = dmapp_acswap_qw(&source[i], &target[i], seg, (dmapp_pe_t)i, (int64_t)0, (int64_t)me);
            if (status==DMAPP_RC_SUCCESS)                printf("%d: DMAPP_RC_SUCCESS\n",me);
            else if (status==DMAPP_RC_INVALID_PARAM)     printf("%d: DMAPP_RC_INVALID_PARAM\n",me);
            else if (status==DMAPP_RC_ALIGNMENT_ERROR)   printf("%d: DMAPP_RC_ALIGNMENT_ERROR\n",me);
            else if (status==DMAPP_RC_NO_SPACE)          printf("%d: DMAPP_RC_NO_SPACE\n",me);
            else if (status==DMAPP_RC_TRANSACTION_ERROR) printf("%d: DMAPP_RC_TRANSACTION_ERROR\n",me);
            else                                         printf("%d: unexpected DMAPP status %d\n",me,(int)status);
            /* flush before the assert so the status name is visible on abort */
            fflush(stdout);
            assert(status==DMAPP_RC_SUCCESS);
        }

    /* Wait for all PEs. */
    PMI_Barrier();

    /* see who won: PEs take turns (one per outer iteration) so the output
       of different PEs is not interleaved */
    for (i = 0; i < size; i++)
    {
        if (i==me)
        {
            /* index with j: walk the whole local array, not one element */
            for (j = 0; j < size; j++) printf("me = %d target[%d] = %ld\n", me, j, target[j] );
            printf("==========================================\n");
            fflush(stdout);
        }
        PMI_Barrier();
    }

    /* Free buffers allocated from sheap. */
    dmapp_sheap_free(target);
    dmapp_sheap_free(source);

    /* Finalize. */
    status = dmapp_finalize();
    assert(status==DMAPP_RC_SUCCESS);

#endif
    return(0);
}
Beispiel #6
0
/*
 * DMAPP get sample.
 *
 * Every PE fills a symmetric-heap `source` array with 0..nelems-1 and a
 * `target` array with -9, then fetches `source` from PE (npes-pe-1) into
 * its local `target` with dmapp_get and checks that target[i] == i.
 * The result (PASSED / FAILED with the number of wrong values) is written
 * to stderr.
 *
 * The whole body is compiled only when __CRAYXE is defined. Returns 0 when
 * the run completes (also when the data check fails); exits with status 1
 * if a DMAPP call or the allocation fails.
 */
int main(int argc,char **argv)
{
#ifdef __CRAYXE
        int               nelems = 128;
        int               i;
        int               pe = -1;
        int               npes = -1;
        int               fail_count = 0;
        long              *source = NULL;
        long              *target = NULL;
        dmapp_return_t    status;
        dmapp_rma_attrs_t actual_args;
        dmapp_jobinfo_t   job;
        dmapp_seg_desc_t  *seg = NULL;

        /* Initialize DMAPP resources before executing any other DMAPP calls. */
        status = dmapp_init(NULL, &actual_args);
        if (status != DMAPP_RC_SUCCESS) {
                fprintf(stderr,"\n dmapp_init FAILED: %d\n", status);
                exit(1);
        }

        /* Allocate remotely accessible memory for source and target buffers.
           Only memory in the data segment or the sheap is remotely accessible.
           Here we allocate from the sheap. */
        source = (long *)dmapp_sheap_malloc(nelems*sizeof(long));
        target = (long *)dmapp_sheap_malloc(nelems*sizeof(long));
        if ((source == NULL) || (target == NULL)) {
                fprintf(stderr,"\n dmapp_sheap_malloc FAILED\n");
                exit(1);
        }

        /* source holds the expected values; target holds a sentinel that
           the get must overwrite */
        for (i=0; i<nelems; i++) {
                source[i] = i;
                target[i] = -9L;
        }

        /* Synchronize to make sure everyone's buffers are initialized before
           data transfer is started. */
        PMI_Barrier();

        /* Retrieve information about job details, such as PE id and number of PEs. */
        status = dmapp_get_jobinfo(&job);
        if (status != DMAPP_RC_SUCCESS) {
                fprintf(stderr,"\n dmapp_get_jobinfo FAILED: %d\n", status);
                exit(1);
        }
        pe = job.pe;
        npes = job.npes;

        /* Retrieve information about RMA attributes, such as offload_threshold
           and routing modes. */
        status = dmapp_get_rma_attrs(&actual_args);
        if (status != DMAPP_RC_SUCCESS) {
                fprintf(stderr,"\n dmapp_get_rma_attrs FAILED: %d\n", status);
                exit(1);
        }

        /* Specify in which segment the remote memory region (the source) lies.
           In this case, it is the sheap (see above). */
        seg = &(job.sheap_seg);

        fprintf(stderr," Hello from PE %d of %d, using seg start %p, seg size 0x%lx, offload_threshold %d\n",
                pe, npes, seg->addr, (unsigned long)seg->len, actual_args.offload_threshold);

        /* the local destination of the get is target, so report that address */
        fprintf(stderr,"\n PE %d getting %d nelems from addr %p on PE %d to local addr %p",
                pe, nelems, (void *)source, npes-pe-1, (void *)target);

        /* Execute GET operation from remote memory region source on PE Y 
           into local memory region target on PE X. */
        status = dmapp_get(target, source, seg, npes-pe-1, nelems, DMAPP_QW);
        if (status != DMAPP_RC_SUCCESS) {
                fprintf(stderr,"\n dmapp_get FAILED: %d\n", status);
                exit(1);
        }

        /* Synchronize before verifying the data. */
        PMI_Barrier();

        /* Verify data received in target buffer. */
        for (i=0; i<nelems; i++) {
                if (target[i] != i) {
                        fprintf(stderr,"\n PE %d: target[%d] is %ld, should be %ld",
                                pe, i, target[i], (long)i);
                        fail_count++;
                }
        }
        if (fail_count == 0)
                fprintf(stderr,"\n dmapp_sample_get PASSED\n");
        else
                fprintf(stderr,"\n dmapp_sample_get FAILED: %d wrong values\n",
                        fail_count);

        /* Free buffers allocated from sheap. */
        dmapp_sheap_free(target);
        dmapp_sheap_free(source);

        /* Release DMAPP resources. This is a mandatory call. */
        status = dmapp_finalize();
        if (status != DMAPP_RC_SUCCESS) {
                fprintf(stderr,"\n dmapp_finalize FAILED: %d\n", status);
                exit(1);
        }
#endif
        return(0);
}