/*
 * Collective allocation: every caller obtains a local buffer and the base
 * addresses are exchanged through A1D_Allgather into ptrs.
 *
 *   ptrs  - caller-provided pointer vector that receives the gathered
 *           addresses
 *   bytes - number of bytes requested by this caller
 *
 * On __CRAYXE builds the size actually allocated is the value produced by
 * A1D_Allreduce_max32 (presumably the maximum request over all callers, so
 * that every process allocates the same amount -- confirm against that
 * helper) and the memory comes from the DMAPP symmetric heap.  On other
 * builds nothing is allocated and a NULL address is what gets gathered.
 *
 * Always returns 0; a failed symmetric-heap allocation trips an assert.
 */
int A1D_Allocate_shared(void * ptrs[], int bytes)
{
    void * local_base = NULL;   /* this process's buffer; stays NULL off Cray XE */
    int    largest    = 0;      /* reduced allocation size                      */

#ifdef DEBUG_FUNCTION_ENTER_EXIT
    fprintf(stderr,"entering A1D_Allocate_shared(void* ptrs[], int bytes) \n");
#endif

    A1D_Barrier();

#ifdef __CRAYXE
    A1D_Allreduce_max32( bytes, &largest );

    /* symmetric-heap allocation sized by the reduced value */
    local_base = dmapp_sheap_malloc( (size_t)largest );
    assert(local_base!=NULL);
#endif

    /* publish every base address into the caller's pointer vector */
    A1D_Allgather( &local_base, ptrs, sizeof(void*) );

#ifdef DEBUG_FUNCTION_ENTER_EXIT
    fprintf(stderr,"exiting A1D_Allocate_shared(void* ptrs[], int bytes) \n");
#endif

    return(0);
}
/*
 * Communicator-scoped variant of the collective allocation: the callers
 * synchronize on comm, obtain a local buffer and exchange the base addresses
 * through A1D_Allgather into ptrs.
 *
 *   comm  - MPI communicator used for the barrier and passed on to the
 *           A1D reduction / gather helpers
 *   ptrs  - caller-provided pointer vector that receives the gathered
 *           addresses
 *   bytes - number of bytes requested by this caller
 *
 * On __CRAYXE builds the allocation size is the value produced by
 * A1D_Allreduce_max32 and the memory comes from the DMAPP symmetric heap.
 * On other builds nothing is allocated and a NULL address is gathered.
 *
 * Always returns 0; a failed barrier or allocation trips an assert.
 */
int A1D_Allocate_comm(MPI_Comm comm, void * ptrs[], int bytes)
{
    int    barrier_rc = MPI_SUCCESS;
    void * local_base = NULL;   /* this process's buffer; stays NULL off Cray XE */
    int    largest    = 0;      /* reduced allocation size                      */

#ifdef DEBUG_FUNCTION_ENTER_EXIT
    fprintf(stderr,"entering A1D_Allocate_comm(MPI_Comm comm, void * ptrs[], int bytes)\n");
#endif

    barrier_rc = MPI_Barrier(comm);
    assert(barrier_rc==0);

#ifdef __CRAYXE
    A1D_Allreduce_max32(comm, bytes, &largest );

    /* symmetric-heap allocation sized by the reduced value */
    local_base = dmapp_sheap_malloc( (size_t)largest );
    assert(local_base!=NULL);
#endif

    /* publish every base address into the caller's pointer vector */
    A1D_Allgather(comm, &local_base, ptrs, sizeof(void*) );

#ifdef DEBUG_FUNCTION_ENTER_EXIT
    fprintf(stderr,"exiting A1D_Allocate_comm(MPI_Comm comm, void * ptrs[], int bytes)\n");
#endif

    return(0);
}
#ifdef __CRAYXE
/*
 * Time a series of dmapp_put calls from this PE to PE 1 and report the
 * bandwidth of each on stderr.
 *
 *   pe        - rank of the calling PE (used only as a prefix in the output)
 *   max       - size of the source/target buffers in bytes
 *   target    - destination address handed to dmapp_put
 *   source    - local source buffer
 *   seg       - segment descriptor handed to dmapp_put for the target
 *   type      - DMAPP element type used for the transfer
 *   width     - size in bytes of one element of that type
 *   type_name - spelling of the type for the header line
 *
 * The element count starts at 1 and doubles while it stays below max/width,
 * so no transfer exceeds the buffer.
 */
static void put_bandwidth_sweep(int pe, int max, char * target, char * source,
                                dmapp_seg_desc_t * seg, dmapp_type_t type,
                                int width, const char * type_name)
{
    int i;
    double t0, t1, dt, bw;
    dmapp_return_t status;

    fprintf(stderr,"%d: max = %d bytes, dmapp_put using %s \n", pe, max, type_name);

    for (i=1; i<(max/width); i*=2)
    {
        t0 = MPI_Wtime();
        status = dmapp_put(target, seg, 1, source, i, type);
        t1 = MPI_Wtime();
        assert(status==DMAPP_RC_SUCCESS);

        dt = t1-t0;
        bw = width * 1e-6 * (double)i / dt;   /* bytes -> MB, per second */
        fprintf(stderr,"%d: %12d bytes %12lf seconds = %lf MB/s \n", pe, width*i, dt, bw);
    }
}
#endif

/*
 * dmapp_put bandwidth test.
 *
 * usage: <program> [count]
 *
 * count (default 1000000) is multiplied by 16 to give the buffer size in
 * bytes.  PE 0 then times puts to PE 1 for each of the DQW, QW, DW and BYTE
 * element types and prints the results to stderr.  The whole body is compiled
 * only when __CRAYXE is defined.
 *
 * Returns 0 on success and 1 when count is not an integer in the accepted
 * range.  With fewer than two PEs the put tests are skipped, because there is
 * no PE 1 to write to.
 */
int main(int argc,char **argv)
{
#ifdef __CRAYXE
    /* Largest accepted count: keeps both count*16 and the doubling loop
       counter of the sweeps (at most 2^30) inside the range of a 32-bit int. */
    enum { MAX_COUNT = 1 << 26 };

    int max = 1000000;
    int pe = -1;
    int npes = -1;
    int have_peer = 0;
    char * source = NULL;
    char * target = NULL;
    dmapp_return_t status;
    dmapp_rma_attrs_ext_t dmapp_config_in, dmapp_config_out;
    dmapp_jobinfo_t job;
    dmapp_seg_desc_t * seg = NULL;

    MPI_Init(&argc, &argv);

    /* Initialize DMAPP resources before executing any other DMAPP calls. */
    dmapp_config_in.max_outstanding_nb   = DMAPP_DEF_OUTSTANDING_NB; /*  512 */
    dmapp_config_in.offload_threshold    = DMAPP_OFFLOAD_THRESHOLD;  /* 4096 */
    dmapp_config_in.put_relaxed_ordering = DMAPP_ROUTING_ADAPTIVE;
    dmapp_config_in.get_relaxed_ordering = DMAPP_ROUTING_ADAPTIVE;
    dmapp_config_in.max_concurrency      = 1; /* not thread-safe */
    dmapp_config_in.PI_ordering          = DMAPP_PI_ORDERING_RELAXED;

    status = dmapp_init_ext( &dmapp_config_in, &dmapp_config_out );
    assert(status==DMAPP_RC_SUCCESS);

    /* Validate the requested count before it is used as an allocation size;
       every PE sees the same argv, so all of them take the same branch. */
    if (argc>1)
    {
        char * end = NULL;
        long requested = strtol(argv[1], &end, 10);

        if (end==argv[1] || *end!='\0' || requested<1 || requested>MAX_COUNT)
        {
            fprintf(stderr,"invalid element count '%s': expected an integer in [1,%d]\n",
                    argv[1], (int)MAX_COUNT);
            (void)dmapp_finalize();
            MPI_Finalize();
            return(1);
        }
        max = (int)requested;
    }
    max *= 16; /* max must be a multiple of 16 for the test to work */

    /* Allocate remotely accessible memory for source and target buffers.
       Only memory in the data segment or the sheap is remotely accessible.
       Here we allocate from the sheap. */
    source = (char *)dmapp_sheap_malloc( (size_t)max*sizeof(char) );
    target = (char *)dmapp_sheap_malloc( (size_t)max*sizeof(char) );
    assert( (source!=NULL) && (target!=NULL));

    memset (source,'S',(size_t)max);
    memset (target,'T',(size_t)max);

    /* Retrieve information about job details, such as PE id and number of PEs. */
    status = dmapp_get_jobinfo(&job);
    assert(status==DMAPP_RC_SUCCESS);

    pe = job.pe;
    npes = job.npes;

    /* Retrieve information about RMA attributes, such as offload_threshold
       and routing modes. */
    status = dmapp_get_rma_attrs_ext(&dmapp_config_out);
    assert(status==DMAPP_RC_SUCCESS);

    /* Specify in which segment the remote memory region (the source) lies.
       In this case, it is the sheap (see above). */
    seg = &(job.sheap_seg);

    if (pe == 0)
        fprintf(stderr," Hello from PE %d of %d, using seg start %p, seg size 0x%lx, offload_threshold %d \n",
                pe, npes, seg->addr, (unsigned long)seg->len, dmapp_config_out.offload_threshold);
    fflush(stderr);
    PMI_Barrier();

    /* All puts go to PE 1, which only exists when there are at least two PEs. */
    have_peer = (npes > 1);
    if (pe == 0 && !have_peer)
        fprintf(stderr,"%d: fewer than 2 PEs, skipping the dmapp_put tests \n", pe);

    if (pe == 0 && have_peer)
        put_bandwidth_sweep(pe, max, target, source, seg, DMAPP_DQW, 16, "DMAPP_DQW");
    fflush(stderr);
    PMI_Barrier();

    if (pe == 0 && have_peer)
        put_bandwidth_sweep(pe, max, target, source, seg, DMAPP_QW, 8, "DMAPP_QW");
    fflush(stderr);
    PMI_Barrier();

    if (pe == 0 && have_peer)
        put_bandwidth_sweep(pe, max, target, source, seg, DMAPP_DW, 4, "DMAPP_DW");
    fflush(stderr);
    PMI_Barrier();

    if (pe == 0 && have_peer)
        put_bandwidth_sweep(pe, max, target, source, seg, DMAPP_BYTE, 1, "DMAPP_BYTE");
    fflush(stderr);
    PMI_Barrier();

    /* Free buffers allocated from sheap. */
    dmapp_sheap_free(target);
    dmapp_sheap_free(source);

    /* Release DMAPP resources. This is a mandatory call. */
    status = dmapp_finalize();
    assert(status==DMAPP_RC_SUCCESS);

    MPI_Finalize();
#endif
    return(0);
}
int A1D_Initialize() { #ifdef DMAPPD_USES_MPI int mpi_initialized, mpi_provided; int mpi_status = MPI_SUCCESS; int namelen; char procname[MPI_MAX_PROCESSOR_NAME]; #endif #ifdef __CRAYXE int pmi_status = PMI_SUCCESS; int nodeid = -1; rca_mesh_coord_t rca_xyz; dmapp_return_t dmapp_status = DMAPP_RC_SUCCESS; dmapp_rma_attrs_ext_t dmapp_config_in, dmapp_config_out; dmapp_jobinfo_t dmapp_info; dmapp_pe_t dmapp_rank = -1; int dmapp_size = -1; #endif int sheapflag = 0; #ifdef DEBUG_FUNCTION_ENTER_EXIT fprintf(stderr,"entering A1D_Initialize() \n"); #endif #ifdef DMAPPD_USES_MPI /*************************************************** * * configure MPI * ***************************************************/ /* MPI has to be Initialized for this implementation to work */ MPI_Initialized(&mpi_initialized); assert(mpi_initialized==1); /* MPI has to tolerate threads because A1 supports them */ MPI_Query_thread(&mpi_provided); //assert(mpi_provided>MPI_THREAD_SINGLE); /* have to use our own communicator for collectives to be proper */ mpi_status = MPI_Comm_dup(MPI_COMM_WORLD,&A1D_COMM_WORLD); assert(mpi_status==0); /* get my MPI rank */ mpi_status = MPI_Comm_rank(A1D_COMM_WORLD,&mpi_rank); assert(mpi_status==0); /* get MPI world size */ mpi_status = MPI_Comm_size(A1D_COMM_WORLD,&mpi_size); assert(mpi_status==0); /* in a perfect world, this would provide topology information like BG */ MPI_Get_processor_name( procname, &namelen ); printf( "%d: MPI_Get_processor_name = %s\n" , mpi_rank, procname ); fflush( stdout ); /* barrier to make sure MPI is ready everywhere */ mpi_status = MPI_Barrier(A1D_COMM_WORLD); assert(mpi_status==0); #endif #ifdef __CRAYXE /*************************************************** * * query topology * ***************************************************/ PMI_Get_nid( mpi_rank, &nodeid ); assert(pmi_status==PMI_SUCCESS); rca_get_meshcoord((uint16_t)nodeid, &rca_xyz); printf("%d: rca_get_meshcoord returns (%2u,%2u,%2u)\n", mpi_rank, rca_xyz.mesh_x, rca_xyz.mesh_y, 
rca_xyz.mesh_z ); #endif #ifdef __CRAYXE /*************************************************** * * configure DMAPP * ***************************************************/ dmapp_config_in.max_outstanding_nb = DMAPP_DEF_OUTSTANDING_NB; /* 512 */ dmapp_config_in.offload_threshold = DMAPP_OFFLOAD_THRESHOLD; /* 4096 */ #ifdef DETERMINISTIC_ROUTING dmapp_config_in.put_relaxed_ordering = DMAPP_ROUTING_DETERMINISTIC; dmapp_config_in.get_relaxed_ordering = DMAPP_ROUTING_DETERMINISTIC; #else dmapp_config_in.put_relaxed_ordering = DMAPP_ROUTING_ADAPTIVE; dmapp_config_in.get_relaxed_ordering = DMAPP_ROUTING_ADAPTIVE; #endif dmapp_config_in.max_concurrency = 1; /* not thread-safe */ #ifdef FLUSH_IMPLEMENTED dmapp_config_in.PI_ordering = DMAPP_PI_ORDERING_RELAXED; #else dmapp_config_in.PI_ordering = DMAPP_PI_ORDERING_STRICT; #endif dmapp_status = dmapp_init_ext( &dmapp_config_in, &dmapp_config_out ); assert(dmapp_status==DMAPP_RC_SUCCESS); #ifndef FLUSH_IMPLEMENTED /* without strict PI ordering, we have to flush remote stores with a get packet to force global visibility */ assert( dmapp_config_out.PI_ordering == DMAPP_PI_ORDERING_STRICT); #endif dmapp_status = dmapp_get_jobinfo(&dmapp_info); assert(dmapp_status==DMAPP_RC_SUCCESS); dmapp_rank = dmapp_info.pe; dmapp_size = dmapp_info.npes; A1D_Sheap_desc = dmapp_info.sheap_seg; /* make sure PMI and DMAPP agree */ assert(mpi_rank==dmapp_rank); assert(mpi_size==dmapp_size); #endif /*************************************************** * * setup protocols * ***************************************************/ #ifdef FLUSH_IMPLEMENTED /* allocate Put list */ A1D_Put_flush_list = malloc( mpi_size * sizeof(int32_t) ); assert(A1D_Put_flush_list != NULL); #endif #ifdef __CRAYXE A1D_Acc_lock = dmapp_sheap_malloc( sizeof(int64_t) ); #endif A1D_Allreduce_issame64((size_t)A1D_Acc_lock, &sheapflag); assert(sheapflag==1); #ifdef DEBUG_FUNCTION_ENTER_EXIT fprintf(stderr,"exiting A1D_Initialize() \n"); #endif return(0); }
/*
 * dmapp_acswap_qw (atomic compare-and-swap) test.
 *
 * Every PE allocates two arrays of `size` longs (size = number of PEs) from
 * the symmetric heap and zeroes them.  Each PE then issues, for every other
 * PE i, a compare-and-swap on target[i] located on PE i with comperand 0 and
 * swaperand equal to its own rank; &source[i] is passed as the local address
 * argument.  After a barrier the PEs take turns printing their complete
 * target array.
 *
 * The whole body is compiled only when __CRAYXE is defined.  Returns 0; any
 * DMAPP failure trips an assert after its status name has been printed.
 */
int main(int argc, char **argv)
{
#ifdef __CRAYXE
    int i,j;
    int me = -1;
    int size = -1;
    dmapp_return_t status;
    dmapp_rma_attrs_t actual_args = { 0 }, rma_args = { 0 };
    dmapp_jobinfo_t job;
    dmapp_seg_desc_t *seg = NULL;

    /* Set the RMA parameters. */
    rma_args.put_relaxed_ordering = DMAPP_ROUTING_ADAPTIVE;
    rma_args.max_outstanding_nb = DMAPP_DEF_OUTSTANDING_NB;
    rma_args.offload_threshold = DMAPP_OFFLOAD_THRESHOLD;
    rma_args.max_concurrency = 1;

    /* Initialize DMAPP. */
    status = dmapp_init(&rma_args, &actual_args);
    assert(status==DMAPP_RC_SUCCESS);

    /* Get job related information. */
    status = dmapp_get_jobinfo(&job);
    assert(status==DMAPP_RC_SUCCESS);

    me = job.pe;
    size = job.npes;
    seg = &(job.sheap_seg);

    /* Allocate and initialize the source and target arrays. */
    long * source = (long *) dmapp_sheap_malloc( size * sizeof(long) );
    assert(source!=NULL);
    long * target = (long *) dmapp_sheap_malloc( size * sizeof(long) );
    assert(target!=NULL);

    for (i = 0; i < size; i++) source[i] = 0;
    for (i = 0; i < size; i++) target[i] = 0;

    /* Wait for all PEs to complete array initialization. */
    PMI_Barrier();

    /* compare-and-swap */
    //
    // dmapp_return_t dmapp_acswap_qw(
    //   IN void             *target_addr /* local memory */,
    //   IN void             *source_addr /* remote memory */,
    //   IN dmapp_seg_desc_t *source_seg  /* remote segment */,
    //   IN dmapp_pe_t        source_pe   /* remote rank */,
    //   IN int64_t           comperand,
    //   IN int64_t           swaperand);
    //
    for (i = 0; i < size; i++)
        if (i != me)
        {
            status = dmapp_acswap_qw(&source[i], &target[i], seg, (dmapp_pe_t)i, (int64_t)0, (int64_t)me);

            /* name the status before asserting so a failure is identifiable */
            if      (status==DMAPP_RC_SUCCESS)           printf("%d: DMAPP_RC_SUCCESS\n",me);
            else if (status==DMAPP_RC_INVALID_PARAM)     printf("%d: DMAPP_RC_INVALID_PARAM\n",me);
            else if (status==DMAPP_RC_ALIGNMENT_ERROR)   printf("%d: DMAPP_RC_ALIGNMENT_ERROR\n",me);
            else if (status==DMAPP_RC_NO_SPACE)          printf("%d: DMAPP_RC_NO_SPACE\n",me);
            else if (status==DMAPP_RC_TRANSACTION_ERROR) printf("%d: DMAPP_RC_TRANSACTION_ERROR\n",me);
            fflush(stdout);
            assert(status==DMAPP_RC_SUCCESS);
        }

    /* Wait for all PEs. */
    PMI_Barrier();

    /* see who won: one PE per round prints, the barrier keeps output ordered */
    for (i = 0; i < size; i++)
    {
        if (i==me)
        {
            /* index with j so each element of the array is printed once */
            for (j = 0; j < size; j++)
                printf("me = %d target[%d] = %ld\n", me, j, target[j] );
            printf("==========================================\n");
            fflush(stdout);
        }
        PMI_Barrier();
    }

    /* Free buffers allocated from sheap. */
    dmapp_sheap_free(target);
    dmapp_sheap_free(source);

    /* Finalize. */
    status = dmapp_finalize();
    assert(status==DMAPP_RC_SUCCESS);

#endif
    return(0);
}
/*
 * dmapp_get sample.
 *
 * Every PE allocates `source` and `target` arrays of 128 longs from the
 * symmetric heap, fills source with 0..127 and target with -9, and then gets
 * the source array of PE (npes - pe - 1) into its own target array.  After a
 * barrier the received values are compared against 0..127 and a PASSED /
 * FAILED line is written to stderr.
 *
 * The whole body is compiled only when __CRAYXE is defined.  Returns 0 on
 * completion (even if the data check failed); a failing DMAPP call prints a
 * message and exits with status 1.
 */
int main(int argc,char **argv)
{
#ifdef __CRAYXE
    int nelems = 128;
    int i;
    int pe = -1;
    int npes = -1;
    int fail_count = 0;
    long *source = NULL;
    long *target = NULL;
    dmapp_return_t status;
    dmapp_rma_attrs_t actual_args;
    dmapp_jobinfo_t job;
    dmapp_seg_desc_t *seg = NULL;

    /* Initialize DMAPP resources before executing any other DMAPP calls. */
    status = dmapp_init(NULL, &actual_args);
    if (status != DMAPP_RC_SUCCESS)
    {
        fprintf(stderr,"\n dmapp_init FAILED: %d\n", status);
        exit(1);
    }

    /* Allocate remotely accessible memory for source and target buffers.
       Only memory in the data segment or the sheap is remotely accessible.
       Here we allocate from the sheap. */
    source = (long *)dmapp_sheap_malloc(nelems*sizeof(long));
    target = (long *)dmapp_sheap_malloc(nelems*sizeof(long));
    if ((source == NULL) || (target == NULL))
    {
        fprintf(stderr,"\n dmapp_sheap_malloc FAILED\n");
        exit(1);
    }

    for (i=0; i<nelems; i++)
    {
        source[i] = i;
        target[i] = -9L;   /* sentinel that the get must overwrite */
    }

    /* Synchronize to make sure everyone's buffers are initialized before
       data transfer is started. */
    PMI_Barrier();

    /* Retrieve information about job details, such as PE id and number of PEs. */
    status = dmapp_get_jobinfo(&job);
    if (status != DMAPP_RC_SUCCESS)
    {
        fprintf(stderr,"\n dmapp_get_jobinfo FAILED: %d\n", status);
        exit(1);
    }
    pe = job.pe;
    npes = job.npes;

    /* Retrieve information about RMA attributes, such as offload_threshold
       and routing modes. */
    status = dmapp_get_rma_attrs(&actual_args);
    if (status != DMAPP_RC_SUCCESS)
    {
        fprintf(stderr,"\n dmapp_get_rma_attrs FAILED: %d\n", status);
        exit(1);
    }

    /* Specify in which segment the remote memory region (the source) lies.
       In this case, it is the sheap (see above). */
    seg = &(job.sheap_seg);

    fprintf(stderr," Hello from PE %d of %d, using seg start %p, seg size 0x%lx, offload_threshold %d\n",
            pe, npes, seg->addr, (unsigned long)seg->len, actual_args.offload_threshold);

    /* the local destination of the get is target, so that is the address
       reported as "local addr" */
    fprintf(stderr,"\n PE %d getting %d nelems from addr %p on PE %d to local addr %p",
            pe, nelems, (void *)source, npes-pe-1, (void *)target);

    /* Execute GET operation from remote memory region source on PE Y
       into local memory region target on PE X. */
    status = dmapp_get(target, source, seg, npes-pe-1, nelems, DMAPP_QW);
    if (status != DMAPP_RC_SUCCESS)
    {
        fprintf(stderr,"\n dmapp_get FAILED: %d\n", status);
        exit(1);
    }

    /* Synchronize before verifying the data. */
    PMI_Barrier();

    /* Verify data received in target buffer. */
    for (i=0; i<nelems; i++)
    {
        if (target[i] != i)
        {
            fprintf(stderr,"\n PE %d: target[%d] is %ld, should be %ld", pe, i, target[i], (long)i);
            fail_count++;
        }
    }

    if (fail_count == 0)
        fprintf(stderr,"\n dmapp_sample_get PASSED\n");
    else
        fprintf(stderr,"\n dmapp_sample_get FAILED: %d wrong values\n", fail_count);

    /* Free buffers allocated from sheap. */
    dmapp_sheap_free(target);
    dmapp_sheap_free(source);

    /* Release DMAPP resources. This is a mandatory call. */
    status = dmapp_finalize();
    if (status != DMAPP_RC_SUCCESS)
    {
        fprintf(stderr,"\n dmapp_finalize FAILED: %d\n", status);
        exit(1);
    }
#endif
    return(0);
}