/** * \brief Global fence operation. * * Blocks until all active messages between the local node and all remote * nodes have completed and acknowledged by the remote node. * * \see ARMCIX_Fence * \see ARMCIX_DCMF_ReceiveFenceRequest * \see ARMCIX_DCMF_ReceiveFenceAck */ void ARMCIX_AllFence () { DCMF_CriticalSection_enter (0); unsigned size = DCMF_Messager_size (); unsigned peer; volatile unsigned active = 0; DCQuad quad; DCMF_Callback_t * cb = (DCMF_Callback_t *) &quad; cb->function = ARMCIX_DCMF_cb_decrement; cb->clientdata = (void *) &active; DCMF_Callback_t cb_null = { NULL, NULL }; DCMF_Callback_t cb_done = { (void (*)(void *, DCMF_Error_t *))ARMCIX_DCMF_request_free, NULL }; for (peer = 0; peer < size; peer++) { ARMCIX_DCMF_Request_t * new_request = ARMCIX_DCMF_request_allocate (cb_null); cb_done.clientdata = new_request; active++; DCMF_Send ( &__fence_rts_protocol, &(new_request->request), cb_done, DCMF_SEQUENTIAL_CONSISTENCY, peer, 0, NULL, (DCQuad *) &quad, 1); while (active) DCMF_Messager_advance (); } DCMF_CriticalSection_exit (0); }
int main() { int i, rank, nranks, msgsize, status, expected; long bufsize; int *src_buffer; int *trg_buffer; unsigned *ranks; DCMF_Result dcmf_result; DCMF_CollectiveProtocol_t barrier_protocol, lbarrier_protocol; DCMF_CollectiveProtocol_t allreduce_protocol, allreduce_notree_protocol; DCMF_Barrier_Configuration_t barrier_conf; DCMF_Allreduce_Configuration_t allreduce_conf; DCMF_CollectiveRequest_t crequest, crequest1, crequest2; DCMF_Callback_t done_callback; volatile unsigned allreduce_active = 0; DCMF_Messager_initialize(); dcmf_result = DCMF_Collective_initialize(); assert(dcmf_result == DCMF_SUCCESS); rank = DCMF_Messager_rank(); nranks = DCMF_Messager_size(); ranks = (unsigned *) malloc(nranks * sizeof(int)); for(i=0; i<nranks; i++) ranks[i] = i; bufsize = MAX_MSG_SIZE; src_buffer = (int *) malloc(bufsize); trg_buffer = (int *) malloc(bufsize); barrier_conf.protocol = DCMF_GI_BARRIER_PROTOCOL; barrier_conf.cb_geometry = getGeometry; dcmf_result = DCMF_Barrier_register(&barrier_protocol, &barrier_conf); assert(dcmf_result == DCMF_SUCCESS); barrier_conf.protocol = DCMF_LOCKBOX_BARRIER_PROTOCOL; barrier_conf.cb_geometry = getGeometry; dcmf_result = DCMF_Barrier_register(&lbarrier_protocol, &barrier_conf); assert(dcmf_result == DCMF_SUCCESS); DCMF_CollectiveProtocol_t *barrier_ptr, *lbarrier_ptr; barrier_ptr = &barrier_protocol; lbarrier_ptr = &lbarrier_protocol; dcmf_result = DCMF_Geometry_initialize(&geometry, 0, ranks, nranks, &barrier_ptr, 1, &lbarrier_ptr, 1, &crequest, 0, 1); assert(dcmf_result == DCMF_SUCCESS); allreduce_conf.protocol = DCMF_TREE_ALLREDUCE_PROTOCOL; allreduce_conf.cb_geometry = getGeometry; allreduce_conf.reuse_storage = 1; dcmf_result = DCMF_Allreduce_register(&allreduce_protocol, &allreduce_conf); assert(dcmf_result == DCMF_SUCCESS); allreduce_conf.protocol = DCMF_TORUS_BINOMIAL_ALLREDUCE_PROTOCOL; allreduce_conf.cb_geometry = getGeometry; allreduce_conf.reuse_storage = 1; dcmf_result = DCMF_Allreduce_register(&allreduce_notree_protocol, &allreduce_conf); assert(dcmf_result == DCMF_SUCCESS); status = DCMF_Geometry_analyze(&geometry, &allreduce_protocol); assert(status == 1); status = DCMF_Geometry_analyze(&geometry, &allreduce_notree_protocol); assert(status == 1); done_callback.function = done; done_callback.clientdata = (void *) &allreduce_active; if (rank == 0) { printf("DCMF_Allreduce Test\n"); fflush(stdout); } for (msgsize = sizeof(int); msgsize < MAX_MSG_SIZE; msgsize *= 2) { /*initializing buffer*/ for (i = 0; i < bufsize/sizeof(int); i++) { src_buffer[i] = rank; trg_buffer[i] = 0; } allreduce_active += 1; /*sum reduce operation*/ dcmf_result = DCMF_Allreduce(&allreduce_protocol, &crequest1, done_callback, DCMF_SEQUENTIAL_CONSISTENCY, &geometry, (char *) src_buffer, (char *) trg_buffer, msgsize/sizeof(int), DCMF_SIGNED_INT, DCMF_SUM); assert(dcmf_result == DCMF_SUCCESS); while(allreduce_active > 0) DCMF_Messager_advance(); expected = (nranks-1)*(nranks)/2; for (i = 0; i < msgsize/sizeof(int); i++) { if(trg_buffer[i] - expected != 0) { printf("[%d] Validation has failed Expected: %d, Actual: %d, i: %d \n", rank, expected, trg_buffer[i], i); fflush(stdout); exit(-1); } } printf("[%d] %d message sum allreduce successful \n", rank, msgsize); fflush(stdout); for (i = 0; i < bufsize/sizeof(int); i++) { src_buffer[i] = 1; trg_buffer[i] = 0; } allreduce_active += 1; /*sum reduce operation*/ dcmf_result = DCMF_Allreduce(&allreduce_notree_protocol, &crequest2, done_callback, DCMF_SEQUENTIAL_CONSISTENCY, &geometry, (char *) src_buffer, (char *) trg_buffer, msgsize/sizeof(int), DCMF_SIGNED_INT, DCMF_PROD); assert(dcmf_result == DCMF_SUCCESS); while(allreduce_active > 0) DCMF_Messager_advance(); expected = 1; for (i = 0; i < msgsize/sizeof(int); i++) { if(trg_buffer[i] - expected != 0) { printf("[%d] Validation has failed Expected: %d, Actual: %d, i: %d \n", rank, expected, trg_buffer[i], i); fflush(stdout); exit(-1); } } printf("[%d] %d message product allreduce successful\n", rank, msgsize); fflush(stdout); } free(src_buffer); free(trg_buffer); DCMF_Messager_finalize(); return 0; }
int A1D_Initialize() { int mpi_initialized, mpi_provided; int mpi_status; int i; size_t bytes_in, bytes_out; DCMF_Result dcmf_result; DCMF_Configure_t dcmf_config; DCMF_Memregion_t local_memregion; /*************************************************** * * configure MPI * ***************************************************/ /* MPI has to be initialized for this implementation to work */ MPI_Initialized(&mpi_initialized); assert(mpi_initialized==1); /* MPI has to be thread-safe so that DCMF doesn't explode */ MPI_Query_thread(&mpi_provided); assert(mpi_provided==MPI_THREAD_MULTIPLE); /* have to use our own communicator for collectives to be proper */ mpi_status = MPI_Comm_dup(MPI_COMM_WORLD,&A1D_COMM_WORLD); assert(mpi_status==0); /* get my MPI rank */ mpi_status = MPI_Comm_rank(A1D_COMM_WORLD,&myrank); assert(mpi_status==0); /* get MPI world size */ mpi_status = MPI_Comm_size(A1D_COMM_WORLD,&mpi_size); assert(mpi_status==0); /* make sure MPI and DCMF agree */ assert(myrank==DCMF_Messager_rank()); assert(mpi_size==DCMF_Messager_size()); /* barrier before DCMF_Messager_configure to make sure MPI is ready everywhere */ mpi_status = MPI_Barrier(A1D_COMM_WORLD); assert(mpi_status==0); /*************************************************** * * configure DCMF * ***************************************************/ /* to be safe, but perhaps not necessary */ dcmf_config.thread_level = DCMF_THREAD_MULTIPLE; #ifdef ACCUMULATE_IMPLEMENTED /* interrupts required for accumulate only, Put/Get use DMA * if accumulate not used, MPI will query environment for DCMF_INTERRUPTS */ dcmf_config.interrupts = DCMF_INTERRUPTS_ON; #endif /* reconfigure DCMF with interrupts on */ DCMF_CriticalSection_enter(0); dcmf_result = DCMF_Messager_configure(&dcmf_config, &dcmf_config); assert(dcmf_result==DCMF_SUCCESS); DCMF_CriticalSection_exit(0); /* barrier after DCMF_Messager_configure to make sure everyone has the new DCMF config */ mpi_status = MPI_Barrier(A1D_COMM_WORLD); assert(mpi_status==0); /*************************************************** * * setup DCMF memregions * ***************************************************/ /* allocate memregion list */ A1D_Memregion_list = malloc( mpi_size * sizeof(DCMF_Memregion_t) ); assert(A1D_Memregion_list != NULL); /* allocate base pointer list */ A1D_Baseptr_list = malloc( mpi_size * sizeof(void*) ); assert(A1D_Memregion_list != NULL); /* create memregions */ bytes_in = -1; DCMF_CriticalSection_enter(0); dcmf_result = DCMF_Memregion_create(&local_memregion,&bytes_out,bytes_in,NULL,0); assert(dcmf_result==DCMF_SUCCESS); DCMF_CriticalSection_exit(0); /* exchange memregions because we don't use symmetry heap */ mpi_status = MPI_Allgather(&local_memregion,sizeof(DCMF_Memregion_t),MPI_BYTE, A1D_Memregion_list,sizeof(DCMF_Memregion_t),MPI_BYTE, A1D_COMM_WORLD); assert(mpi_status==0); /* destroy temporary local memregion */ DCMF_CriticalSection_enter(0); dcmf_result = DCMF_Memregion_destroy(&local_memregion); assert(dcmf_result==DCMF_SUCCESS); DCMF_CriticalSection_exit(0); /* check for valid memregions */ DCMF_CriticalSection_enter(0); for (i = 0; i < mpi_size; i++) { dcmf_result = DCMF_Memregion_query(&A1D_Memregion_list[i], &bytes_out, &A1D_Baseptr_list[i]); assert(dcmf_result==DCMF_SUCCESS); } DCMF_CriticalSection_exit(0); #ifdef FLUSH_IMPLEMENTED /*************************************************** * * setup flush list(s) * ***************************************************/ /* allocate Put list */ A1D_Put_flush_list = malloc( mpi_size * sizeof(int) ); assert(A1D_Put_flush_list != NULL); #ifdef ACCUMULATE_IMPLEMENTED /* allocate Acc list */ A1D_Send_flush_list = malloc( mpi_size * sizeof(int) ); assert(A1D_Send_flush_list != NULL); #endif #endif /*************************************************** * * define null callback * ***************************************************/ A1D_Nocallback.function = NULL; A1D_Nocallback.clientdata = NULL; return(0); }
int main() { int i, rank, nranks, msgsize, status, expected; long bufsize; int *buffer; DCMF_Protocol_t ga_protocol; DCMF_GlobalAllreduce_Configuration_t ga_conf; DCMF_Request_t request; DCMF_Callback_t done_callback; volatile unsigned ga_active = 0; DCMF_Messager_initialize(); rank = DCMF_Messager_rank(); nranks = DCMF_Messager_size(); bufsize = MAX_MSG_SIZE; buffer = (int *) malloc(bufsize); ga_conf.protocol = DCMF_DEFAULT_GLOBALALLREDUCE_PROTOCOL; status = DCMF_GlobalAllreduce_register(&ga_protocol, &ga_conf); if(status != DCMF_SUCCESS) { printf("DCMF_GlobalAllreduce_register returned with error %d \n", status); exit(-1); } done_callback.function = done; done_callback.clientdata = (void *) &ga_active; if (rank == 0) { printf("DCMF_Allreduce Test\n"); fflush(stdout); } for (msgsize = sizeof(int); msgsize < MAX_MSG_SIZE; msgsize *= 2) { /*initializing buffer*/ for (i = 0; i < bufsize/sizeof(int); i++) { buffer[i] = rank; } ga_active += 1; /*sum reduce operation*/ status = DCMF_GlobalAllreduce(&ga_protocol, &request, done_callback, DCMF_SEQUENTIAL_CONSISTENCY, -1, (char *) buffer, (char *) buffer, msgsize/sizeof(int), DCMF_SIGNED_INT, DCMF_SUM); while(ga_active > 0) DCMF_Messager_advance(); expected = (nranks-1)*(nranks)/2; for (i = 0; i < msgsize/sizeof(int); i++) { if(buffer[i] - expected != 0) { printf("[%d] Validation has failed Expected: %d, Actual: %d, i: %d \n", rank, expected, buffer[i], i); fflush(stdout); exit(-1); } } printf("[%d] %d message sum reduce successful \n", rank, msgsize); fflush(stdout); for (i = 0; i < bufsize/sizeof(int); i++) { buffer[i] = 1; } ga_active += 1; status = DCMF_GlobalAllreduce(&ga_protocol, &request, done_callback, DCMF_SEQUENTIAL_CONSISTENCY, -1, (char *) buffer, (char *) buffer, msgsize/sizeof(int), DCMF_SIGNED_INT, DCMF_PROD); while(ga_active > 0) DCMF_Messager_advance(); expected = 1; for (i = 0; i < msgsize/sizeof(int); i++) { if(buffer[i] - expected != 0) { printf("[%d] Validation has failed Expected: %d, Actual: %d, i: %d \n", rank, expected, buffer[i], i); fflush(stdout); exit(-1); } } printf("[%d] %d message product reduce successful\n", rank, msgsize); fflush(stdout); } free(buffer); DCMF_Messager_finalize(); return 0; }
void ARMCIX_DCMF_Connection_initialize () { DCMF_CriticalSection_enter(0); __global_connection.peer = (unsigned) -1; unsigned rank = DCMF_Messager_rank (); unsigned size = DCMF_Messager_size (); posix_memalign ((void **)&__connection, 16, sizeof(ARMCIX_DCMF_Connection_t) * size); bzero ((void *)__connection, sizeof(ARMCIX_DCMF_Connection_t) * size); void * base = NULL; size_t bytes = (size_t) -1; unsigned i; for (i = 0; i < size; i++) { __connection[i].peer = i; #warning fix memregion setup to handle non-global address space pinning. //DCMF_Result result = DCMF_Memregion_create (&__connection[i].local_mem_region, &bytes, (size_t) -1, NULL, 0); } // Register a send protocol to exchange memory regions DCMF_Protocol_t send_protocol; DCMF_Send_Configuration_t send_configuration = { DCMF_DEFAULT_SEND_PROTOCOL, DCMF_DEFAULT_NETWORK, ARMCIX_DCMF_RecvMemregion1, __connection, ARMCIX_DCMF_RecvMemregion2, __connection }; DCMF_Send_register (&send_protocol, &send_configuration); DCMF_Request_t request; volatile unsigned active; DCMF_Callback_t cb_done = { ARMCIX_DCMF_cb_decrement, (void *) &active }; // Exchange the memory regions __memregions_to_receive = size; for (i = 0; i < size; i++) { unsigned peer = (rank+i)%size; active = 1; DCMF_Send (&send_protocol, &request, cb_done, DCMF_SEQUENTIAL_CONSISTENCY, peer, sizeof(DCMF_Memregion_t), (char *) &__connection[peer].local_mem_region, (DCQuad *) NULL, 0); while (active) DCMF_Messager_advance(); } while (__memregions_to_receive) DCMF_Messager_advance(); DCMF_CriticalSection_exit(0); }