/** * \brief Global fence operation. * * Blocks until all active messages between the local node and all remote * nodes have completed and acknowledged by the remote node. * * \see ARMCIX_Fence * \see ARMCIX_DCMF_ReceiveFenceRequest * \see ARMCIX_DCMF_ReceiveFenceAck */ void ARMCIX_AllFence () { DCMF_CriticalSection_enter (0); unsigned size = DCMF_Messager_size (); unsigned peer; volatile unsigned active = 0; DCQuad quad; DCMF_Callback_t * cb = (DCMF_Callback_t *) &quad; cb->function = ARMCIX_DCMF_cb_decrement; cb->clientdata = (void *) &active; DCMF_Callback_t cb_null = { NULL, NULL }; DCMF_Callback_t cb_done = { (void (*)(void *, DCMF_Error_t *))ARMCIX_DCMF_request_free, NULL }; for (peer = 0; peer < size; peer++) { ARMCIX_DCMF_Request_t * new_request = ARMCIX_DCMF_request_allocate (cb_null); cb_done.clientdata = new_request; active++; DCMF_Send ( &__fence_rts_protocol, &(new_request->request), cb_done, DCMF_SEQUENTIAL_CONSISTENCY, peer, 0, NULL, (DCQuad *) &quad, 1); while (active) DCMF_Messager_advance (); } DCMF_CriticalSection_exit (0); }
int main() { int i, rank, nranks, msgsize, status, expected; long bufsize; int *src_buffer; int *trg_buffer; unsigned *ranks; DCMF_Result dcmf_result; DCMF_CollectiveProtocol_t barrier_protocol, lbarrier_protocol; DCMF_CollectiveProtocol_t allreduce_protocol, allreduce_notree_protocol; DCMF_Barrier_Configuration_t barrier_conf; DCMF_Allreduce_Configuration_t allreduce_conf; DCMF_CollectiveRequest_t crequest, crequest1, crequest2; DCMF_Callback_t done_callback; volatile unsigned allreduce_active = 0; DCMF_Messager_initialize(); dcmf_result = DCMF_Collective_initialize(); assert(dcmf_result == DCMF_SUCCESS); rank = DCMF_Messager_rank(); nranks = DCMF_Messager_size(); ranks = (unsigned *) malloc(nranks * sizeof(int)); for(i=0; i<nranks; i++) ranks[i] = i; bufsize = MAX_MSG_SIZE; src_buffer = (int *) malloc(bufsize); trg_buffer = (int *) malloc(bufsize); barrier_conf.protocol = DCMF_GI_BARRIER_PROTOCOL; barrier_conf.cb_geometry = getGeometry; dcmf_result = DCMF_Barrier_register(&barrier_protocol, &barrier_conf); assert(dcmf_result == DCMF_SUCCESS); barrier_conf.protocol = DCMF_LOCKBOX_BARRIER_PROTOCOL; barrier_conf.cb_geometry = getGeometry; dcmf_result = DCMF_Barrier_register(&lbarrier_protocol, &barrier_conf); assert(dcmf_result == DCMF_SUCCESS); DCMF_CollectiveProtocol_t *barrier_ptr, *lbarrier_ptr; barrier_ptr = &barrier_protocol; lbarrier_ptr = &lbarrier_protocol; dcmf_result = DCMF_Geometry_initialize(&geometry, 0, ranks, nranks, &barrier_ptr, 1, &lbarrier_ptr, 1, &crequest, 0, 1); assert(dcmf_result == DCMF_SUCCESS); allreduce_conf.protocol = DCMF_TREE_ALLREDUCE_PROTOCOL; allreduce_conf.cb_geometry = getGeometry; allreduce_conf.reuse_storage = 1; dcmf_result = DCMF_Allreduce_register(&allreduce_protocol, &allreduce_conf); assert(dcmf_result == DCMF_SUCCESS); allreduce_conf.protocol = DCMF_TORUS_BINOMIAL_ALLREDUCE_PROTOCOL; allreduce_conf.cb_geometry = getGeometry; allreduce_conf.reuse_storage = 1; dcmf_result = DCMF_Allreduce_register(&allreduce_notree_protocol, &allreduce_conf); assert(dcmf_result == DCMF_SUCCESS); status = DCMF_Geometry_analyze(&geometry, &allreduce_protocol); assert(status == 1); status = DCMF_Geometry_analyze(&geometry, &allreduce_notree_protocol); assert(status == 1); done_callback.function = done; done_callback.clientdata = (void *) &allreduce_active; if (rank == 0) { printf("DCMF_Allreduce Test\n"); fflush(stdout); } for (msgsize = sizeof(int); msgsize < MAX_MSG_SIZE; msgsize *= 2) { /*initializing buffer*/ for (i = 0; i < bufsize/sizeof(int); i++) { src_buffer[i] = rank; trg_buffer[i] = 0; } allreduce_active += 1; /*sum reduce operation*/ dcmf_result = DCMF_Allreduce(&allreduce_protocol, &crequest1, done_callback, DCMF_SEQUENTIAL_CONSISTENCY, &geometry, (char *) src_buffer, (char *) trg_buffer, msgsize/sizeof(int), DCMF_SIGNED_INT, DCMF_SUM); assert(dcmf_result == DCMF_SUCCESS); while(allreduce_active > 0) DCMF_Messager_advance(); expected = (nranks-1)*(nranks)/2; for (i = 0; i < msgsize/sizeof(int); i++) { if(trg_buffer[i] - expected != 0) { printf("[%d] Validation has failed Expected: %d, Actual: %d, i: %d \n", rank, expected, trg_buffer[i], i); fflush(stdout); exit(-1); } } printf("[%d] %d message sum allreduce successful \n", rank, msgsize); fflush(stdout); for (i = 0; i < bufsize/sizeof(int); i++) { src_buffer[i] = 1; trg_buffer[i] = 0; } allreduce_active += 1; /*sum reduce operation*/ dcmf_result = DCMF_Allreduce(&allreduce_notree_protocol, &crequest2, done_callback, DCMF_SEQUENTIAL_CONSISTENCY, &geometry, (char *) src_buffer, (char *) trg_buffer, msgsize/sizeof(int), DCMF_SIGNED_INT, DCMF_PROD); assert(dcmf_result == DCMF_SUCCESS); while(allreduce_active > 0) DCMF_Messager_advance(); expected = 1; for (i = 0; i < msgsize/sizeof(int); i++) { if(trg_buffer[i] - expected != 0) { printf("[%d] Validation has failed Expected: %d, Actual: %d, i: %d \n", rank, expected, trg_buffer[i], i); fflush(stdout); exit(-1); } } printf("[%d] %d message product allreduce successful\n", rank, msgsize); fflush(stdout); } free(src_buffer); free(trg_buffer); DCMF_Messager_finalize(); return 0; }
void get_contention() { unsigned int iter, size, dst; unsigned int i, j, k, s; unsigned int xdim, ydim, zdim; unsigned int xdisp, ydisp, zdisp; DCMF_Request_t get_req[ITERATIONS]; DCMF_Callback_t get_done; unsigned int done_count; DCMF_NetworkCoord_t myaddr, dstaddr; DCMF_Network ntwk; char buf[50]; get_done.function = done; get_done.clientdata = (void *) &done_count; DCMF_Messager_rank2network(nranks - 1, DCMF_TORUS_NETWORK, &dstaddr); xdim = dstaddr.torus.x + 1; ydim = dstaddr.torus.y + 1; zdim = dstaddr.torus.z + 1; if (myrank == 0) { printf("Dimensions of Torus : %d, %d, %d \n", xdim, ydim, zdim); fflush(stdout); } DCMF_Messager_rank2network(myrank, DCMF_TORUS_NETWORK, &myaddr); dstaddr.network = myaddr.network; dstaddr.torus.t = myaddr.torus.t; int size_array[] = { 8, 64, 512, 4096, 32768, 262144, 1048576 }; int size_count = sizeof(size_array) / sizeof(int); int disp_array[][3] = { { 0, 0, 1 }, { 0, 0, 3 }, { 0, 3, 3 }, { 3, 3, 3 }, { 0, 1, 3 }, { 1, 1, 3 }, { 0, 2, 3 }, { 1, 2, 3 }, { 2, 2, 3 }, { 1, 3, 3 }, { 2, 3, 3 } }; int disp_count = sizeof(disp_array) / (sizeof(int) * 3); for (s = 0; s < size_count; s++) { size = size_array[s]; if (myrank == 0) { printf("Message Size : %20d \n", size); printf("%30s %20s \n", "Displacement b/w Pairs", "Avg Bandwidth (Mbps)"); fflush(stdout); } /*Assumes all dimensions are equal*/ for (i = 0; i < disp_count; i++) { xdisp = disp_array[i][0]; ydisp = disp_array[i][1]; zdisp = disp_array[i][2]; dstaddr.torus.x = (myaddr.torus.x + xdisp) % xdim; dstaddr.torus.y = (myaddr.torus.y + ydisp) % ydim; dstaddr.torus.z = (myaddr.torus.z + zdisp) % zdim; DCMF_Messager_network2rank(&dstaddr, &dst, &ntwk); barrier(); /*********************** * start timer * ***********************/ t_start = DCMF_Timebase(); done_count = ITERATIONS; for (iter = 0; iter < ITERATIONS; iter++) { DCMF_Get(&get_reg, &get_req[iter], get_done, DCMF_SEQUENTIAL_CONSISTENCY, dst, size, memregion[dst], memregion[myrank], MAX_MSG_SIZE * ITERATIONS + iter * size, iter * size); } while (done_count) DCMF_Messager_advance(); t_stop = DCMF_Timebase(); /*********************** * stop timer * ***********************/ t_sec = (t_stop - t_start) / (clockMHz * 1000000); bw = (ITERATIONS * size) / (t_sec * 1024 * 1024); barrier(); allreduce(-1, (char *) &bw, (char *) &bw_avg, 1, DCMF_DOUBLE, DCMF_SUM); if (myrank == 0) { bw_avg = bw_avg / nranks; sprintf(buf, "(%d)(%d)(%d)", xdisp, ydisp, zdisp); printf("%30s %20.0f \n", buf, bw_avg); fflush(stdout); } } } }
int main() { int i, rank, nranks, msgsize, status, expected; long bufsize; int *buffer; DCMF_Protocol_t ga_protocol; DCMF_GlobalAllreduce_Configuration_t ga_conf; DCMF_Request_t request; DCMF_Callback_t done_callback; volatile unsigned ga_active = 0; DCMF_Messager_initialize(); rank = DCMF_Messager_rank(); nranks = DCMF_Messager_size(); bufsize = MAX_MSG_SIZE; buffer = (int *) malloc(bufsize); ga_conf.protocol = DCMF_DEFAULT_GLOBALALLREDUCE_PROTOCOL; status = DCMF_GlobalAllreduce_register(&ga_protocol, &ga_conf); if(status != DCMF_SUCCESS) { printf("DCMF_GlobalAllreduce_register returned with error %d \n", status); exit(-1); } done_callback.function = done; done_callback.clientdata = (void *) &ga_active; if (rank == 0) { printf("DCMF_Allreduce Test\n"); fflush(stdout); } for (msgsize = sizeof(int); msgsize < MAX_MSG_SIZE; msgsize *= 2) { /*initializing buffer*/ for (i = 0; i < bufsize/sizeof(int); i++) { buffer[i] = rank; } ga_active += 1; /*sum reduce operation*/ status = DCMF_GlobalAllreduce(&ga_protocol, &request, done_callback, DCMF_SEQUENTIAL_CONSISTENCY, -1, (char *) buffer, (char *) buffer, msgsize/sizeof(int), DCMF_SIGNED_INT, DCMF_SUM); while(ga_active > 0) DCMF_Messager_advance(); expected = (nranks-1)*(nranks)/2; for (i = 0; i < msgsize/sizeof(int); i++) { if(buffer[i] - expected != 0) { printf("[%d] Validation has failed Expected: %d, Actual: %d, i: %d \n", rank, expected, buffer[i], i); fflush(stdout); exit(-1); } } printf("[%d] %d message sum reduce successful \n", rank, msgsize); fflush(stdout); for (i = 0; i < bufsize/sizeof(int); i++) { buffer[i] = 1; } ga_active += 1; status = DCMF_GlobalAllreduce(&ga_protocol, &request, done_callback, DCMF_SEQUENTIAL_CONSISTENCY, -1, (char *) buffer, (char *) buffer, msgsize/sizeof(int), DCMF_SIGNED_INT, DCMF_PROD); while(ga_active > 0) DCMF_Messager_advance(); expected = 1; for (i = 0; i < msgsize/sizeof(int); i++) { if(buffer[i] - expected != 0) { printf("[%d] Validation has failed Expected: %d, Actual: %d, i: %d \n", rank, expected, buffer[i], i); fflush(stdout); exit(-1); } } printf("[%d] %d message product reduce successful\n", rank, msgsize); fflush(stdout); } free(buffer); DCMF_Messager_finalize(); return 0; }
void ARMCIX_DCMF_Connection_initialize () { DCMF_CriticalSection_enter(0); __global_connection.peer = (unsigned) -1; unsigned rank = DCMF_Messager_rank (); unsigned size = DCMF_Messager_size (); posix_memalign ((void **)&__connection, 16, sizeof(ARMCIX_DCMF_Connection_t) * size); bzero ((void *)__connection, sizeof(ARMCIX_DCMF_Connection_t) * size); void * base = NULL; size_t bytes = (size_t) -1; unsigned i; for (i = 0; i < size; i++) { __connection[i].peer = i; #warning fix memregion setup to handle non-global address space pinning. //DCMF_Result result = DCMF_Memregion_create (&__connection[i].local_mem_region, &bytes, (size_t) -1, NULL, 0); } // Register a send protocol to exchange memory regions DCMF_Protocol_t send_protocol; DCMF_Send_Configuration_t send_configuration = { DCMF_DEFAULT_SEND_PROTOCOL, DCMF_DEFAULT_NETWORK, ARMCIX_DCMF_RecvMemregion1, __connection, ARMCIX_DCMF_RecvMemregion2, __connection }; DCMF_Send_register (&send_protocol, &send_configuration); DCMF_Request_t request; volatile unsigned active; DCMF_Callback_t cb_done = { ARMCIX_DCMF_cb_decrement, (void *) &active }; // Exchange the memory regions __memregions_to_receive = size; for (i = 0; i < size; i++) { unsigned peer = (rank+i)%size; active = 1; DCMF_Send (&send_protocol, &request, cb_done, DCMF_SEQUENTIAL_CONSISTENCY, peer, sizeof(DCMF_Memregion_t), (char *) &__connection[peer].local_mem_region, (DCQuad *) NULL, 0); while (active) DCMF_Messager_advance(); } while (__memregions_to_receive) DCMF_Messager_advance(); DCMF_CriticalSection_exit(0); }
void put_restart() { if (myrank == 0) { DCMF_Request_t put_req[ITERATIONS + SKIP]; DCMF_Callback_t put_done, put_ack; int done_count, ack_count; int msgsize, i; put_done.function = done; put_done.clientdata = (void *) &done_count; put_ack.function = done; put_ack.clientdata = (void *) &ack_count; char buffer[50]; sprintf(buffer, "%20s %20s %20s", "Msg Size", "Latency(usec)", "Restart-latency(usec)"); printf("%s \n", buffer); fflush(stdout); barrier(); for (msgsize = 1; msgsize < MAX_MSG_SIZE; msgsize *= 2) { /*********************** * warmup * ***********************/ ack_count = SKIP; for (i = 0; i < SKIP; i++) { DCMF_Put(&put_reg, &put_req[i], put_done, DCMF_SEQUENTIAL_CONSISTENCY, 1, msgsize, memregion[myrank], memregion[myrank + 1], i * msgsize, i * msgsize, put_ack); } while (ack_count) DCMF_Messager_advance(); /*********************** * start timer * ***********************/ t_start = DCMF_Timebase(); ack_count = ITERATIONS; for (i = SKIP; i < ITERATIONS + SKIP; i++) { DCMF_Put(&put_reg, &put_req[i], put_done, DCMF_SEQUENTIAL_CONSISTENCY, 1, msgsize, memregion[myrank], memregion[myrank + 1], i * msgsize, i * msgsize, put_ack); } while (ack_count) DCMF_Messager_advance(); t_stop = DCMF_Timebase(); /*********************** * stop timer * ***********************/ t_usec = ((t_stop - t_start) / clockMHz); printf("%20d %20.0f ", msgsize, t_usec / (ITERATIONS)); /*********************** * start timer * ***********************/ t_start = DCMF_Timebase(); ack_count = ITERATIONS; for (i = SKIP; i < ITERATIONS + SKIP; i++) { DCMF_Restart(&put_req[i]); } while (ack_count) DCMF_Messager_advance(); t_stop = DCMF_Timebase(); /*********************** * stop timer * ***********************/ t_usec = ((t_stop - t_start) / clockMHz); printf("%20.0f\n", t_usec / (ITERATIONS)); } barrier(); } else { barrier(); barrier(); } }
void send_remoteadvance() { DCMF_Request_t *send_req; DCMF_Callback_t send_done; int done_count; unsigned int msgsize, i, dst; DCQuad msginfo; send_req = (DCMF_Request_t *) malloc(sizeof(DCMF_Request_t) * ITERATIONS_LOCAL); send_done.function = done; send_done.clientdata = (void *) &done_count; if (myrank == 0) { printf("Send latency in usec\n"); fflush(stdout); } if (myrank == 0) { char buffer[100]; sprintf(buffer, "%20s %20s %20s", "Msg Size", "Send-Remote Barrier", "Send-Remote Sleep"); printf("%s \n", buffer); fflush(stdout); } if (myrank == 0) { for (msgsize = 1; msgsize < MAX_MSG_SIZE_LOCAL; msgsize *= 2) { /*********************** * start timer * ***********************/ t_start = DCMF_Timebase(); done_count = 10000; for (i = 0; i < ITERATIONS_LOCAL; i++) { DCMF_Send(&snd_reg, &send_req[i], send_done, DCMF_SEQUENTIAL_CONSISTENCY, (myrank + 1) % nranks, msgsize, source, &msginfo, 1); } while (done_count > 0) DCMF_Messager_advance(); t_stop = DCMF_Timebase(); t_usec = (t_stop - t_start) / (clockMHz * ITERATIONS_LOCAL); /*********************** * stop timer * ***********************/ if (myrank == 0) { printf("%20d %20.2f ", msgsize, t_usec); fflush(stdout); } barrier(); /*********************** * start timer * ***********************/ t_start = DCMF_Timebase(); done_count = 10000; for (i = 0; i < ITERATIONS_LOCAL; i++) { DCMF_Send(&snd_reg, &send_req[i], send_done, DCMF_SEQUENTIAL_CONSISTENCY, (myrank + 1) % nranks, msgsize, source, &msginfo, 1); } while (done_count > 0) DCMF_Messager_advance(); t_stop = DCMF_Timebase(); t_usec = (t_stop - t_start) / (clockMHz * ITERATIONS_LOCAL); /*********************** * stop timer * ***********************/ if (myrank == 0) { printf("%20.2f \n", t_usec); fflush(stdout); } barrier(); } } else { for (msgsize = 1; msgsize < MAX_MSG_SIZE_LOCAL; msgsize *= 2) { barrier(); DCMF_CriticalSection_enter(0); sleep(10); DCMF_CriticalSection_exit(0); barrier(); } } barrier(); }
void send_localvsremote() { DCMF_Request_t send_req[ITERATIONS]; DCMF_Callback_t send_done, nocallback; int done_count; unsigned int msgsize, i, dst; DCMF_NetworkCoord_t myaddr, dstaddr; DCMF_Network ntwk; DCQuad msginfo[ITERATIONS]; DCMF_Messager_rank2network(myrank, DCMF_TORUS_NETWORK, &myaddr); dstaddr.network = myaddr.network; dstaddr.torus.x = (myaddr.torus.x + 3) % 8; dstaddr.torus.y = (myaddr.torus.y + 3) % 8; dstaddr.torus.z = (myaddr.torus.z + 3) % 8; dstaddr.torus.t = myaddr.torus.t; DCMF_Messager_network2rank(&dstaddr, &dst, &ntwk); send_done.function = done; send_done.clientdata = (void *) &done_count; nocallback.function = NULL; nocallback.clientdata = NULL; if (myrank == 0) { printf("Send call overhead in usec\n"); fflush(stdout); } if (myrank == 0) { char buffer[100]; sprintf(buffer, "%20s %20s %20s", "Msg Size", "Farthest pairs", "Closest pairs"); printf("%s \n", buffer); fflush(stdout); } for (msgsize = 1; msgsize < MAX_MSG_SIZE; msgsize *= 2) { /*********************** * warmup * ***********************/ snd_rcv_active += SKIP; done_count += SKIP; for (i = 0; i < SKIP; i++) { DCMF_Send(&snd_reg, &send_req[i], send_done, DCMF_SEQUENTIAL_CONSISTENCY, dst, msgsize, source + i * msgsize, &msginfo[i], 1); } while (done_count || snd_rcv_active) DCMF_Messager_advance(); t_avg = 0; t_avg1 = 0, t_avg2 = 0; target_index = 0; barrier(); snd_rcv_active += ITERATIONS; t_start = DCMF_Timebase(); for (i = 0; i < ITERATIONS; i++) { DCMF_Send(&snd_reg, &send_req[i], nocallback, DCMF_SEQUENTIAL_CONSISTENCY, dst, msgsize, source + i * msgsize, &msginfo[i], 1); } t_stop = DCMF_Timebase(); t_usec = (t_stop - t_start) / (clockMHz * ITERATIONS); while (snd_rcv_active) DCMF_Messager_advance(); barrier(); allreduce(-1, (char *) &t_usec, (char *) &t_avg, 1, DCMF_DOUBLE, DCMF_SUM); barrier(); target_index = 0; snd_rcv_active += ITERATIONS; t_start = DCMF_Timebase(); for (i = 0; i < ITERATIONS; i++) { DCMF_Send(&snd_reg, &send_req[i], nocallback, DCMF_SEQUENTIAL_CONSISTENCY, (myrank + 1) % nranks, msgsize, source + i * msgsize, &msginfo[i], 1); } t_stop = DCMF_Timebase(); t_usec1 = (t_stop - t_start) / (clockMHz * ITERATIONS); while (snd_rcv_active) DCMF_Messager_advance(); barrier(); allreduce(-1, (char *) &t_usec1, (char *) &t_avg1, 1, DCMF_DOUBLE, DCMF_SUM); barrier(); if (myrank == 0) { t_avg = t_avg / nranks; t_avg1 = t_avg1 / nranks; printf("%20d %20.2f %20.2f \n", msgsize, t_avg, t_avg1); fflush(stdout); } } if (myrank == 0) { printf("Send latency in usec with local vs remote completion \n"); fflush(stdout); } if (myrank == 0) { char buffer[100]; sprintf(buffer, "%20s %20s %20s %20s %20s %20s %20s", "Msg Size", "Farthest pairs-local", "Farthest pairs-remote", "Farthest pairs-both", "Closest pairs-local", "Closest pairs-remote", "Closest pairs-both"); printf("%s \n", buffer); fflush(stdout); } barrier(); for (msgsize = 1; msgsize < MAX_MSG_SIZE; msgsize *= 2) { /*********************** * start timer * ***********************/ snd_rcv_active += ITERATIONS; t_start = DCMF_Timebase(); for (i = 0; i < ITERATIONS; i++) { done_count = 1; DCMF_Send(&snd_reg, &send_req[i], send_done, DCMF_SEQUENTIAL_CONSISTENCY, dst, msgsize, source + i * msgsize, &msginfo[i], 1); while (done_count) DCMF_Messager_advance(); } t_stop = DCMF_Timebase(); t_usec = (t_stop - t_start) / (clockMHz * ITERATIONS); while (snd_rcv_active) DCMF_Messager_advance(); barrier(); allreduce(-1, (char *) &t_usec, (char *) &t_avg, 1, DCMF_DOUBLE, DCMF_SUM); barrier(); target_index = 0; t_start = DCMF_Timebase(); for (i = 0; i < ITERATIONS; i++) { ack_rcv_active = 1; DCMF_Send(&rcb_snd_reg, &send_req[i], nocallback, DCMF_SEQUENTIAL_CONSISTENCY, dst, msgsize, source + i * msgsize, &msginfo[i], 1); while (ack_rcv_active) DCMF_Messager_advance(); } t_stop = DCMF_Timebase(); t_usec1 = (t_stop - t_start) / (clockMHz * ITERATIONS); barrier(); allreduce(-1, (char *) &t_usec1, (char *) &t_avg1, 1, DCMF_DOUBLE, DCMF_SUM); barrier(); target_index = 0; t_start = DCMF_Timebase(); for (i = 0; i < ITERATIONS; i++) { done_count = 1; ack_rcv_active = 1; DCMF_Send(&rcb_snd_reg, &send_req[i], send_done, DCMF_SEQUENTIAL_CONSISTENCY, dst, msgsize, source + i * msgsize, &msginfo[i], 1); while (done_count || ack_rcv_active) DCMF_Messager_advance(); } t_stop = DCMF_Timebase(); t_usec2 = (t_stop - t_start) / (clockMHz * ITERATIONS); /*********************** * stop timer * ***********************/ barrier(); allreduce(-1, (char *) &t_usec2, (char *) &t_avg2, 1, DCMF_DOUBLE, DCMF_SUM); barrier(); if (myrank == 0) { t_avg = t_avg / nranks; t_avg1 = t_avg1 / nranks; t_avg2 = t_avg2 / nranks; printf("%20d %20.2f %20.2f %20.2f", msgsize, t_avg, t_avg1, t_avg2); fflush(stdout); } t_avg = 0; t_avg1 = 0, t_avg2 = 0; target_index = 0; barrier(); /*********************** * start timer * ***********************/ snd_rcv_active += ITERATIONS; t_start = DCMF_Timebase(); for (i = 0; i < ITERATIONS; i++) { done_count = 1; DCMF_Send(&snd_reg, &send_req[i], send_done, DCMF_SEQUENTIAL_CONSISTENCY, (myrank + 1) % nranks, msgsize, source + i * msgsize, &msginfo[i], 1); while (done_count) DCMF_Messager_advance(); } t_stop = DCMF_Timebase(); t_usec = (t_stop - t_start) / (clockMHz * ITERATIONS); while (snd_rcv_active) DCMF_Messager_advance(); barrier(); allreduce(-1, (char *) &t_usec, (char *) &t_avg, 1, DCMF_DOUBLE, DCMF_SUM); barrier(); target_index = 0; t_start = DCMF_Timebase(); for (i = 0; i < ITERATIONS; i++) { ack_rcv_active = 1; DCMF_Send(&rcb_snd_reg, &send_req[i], nocallback, DCMF_SEQUENTIAL_CONSISTENCY, (myrank + 1) % nranks, msgsize, source + i * msgsize, &msginfo[i], 1); while (ack_rcv_active) DCMF_Messager_advance(); } t_stop = DCMF_Timebase(); t_usec1 = (t_stop - t_start) / (clockMHz * ITERATIONS); barrier(); allreduce(-1, (char *) &t_usec1, (char *) &t_avg1, 1, DCMF_DOUBLE, DCMF_SUM); barrier(); target_index = 0; t_start = DCMF_Timebase(); for (i = 0; i < ITERATIONS; i++) { done_count = 1; ack_rcv_active = 1; DCMF_Send(&rcb_snd_reg, &send_req[i], send_done, DCMF_SEQUENTIAL_CONSISTENCY, (myrank + 1) % nranks, msgsize, source + i * msgsize, &msginfo[i], 1); while (done_count || ack_rcv_active) DCMF_Messager_advance(); } t_stop = DCMF_Timebase(); t_usec2 = (t_stop - t_start) / (clockMHz * ITERATIONS); /*********************** * stop timer * ***********************/ allreduce(-1, (char *) &t_usec2, (char *) &t_avg2, 1, DCMF_DOUBLE, DCMF_SUM); barrier(); if (myrank == 0) { t_avg = t_avg / nranks; t_avg1 = t_avg1 / nranks; t_avg2 = t_avg2 / nranks; printf("%20.2f %20.2f %20.2f \n", t_avg, t_avg1, t_avg2); fflush(stdout); } } }
void memcpyvsput() { DCMF_Request_t put_req[ITERATIONS_LOCAL]; DCMF_Callback_t put_done, put_ack; int done_count, ack_count; unsigned int msgsize, i, dst; put_done.function = done; put_done.clientdata = (void *) &done_count; put_ack.function = done; put_ack.clientdata = (void *) &ack_count; if (myrank == 0) { char buffer[200]; sprintf(buffer, "%20s %20s %20s %30s %20s %20s", "Msg Size", "DCMF_Put_Internode", "DCMF_Put_Intranode", "DCMF_Put_Intranode (Busy DMA)", "Memcpy", "Memcpy (Busy DMA)"); printf("%s \n", buffer); fflush(stdout); } barrier(); if (myrank == 0) { for (msgsize = 1; msgsize <= MAX_MSG_SIZE_LOCAL; msgsize *= 2) { /*********************** * start timer * ***********************/ t_start = DCMF_Timebase(); for (i = 0; i < ITERATIONS_LOCAL; i++) { done_count = 1; ack_count = 1; DCMF_Put(&put_reg, &put_req[i], put_done, DCMF_SEQUENTIAL_CONSISTENCY, 2, msgsize, memregion[0], memregion[2], i * msgsize, i * msgsize, put_ack); while (done_count > 0 || ack_count > 0) DCMF_Messager_advance(); } t_stop = DCMF_Timebase(); t_usec = ((t_stop - t_start) / clockMHz) / ITERATIONS_LOCAL; /*********************** * stop timer * ***********************/ if (myrank == 0) { printf("%20d %20.2f ", msgsize, t_usec); fflush(stdout); } /*********************** * start timer * ***********************/ t_start = DCMF_Timebase(); for (i = 0; i < ITERATIONS_LOCAL; i++) { done_count = 1; ack_count = 1; DCMF_Put(&put_reg, &put_req[i], put_done, DCMF_SEQUENTIAL_CONSISTENCY, 1, msgsize, memregion[0], memregion[1], i * msgsize, i * msgsize, put_ack); while (done_count > 0 || ack_count > 0) DCMF_Messager_advance(); } t_stop = DCMF_Timebase(); t_usec = ((t_stop - t_start) / clockMHz) / ITERATIONS_LOCAL; /*********************** * stop timer * ***********************/ if (myrank == 0) { printf("%20.2f ", t_usec); fflush(stdout); } /*********************** * start timer * ***********************/ barrier(); t_start = DCMF_Timebase(); for (i = 0; i < ITERATIONS_LOCAL; i++) { done_count = 1; ack_count = 1; DCMF_Put(&put_reg, &put_req[i], put_done, DCMF_SEQUENTIAL_CONSISTENCY, 1, msgsize, memregion[0], memregion[1], i * msgsize, i * msgsize, put_ack); while (done_count > 0 || ack_count > 0) DCMF_Messager_advance(); } t_stop = DCMF_Timebase(); t_usec = ((t_stop - t_start) / clockMHz) / ITERATIONS_LOCAL; /*********************** * stop timer * ***********************/ if (myrank == 0) { printf("%28.2f ", t_usec); fflush(stdout); } /*********************** * start timer * ***********************/ t_start = DCMF_Timebase(); for (i = 0; i < ITERATIONS_LOCAL; i++) { memcpy(window + ITERATIONS_LOCAL * MAX_MSG_SIZE_LOCAL + i * msgsize, window + i * msgsize, msgsize); } t_stop = DCMF_Timebase(); t_usec = ((t_stop - t_start) / clockMHz) / ITERATIONS_LOCAL; /*********************** * stop timer * ***********************/ printf("%20.2f ", t_usec); fflush(stdout); /*********************** * start timer * ***********************/ barrier(); t_start = DCMF_Timebase(); for (i = 0; i < ITERATIONS_LOCAL; i++) { memcpy(window + ITERATIONS_LOCAL * MAX_MSG_SIZE_LOCAL + i * msgsize, window + i * msgsize, msgsize); } t_stop = DCMF_Timebase(); t_usec = ((t_stop - t_start) / clockMHz) / ITERATIONS_LOCAL; /*********************** * stop timer * ***********************/ printf("%20.2f \n", t_usec); fflush(stdout); } } else { for (msgsize = 1; msgsize <= MAX_MSG_SIZE_LOCAL; msgsize *= 2) { barrier(); for (i = 0; i < ITERATIONS_LOCAL; i++) { done_count = 1; ack_count = 1; DCMF_Put(&put_reg, &put_req[i], put_done, DCMF_SEQUENTIAL_CONSISTENCY, 0, msgsize, memregion[myrank], memregion[0], i * msgsize, i * msgsize, put_ack); while (done_count > 0 || ack_count > 0) DCMF_Messager_advance(); } barrier(); for (i = 0; i < ITERATIONS_LOCAL; i++) { done_count = 1; ack_count = 1; DCMF_Put(&put_reg, &put_req[i], put_done, DCMF_SEQUENTIAL_CONSISTENCY, 0, msgsize, memregion[myrank], memregion[0], i * msgsize, i * msgsize, put_ack); while (done_count > 0 || ack_count > 0) DCMF_Messager_advance(); } } } barrier(); }
int main(int argc, void* argv[]) { DCMF_Configure_t config; config.thread_level = DCMF_THREAD_MULTIPLE; DCMF_Messager_initialize(); DCMF_Messager_configure(&config, &config); init(); if (nranks != (THREAD_NUM + 1)) { printf("This test requires only %d processes \n", (THREAD_NUM + 1)); fflush(stdout); return -1; } barrier_init(DCMF_DEFAULT_GLOBALBARRIER_PROTOCOL); control_init(DCMF_DEFAULT_CONTROL_PROTOCOL, DCMF_DEFAULT_NETWORK); memregion_init(LOCAL_MAX_BUF_SIZE * THREAD_NUM); get_init(DCMF_DEFAULT_PUT_PROTOCOL, DCMF_TORUS_NETWORK); source = (char *) malloc(LOCAL_MAX_BUF_SIZE * THREAD_NUM); target = (char *) malloc(LOCAL_MAX_BUF_SIZE * THREAD_NUM); send_init(DCMF_DEFAULT_SEND_PROTOCOL, DCMF_TORUS_NETWORK); int status; long i; if (myrank == 0) { pthread_t threads[THREAD_NUM]; pthread_barrier_init(&ptbarrier, NULL, THREAD_NUM); pthread_barrier_init(&ptbarrier1, NULL, THREAD_NUM); for (i = 0; i < THREAD_NUM; i++) { pthread_create(&threads[i], NULL, mrate_test, (void *) i); } for (i = 0; i < THREAD_NUM; i++) { pthread_join(threads[i], (void *) &status); } } else { snd_rcv_active += LOCAL_ITERATIONS; while (snd_rcv_active > 0) DCMF_Messager_advance(); } barrier(); DCMF_Messager_finalize(); if (myrank == 0) { printf("Benchmark Complete \n"); fflush(stdout); } return (0); }