/*
 * Progress the event library and any functions that have registered to
 * be called.  We don't propagate errors from the progress functions,
 * so no action is taken if they return failures.  The functions are
 * expected to return the number of events progressed, to determine
 * whether or not we should call sched_yield() during MPI progress.
 * This is only loosely tracked, as an error return can cause the number
 * of progressed events to appear lower than it actually is.  We don't
 * care, as the cost of that happening is far outweighed by the cost
 * of the if checks (they were resulting in bad pipe stalling behavior)
 */
void opal_progress(void)
{
    size_t i;
    /* total events progressed in this pass; drives the yield decision */
    int events = 0;

    if( opal_progress_event_flag != 0 ) {
#if OPAL_HAVE_WORKING_EVENTOPS
#if OPAL_PROGRESS_USE_TIMERS
#if OPAL_TIMER_USEC_NATIVE
        opal_timer_t now = opal_timer_base_get_usec();
#else
        opal_timer_t now = opal_timer_base_get_cycles();
#endif /* OPAL_TIMER_USEC_NATIVE */
        /* trip the event library if we've reached our tick rate and we
           are enabled.  When there are active event users, back-date the
           timestamp so the very next call trips the library again. */
        if (now - event_progress_last_time > event_progress_delta ) {
            event_progress_last_time = (num_event_users > 0) ?
                now - event_progress_delta : now;

            events += opal_event_loop(opal_sync_event_base, opal_progress_event_flag);
        }
#else /* OPAL_PROGRESS_USE_TIMERS */
        /* trip the event library if we've reached our tick rate and we
           are enabled.  Counter variant: decrement once per call and
           trip when the counter reaches zero. */
        if (OPAL_THREAD_ADD32(&event_progress_counter, -1) <= 0 ) {
            event_progress_counter =
                (num_event_users > 0) ? 0 : event_progress_delta;
            events += opal_event_loop(opal_sync_event_base, opal_progress_event_flag);
        }
#endif /* OPAL_PROGRESS_USE_TIMERS */
#endif /* OPAL_HAVE_WORKING_EVENTOPS */
    }

    /* progress all registered callbacks */
    for (i = 0 ; i < callbacks_len ; ++i) {
        events += (callbacks[i])();
    }

#if OPAL_HAVE_SCHED_YIELD
    if (opal_progress_yield_when_idle && events <= 0) {
        /* If there is nothing to do - yield the processor - otherwise
         * we could consume the processor for the entire time slice. If
         * the processor is oversubscribed - this will result in a best-case
         * latency equivalent to the time-slice. */
        sched_yield();
    }
#endif /* OPAL_HAVE_SCHED_YIELD */
}
/***************
 * Pretty Print
 ***************/

/* Return the current wall-clock time in seconds as a double. */
static double get_time(void)
{
#if OPAL_TIMER_USEC_NATIVE
    /* A native microsecond timer is available; scale usec to seconds. */
    return (double) opal_timer_base_get_usec() / 1000000.0;
#else
    /* Portable fallback: whole seconds plus fractional microseconds. */
    struct timeval now;
    gettimeofday(&now, NULL);
    return (double) now.tv_sec + (double) now.tv_usec / 1000000.0;
#endif
}
/*
 * MPI_Wtime: wall-clock time in seconds, using the highest-resolution
 * timer selected at build time (cycle counter, native usec timer, or
 * gettimeofday() as the portable fallback).
 */
double MPI_Wtime(void)
{
    double elapsed;

#if OPAL_TIMER_CYCLE_NATIVE
    /* Cycle counter divided by its frequency gives seconds. */
    elapsed = ((double) opal_timer_base_get_cycles()) / opal_timer_base_get_freq();
#elif OPAL_TIMER_USEC_NATIVE
    /* Native microsecond timer; scale to seconds. */
    elapsed = ((double) opal_timer_base_get_usec()) / 1000000.0;
#else
    /* Fall back to gettimeofday() if we have nothing else */
    struct timeval tv;
    gettimeofday(&tv, NULL);
    elapsed = (double) tv.tv_sec + (double) tv.tv_usec / 1000000.0;
#endif

    OPAL_CR_NOOP_PROGRESS();

    return elapsed;
}
/*
 * Set how often opal_progress() trips the event library: polltime is in
 * microseconds on timer builds (adjusted to cycles below when the native
 * timer counts cycles) or in calls on counter builds.  polltime == 0
 * means "effectively never": once per minute with timers, or after
 * INT_MAX calls otherwise.
 */
void opal_progress_set_event_poll_rate(int polltime)
{
    OPAL_OUTPUT((debug_output, "progress: progress_set_event_poll_rate(%d)", polltime));

    /* reset tracking state before installing the new rate */
#if OPAL_PROGRESS_USE_TIMERS
    event_progress_delta = 0;
# if OPAL_TIMER_USEC_NATIVE
    event_progress_last_time = opal_timer_base_get_usec();
# else
    event_progress_last_time = opal_timer_base_get_cycles();
# endif
#else
    event_progress_counter = event_progress_delta = 0;
#endif

    if (polltime == 0) {
#if OPAL_PROGRESS_USE_TIMERS
        /* user specified as never tick - tick once per minute */
        event_progress_delta = 60 * 1000000;
#else
        /* user specified as never tick - don't count often */
        event_progress_delta = INT_MAX;
#endif
    } else {
#if OPAL_PROGRESS_USE_TIMERS
        event_progress_delta = polltime;
#else
        /* subtract one so that we can do post-fix subtraction
           in the inner loop and go faster */
        event_progress_delta = polltime - 1;
#endif
    }
#if OPAL_PROGRESS_USE_TIMERS && !OPAL_TIMER_USEC_NATIVE
    /* going to use cycles for counter.  Adjust specified usec into cycles */
    event_progress_delta = event_progress_delta * opal_timer_base_get_freq() / 1000000;
#endif
}
/*
 * MPI_Wtime: elapsed wall-clock seconds relative to the first call.
 * The origin is captured lazily into ompi_wtime_time_origin on the
 * first invocation, so the first call returns ~0.0.
 */
double MPI_Wtime(void)
{
    double wtime;

    /*
     * See https://github.com/open-mpi/ompi/issues/3003 to find out
     * what's happening here.
     */
#if 0
#if OPAL_TIMER_CYCLE_NATIVE
    wtime = ((double) opal_timer_base_get_cycles()) / opal_timer_base_get_freq();
#elif OPAL_TIMER_USEC_NATIVE
    wtime = ((double) opal_timer_base_get_usec()) / 1000000.0;
#endif
#else
#if defined(__linux__) && OPAL_HAVE_CLOCK_GETTIME
    struct timespec tp;
    (void) clock_gettime(CLOCK_MONOTONIC, &tp);
    /* First call establishes the time origin.  NOTE(review): this lazy
       init is a plain read/write, not atomic -- concurrent first calls
       could race; confirm the first MPI_Wtime() is serialized. */
    if( OPAL_UNLIKELY(0 == ompi_wtime_time_origin.tv_sec) ) {
        ompi_wtime_time_origin = tp;
    }
    /* the nsec difference may be negative; the signed seconds term
       added next compensates, so the sum remains correct */
    wtime = (double)(tp.tv_nsec - ompi_wtime_time_origin.tv_nsec)/1.0e+9;
    wtime += (tp.tv_sec - ompi_wtime_time_origin.tv_sec);
#else
    /* Fall back to gettimeofday() if we have nothing else */
    struct timeval tv;
    gettimeofday(&tv, NULL);
    if( OPAL_UNLIKELY(0 == ompi_wtime_time_origin.tv_sec) ) {
        ompi_wtime_time_origin = tv;
    }
    /* same scheme as above, with microsecond resolution */
    wtime = (double)(tv.tv_usec - ompi_wtime_time_origin.tv_usec) / 1.0e+6;
    wtime += (tv.tv_sec - ompi_wtime_time_origin.tv_sec);
#endif
#endif

    OPAL_CR_NOOP_PROGRESS();
    return wtime;
}
/*
 * Progress the event library once, guarded by an atomic try-lock so
 * only one thread runs the event loop at a time.  Returns the number
 * of events progressed (0 if event progress is disabled or another
 * thread already holds the lock).
 */
static int opal_progress_events(void)
{
    /* 0 = unlocked; swapped to 1 by the thread that wins entry */
    static opal_atomic_int32_t lock = 0;
    int events = 0;

    /* proceed only if event progress is enabled AND we won the lock;
       the swap returns the previous value, so 0 means we acquired it */
    if( opal_progress_event_flag != 0 && !OPAL_THREAD_SWAP_32(&lock, 1) ) {
#if OPAL_HAVE_WORKING_EVENTOPS
#if OPAL_PROGRESS_USE_TIMERS
#if OPAL_PROGRESS_ONLY_USEC_NATIVE
        opal_timer_t now = opal_timer_base_get_usec();
#else
        opal_timer_t now = opal_timer_base_get_cycles();
#endif /* OPAL_PROGRESS_ONLY_USEC_NATIVE */
        /* trip the event library if we've reached our tick rate and we
           are enabled; with active event users, back-date the timestamp
           so the next call trips again immediately */
        if (now - event_progress_last_time > event_progress_delta ) {
            event_progress_last_time = (num_event_users > 0) ?
                now - event_progress_delta : now;

            events += opal_event_loop(opal_sync_event_base, opal_progress_event_flag);
        }
#else /* OPAL_PROGRESS_USE_TIMERS */
        /* counter variant: decrement once per call, trip at zero */
        if (OPAL_THREAD_ADD_FETCH32(&event_progress_counter, -1) <= 0 ) {
            event_progress_counter =
                (num_event_users > 0) ? 0 : event_progress_delta;
            events += opal_event_loop(opal_sync_event_base, opal_progress_event_flag);
        }
#endif /* OPAL_PROGRESS_USE_TIMERS */
#endif /* OPAL_HAVE_WORKING_EVENTOPS */

        /* release the try-lock.  NOTE(review): plain store rather than
           an atomic release -- presumably sufficient on the targeted
           platforms; confirm memory-ordering requirements. */
        lock = 0;
    }

    return events;
}
/*
 * MPI_Wtime: wall-clock time in seconds using the best timer this
 * build provides (native usec timer, Windows cycle counter, or
 * gettimeofday() as the last resort).
 */
double MPI_Wtime(void)
{
    double elapsed;

#if OPAL_TIMER_USEC_NATIVE
    /* We may or may not have native usec precision on Windows, so put
       this #if before the #ifdef checking for Windows. */
    elapsed = ((double) opal_timer_base_get_usec()) / 1000000.0;
#elif defined(__WINDOWS__)
    /* Cycle counter scaled by its frequency gives seconds. */
    elapsed = ((double) opal_timer_base_get_cycles()) / ((double) opal_timer_base_get_freq());
#else
    /* Fall back to gettimeofday() if we have nothing else */
    struct timeval tv;
    gettimeofday(&tv, NULL);
    elapsed = (double) tv.tv_sec + (double) tv.tv_usec / 1000000.0;
#endif

    OPAL_CR_NOOP_PROGRESS();

    return elapsed;
}
/* init the progress engine - called from orte_init */
int opal_progress_init(void)
{
    /* reentrant issues: the progress lock guards against recursive /
       concurrent entry when thread support is compiled in */
#if OMPI_HAVE_THREAD_SUPPORT
    opal_atomic_init(&progress_lock, OPAL_ATOMIC_UNLOCKED);
#endif /* OMPI_HAVE_THREAD_SUPPORT */

    /* always call sched yield when in the rte only... */
    call_yield = 1;

    /* start with a zero tick delta and a fresh timestamp/counter;
       opal_progress_mpi_enable() installs the real rate later */
#if OPAL_PROGRESS_USE_TIMERS
    event_progress_delta = 0;
#if OPAL_TIMER_USEC_NATIVE
    event_progress_last_time = opal_timer_base_get_usec();
#else
    event_progress_last_time = opal_timer_base_get_cycles();
#endif
#else
    event_progress_counter = event_progress_delta = 0;
#endif

    return OPAL_SUCCESS;
}
/*
 * Handle a PTL_EVENT_REPLY for one rendezvous-GET fragment.
 *
 * On PTL_NI_OK: accumulate the received length into the request status,
 * return the fragment to its freelist, and when this was the last
 * outstanding reply, unpack the data and complete the request.
 *
 * On PTL_NI_DROPPED: retry the PtlGet() until the fragment's absolute
 * timeout expires.  Any other failure (or a timed-out / failed retry)
 * records an MPI error on the request and completes it.
 *
 * Always returns OMPI_SUCCESS; errors are reported through the
 * request's MPI_ERROR field, not the return value.
 */
static int
ompi_mtl_portals4_rndv_get_frag_progress(ptl_event_t *ev,
                                         ompi_mtl_portals4_rndv_get_frag_t* rndv_get_frag)
{
    int ret;
    ompi_mtl_portals4_recv_request_t* ptl_request =
        (ompi_mtl_portals4_recv_request_t*) rndv_get_frag->request;

    assert(PTL_EVENT_REPLY == ev->type);

    OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_framework.framework_output,
                         "Recv %lu (0x%lx) got reply event",
                         ptl_request->opcount, ptl_request->hdr_data));

    if (OPAL_UNLIKELY(ev->ni_fail_type != PTL_NI_OK)) {
        opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                            "%s:%d: PTL_EVENT_REPLY with ni_fail_type: %d",
                            __FILE__, __LINE__, ev->ni_fail_type);

        /* only dropped replies are retryable; anything else is fatal */
        if (OPAL_UNLIKELY(ev->ni_fail_type != PTL_NI_DROPPED)) {
            opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                                "PTL_EVENT_REPLY with ni_fail_type: %u => cannot retry",
                                (uint32_t)ev->ni_fail_type);
            ret = PTL_FAIL;
            goto callback_error;
        }

        if (0 == rndv_get_frag->frag_abs_timeout_usec) {
            /* this is the first retry of the frag.  start the timer. */
            /* instead of recording the start time, record the end time
             * and avoid addition on each retry. */
            rndv_get_frag->frag_abs_timeout_usec =
                opal_timer_base_get_usec() + ompi_mtl_portals4.get_retransmit_timeout;
            opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                                "setting frag timeout at %lu",
                                rndv_get_frag->frag_abs_timeout_usec);
        } else if (opal_timer_base_get_usec() >= rndv_get_frag->frag_abs_timeout_usec) {
            /* retried past the deadline: give up on this fragment */
            opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                                "timeout retrying GET");
            ret = PTL_FAIL;
            goto callback_error;
        }

        OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_framework.framework_output,
                             "Rendezvous Get Failed: Reissuing frag #%u",
                             rndv_get_frag->frag_num));

        /* reissue the GET with the fragment's saved parameters */
        ret = PtlGet(ompi_mtl_portals4.send_md_h,
                     (ptl_size_t) rndv_get_frag->frag_start,
                     rndv_get_frag->frag_length,
                     rndv_get_frag->frag_target,
                     ompi_mtl_portals4.read_idx,
                     rndv_get_frag->frag_match_bits,
                     rndv_get_frag->frag_remote_offset,
                     rndv_get_frag);
        if (OPAL_UNLIKELY(PTL_OK != ret)) {
            /* free(NULL) is a no-op, so no guard is needed here */
            free(ptl_request->buffer_ptr);
            goto callback_error;
        }

        return OMPI_SUCCESS;
    }

    /* set the received length in the status, now that we know
       exactly how much data was sent. */
    ptl_request->super.super.ompi_req->req_status._ucount += ev->mlength;

    /* this frag is complete.  return to freelist. */
    opal_free_list_return (&ompi_mtl_portals4.fl_rndv_get_frag,
                           &rndv_get_frag->super);

    /* more replies outstanding?  then the request isn't done yet */
    ret = OPAL_THREAD_ADD32(&(ptl_request->pending_reply), -1);
    if (ret > 0) {
        return OMPI_SUCCESS;
    }
    assert(ptl_request->pending_reply == 0);

#if OMPI_MTL_PORTALS4_FLOW_CONTROL
    OPAL_THREAD_ADD32(&ompi_mtl_portals4.flowctl.send_slots, 1);
#endif

    /* make sure the data is in the right place.  Use _ucount for the
       total length because it will be set correctly for all three
       protocols.  mlength is only correct for eager, and delivery_len
       is the length of the buffer, not the length of the send. */
    ret = ompi_mtl_datatype_unpack(ptl_request->convertor,
                                   ptl_request->delivery_ptr,
                                   ptl_request->super.super.ompi_req->req_status._ucount);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
        opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                            "%s:%d: ompi_mtl_datatype_unpack failed: %d",
                            __FILE__, __LINE__, ret);
        ptl_request->super.super.ompi_req->req_status.MPI_ERROR = ret;
    }

    OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_framework.framework_output,
                         "Recv %lu (0x%lx) completed , reply (pending_reply: %d)",
                         ptl_request->opcount, ptl_request->hdr_data,
                         ptl_request->pending_reply));
    ptl_request->super.super.completion_callback(&ptl_request->super.super);

    return OMPI_SUCCESS;

callback_error:
    /* record the Portals error on the request and complete it */
    ptl_request->super.super.ompi_req->req_status.MPI_ERROR =
        ompi_mtl_portals4_get_error(ret);
    ptl_request->super.super.completion_callback(&ptl_request->super.super);
    return OMPI_SUCCESS;
}
/*
 * Progress the event library and any functions that have registered to
 * be called.  We don't propagate errors from the progress functions,
 * so no action is taken if they return failures.  The functions are
 * expected to return the number of events progressed, to determine
 * whether or not we should call sched_yield() during MPI progress.
 * This is only loosely tracked, as an error return can cause the number
 * of progressed events to appear lower than it actually is.  We don't
 * care, as the cost of that happening is far outweighed by the cost
 * of the if checks (they were resulting in bad pipe stalling behavior)
 */
void opal_progress(void)
{
    size_t i;
    /* total events progressed in this pass; drives the yield decision */
    int events = 0;

    /* track recursion depth (opal_progress may be re-entered from a
       callback); atomic only when thread support is compiled in */
#if OPAL_HAVE_THREAD_SUPPORT
    opal_atomic_add(&opal_progress_recursion_depth_counter, 1);
#else
    ++opal_progress_recursion_depth_counter;
#endif
    if( opal_progress_event_flag != 0 ) {
#if (OPAL_ENABLE_PROGRESS_THREADS == 0) && OPAL_HAVE_WORKING_EVENTOPS
#if OPAL_PROGRESS_USE_TIMERS
#if OPAL_TIMER_USEC_NATIVE
        opal_timer_t now = opal_timer_base_get_usec();
#else
        opal_timer_t now = opal_timer_base_get_cycles();
#endif /* OPAL_TIMER_USEC_NATIVE */
        /* trip the event library if we've reached our tick rate and we
           are enabled; with active event users, back-date the timestamp
           so the next call trips again immediately */
        if (now - event_progress_last_time > event_progress_delta ) {
            event_progress_last_time = (num_event_users > 0) ?
                now - event_progress_delta : now;

            events += opal_event_loop(opal_progress_event_flag);
        }
#else /* OPAL_PROGRESS_USE_TIMERS */
        /* counter variant: decrement once per call, trip at zero */
        if (OPAL_THREAD_ADD32(&event_progress_counter, -1) <= 0 ) {
            event_progress_counter =
                (num_event_users > 0) ? 0 : event_progress_delta;
            events += opal_event_loop(opal_progress_event_flag);
        }
#endif /* OPAL_PROGRESS_USE_TIMERS */
#endif /* OPAL_ENABLE_PROGRESS_THREADS == 0 && OPAL_HAVE_WORKING_EVENTOPS */
    }

    /* progress all registered callbacks */
    for (i = 0 ; i < callbacks_len ; ++i) {
        events += (callbacks[i])();
    }

#if defined(__WINDOWS__) || defined(HAVE_SCHED_YIELD)
    if (call_yield && events <= 0) {
        /* If there is nothing to do - yield the processor - otherwise
         * we could consume the processor for the entire time slice. If
         * the processor is oversubscribed - this will result in a best-case
         * latency equivalent to the time-slice. */
#if defined(__WINDOWS__)
        SwitchToThread();
#else
        sched_yield();
#endif /* defined(__WINDOWS__) */
    }
#endif /* defined(__WINDOWS__) || defined(HAVE_SCHED_YIELD) */

    /* leaving: undo the recursion-depth bump from function entry */
#if OPAL_HAVE_THREAD_SUPPORT
    opal_atomic_add(&opal_progress_recursion_depth_counter, -1);
#else
    --opal_progress_recursion_depth_counter;
#endif
}
/* turn on MPI optimizations */ int opal_progress_mpi_enable(void) { int param, value; /* call sched yield when oversubscribed. */ param = mca_base_param_find("mpi", NULL, "yield_when_idle"); mca_base_param_lookup_int(param, &value); if (value < 0) { /* this should never happen set to 1 if it somehow does */ call_yield = 1; } else { call_yield = value; } /* set the event tick rate */ param = mca_base_param_find("mpi", NULL, "event_tick_rate"); mca_base_param_lookup_int(param, &value); if (value < 0) { /* user didn't specify - default tick rate */ event_progress_delta = opal_progress_default_tick_rate; } else if (value == 0) { #if OPAL_PROGRESS_USE_TIMERS /* user specified as never tick - tick once per minute */ event_progress_delta = 60 * 1000000; #else /* user specified as never tick - don't count often */ event_progress_delta = INT_MAX; #endif } else { #if OPAL_PROGRESS_USE_TIMERS event_progress_delta = value; #else /* subtract one so that we can do post-fix subtraction in the inner loop and go faster */ event_progress_delta = value - 1; #endif } #if OPAL_PROGRESS_USE_TIMERS && !OPAL_TIMER_USEC_NATIVE /* going to use cycles for counter. Adjust specified usec into cycles */ event_progress_delta = event_progress_delta * opal_timer_base_get_freq() / 1000000; #endif #if OPAL_PROGRESS_USE_TIMERS #if OPAL_TIMER_USEC_NATIVE event_progress_last_time = opal_timer_base_get_usec(); #else event_progress_last_time = opal_timer_base_get_cycles(); #endif #else /* it's possible that an init function bumped up our tick rate. * If so, set the event_progress counter to 0. Otherwise, set it to * the reset value */ event_progress_counter = (event_num_mpi_users > 0) ? 0 : event_progress_delta; #endif return OPAL_SUCCESS; }