/**
 * Second step of the GVT computation for one PE.
 *
 * Does nothing unless me->gvt_status is TW_GVT_COMPUTE.  Otherwise:
 *   1. Repeatedly calls tw_net_read() and sum-reduces (white sent - white
 *      received) across MPI_COMM_WORLD until the global sum is zero.
 *   2. Takes the minimum of the transient-message timestamp, the priority
 *      queue minimum and the network minimum, and min-reduces it across
 *      MPI_COMM_WORLD to obtain the new GVT.
 *   3. Tracks how many consecutive rounds left GVT unchanged and reports
 *      through tw_error() when the configured limit is reached, or when
 *      the new GVT is lower than the PE's current GVT.
 *   4. Resets the per-round counters, stores the new GVT, updates timing
 *      statistics and, under the OPTIMISTIC protocol only, fossil collects.
 *
 * @param me  PE whose GVT state is being updated.
 */
void
tw_gvt_step2(tw_pe *me)
{
    long long white_delta = 0;
    long long white_sum = 0;
    tw_stime queue_floor = DBL_MAX;
    tw_stime network_floor = DBL_MAX;
    tw_stime local_floor;
    tw_stime new_gvt;
    tw_clock fc_begin;
    tw_clock t_begin = tw_clock_read();

    if (me->gvt_status != TW_GVT_COMPUTE) {
        return;
    }

    /* Exchange message counts until every white message sent anywhere
     * has also been received, i.e. the cut is consistent. */
    do {
        tw_net_read(me);

        white_delta = me->s_nwhite_sent - me->s_nwhite_recv;
        all_reduce_cnt++;

        if (MPI_Allreduce(&white_delta, &white_sum, 1, MPI_LONG_LONG,
                          MPI_SUM, MPI_COMM_WORLD) != MPI_SUCCESS) {
            tw_error(TW_LOC, "MPI_Allreduce for GVT failed");
        }
    } while (white_sum != 0);

    queue_floor = tw_pq_minimum(me->pq);
    network_floor = tw_net_minimum(me);

    /* Local virtual time: the smallest of the three candidates. */
    local_floor = me->trans_msg_ts;
    if (queue_floor < local_floor) {
        local_floor = queue_floor;
    }
    if (network_floor < local_floor) {
        local_floor = network_floor;
    }

    all_reduce_cnt++;
    if (MPI_Allreduce(&local_floor, &new_gvt, 1, MPI_DOUBLE,
                      MPI_MIN, MPI_COMM_WORLD) != MPI_SUCCESS) {
        tw_error(TW_LOC, "MPI_Allreduce for GVT failed");
    }

    new_gvt = ROSS_MIN(new_gvt, me->GVT_prev);

    if (new_gvt == me->GVT_prev) {
        g_tw_gvt_no_change++;
        if (g_tw_gvt_no_change >= g_tw_gvt_max_no_change) {
            tw_error(
                TW_LOC,
                "GVT computed %d times in a row"
                " without changing: GVT = %14.14lf, PREV %14.14lf"
                " -- GLOBAL SYNCH -- out of memory!",
                g_tw_gvt_no_change, new_gvt, me->GVT_prev);
        }
    } else {
        g_tw_gvt_no_change = 0;
    }

    if (new_gvt < me->GVT) {
        tw_error(TW_LOC, "PE %u GVT decreased %g -> %g",
                 me->id, me->GVT, new_gvt);
    }

    /* Progress output is produced by the master node only. */
    if (new_gvt / g_tw_ts_end > percent_complete) {
        if (g_tw_mynode == g_tw_masternode) {
            gvt_print(new_gvt);
        }
    }

    /* Reset the per-round state.  Note that GVT_prev is reset to DBL_MAX
     * here, not to the GVT value that was just computed. */
    me->gvt_status = TW_GVT_NORMAL;
    me->GVT = new_gvt;
    me->GVT_prev = DBL_MAX;
    me->trans_msg_ts = DBL_MAX;
    me->s_nwhite_recv = 0;
    me->s_nwhite_sent = 0;

    gvt_cnt = 0;

    /* Account the time spent in this GVT round. */
    me->stats.s_gvt += tw_clock_read() - t_begin;

    /* Fossil collection is performed for the OPTIMISTIC protocol only. */
    if (g_tw_synchronization_protocol == OPTIMISTIC) {
        fc_begin = tw_clock_read();
        tw_pe_fossil_collect(me);
        me->stats.s_fossil_collect += tw_clock_read() - fc_begin;
    }

    g_tw_gvt_done++;
}
/**
 * @brief Determines how to handle the newly received event.
 *
 * Updates the receive counters, resolves the destination LP, and then
 * routes the event: a cancel message is matched against the hash table
 * and the matched event is put on the destination PE's cancel queue; a
 * regular event is either enqueued directly in the priority queue (fast
 * case) or pushed onto the destination PE's event queue.
 *
 * @param[in] me pointer to PE
 * @param[in] e pointer to event that we just received
 * @param[in] buffer not currently used
 */
static void
recv_finish(tw_pe *me, tw_event *e, char * buffer)
{
    tw_pe *target_pe;
    tw_clock t0;
    int optimistic_mode;

    (void) buffer;

    me->stats.s_nread_network++;
    me->s_nwhite_recv++;

    /* The event arrives carrying an LP id in dest_lp; swap it for the
     * local LP pointer. */
    e->dest_lp = tw_getlocal_lp((tw_lpid) e->dest_lp);
    target_pe = e->dest_lp->pe;

    /* Instrumentation counters at KP and LP level. */
    e->dest_lp->kp->kp_stats->s_nread_network++;
    e->dest_lp->lp_stats->s_nread_network++;

    if (e->send_pe > tw_nnodes()-1) {
        tw_error(TW_LOC, "bad sendpe_id: %d", e->send_pe);
    }

    e->cause_next = NULL;
    e->caused_by_me = NULL;
    e->cancel_next = NULL;

    if (e->recv_ts < me->GVT) {
        tw_error(TW_LOC, "%d: Received straggler from %d: %lf (%d)",
                 me->id, e->send_pe, e->recv_ts, e->state.cancel_q);
    }

    if (tw_gvt_inprogress(me)) {
        me->trans_msg_ts = ROSS_MIN(me->trans_msg_ts, e->recv_ts);
    }

    /* A cancel message: pull the matching event out of the hash table and
     * queue it for cancellation; the message itself is freed. */
    if (e->state.cancel_q) {
        tw_event *victim = tw_hash_remove(me->hash_t, e, e->send_pe);

        /* NOTE: it is possible to cancel the event we are currently
         * processing at this PE since this MPI module lets me read
         * cancel events during event sends over the network. */
        victim->state.cancel_q = 1;
        victim->state.remote = 0;

        victim->cancel_next = target_pe->cancel_q;
        target_pe->cancel_q = victim;

        tw_event_free(me, e);
        return;
    }

    /* Under any optimistic protocol, record the event in the hash table
     * and mark it as remote. */
    optimistic_mode =
        g_tw_synchronization_protocol == OPTIMISTIC ||
        g_tw_synchronization_protocol == OPTIMISTIC_DEBUG ||
        g_tw_synchronization_protocol == OPTIMISTIC_REALTIME;
    if (optimistic_mode) {
        tw_hash_insert(me->hash_t, e, e->send_pe);
        e->state.remote = 1;
    }

    /* NOTE: the final check in the conditional below was added to make
     * sure that we do not execute the fast case unless the cancellation
     * queue is empty on the destination PE.  Otherwise we need to invoke
     * the normal scheduling routines to make sure that a forward event
     * doesn't bypass a cancellation event with an earlier timestamp.
     * This is helpful for stateful models that produce incorrect results
     * when presented with duplicate messages with no rollback between
     * them. */
    if (me == target_pe &&
        e->dest_lp->kp->last_time <= e->recv_ts &&
        target_pe->cancel_q == NULL) {
        /* Fast case: the event is for our own PE and causes no
         * rollback, so it goes straight into the priority queue. */
        t0 = tw_clock_read();
        tw_pq_enqueue(target_pe->pq, e);
        target_pe->stats.s_pq += tw_clock_read() - t0;
        return;
    }

    if (me->id != target_pe->id) {
        /* Never should happen; MPI should have gotten the message to the
         * correct node without needing us to redirect the message there
         * for it.  This is probably a serious bug with the event headers
         * not being formatted right. */
        tw_error(
            TW_LOC,
            "Event recived by PE %u but meant for PE %u",
            me->id,
            target_pe->id);
        return;
    }

    /* Slower, but still local: put into top of the PE's event_q. */
    e->state.owner = TW_pe_event_q;
    tw_eventq_push(&target_pe->event_q, e);
}
/**
 * Send an event from its source LP to its destination LP.
 *
 * The PE's abort event is never delivered: it only sets cev_abort on the
 * sending PE when its timestamp is below g_tw_ts_end.  Any other event is
 * linked to the event currently being processed and then routed by the
 * PE id returned from the source LP's mapping function:
 *   - same node, same PE, no rollback needed: enqueued directly in the
 *     priority queue;
 *   - same node otherwise: pushed onto the destination PE's event queue;
 *   - different node: handed to tw_net_send().
 *
 * Except on the fast path, the sending PE's transient-message timestamp
 * is lowered to the event's receive time while a GVT computation is in
 * progress.
 *
 * @param event  event to send; on entry its dest_lp field carries the
 *               destination LP id (it is cast to tw_lpid for mapping).
 */
void
tw_event_send(tw_event * event)
{
    tw_lp *origin_lp = event->src_lp;
    tw_pe *origin_pe = origin_lp->pe;
    tw_pe *target_pe = NULL;
    tw_peid target_peid = -1;
    tw_stime ts = event->recv_ts;

    if (event == origin_pe->abort_event) {
        if (ts < g_tw_ts_end) {
            origin_pe->cev_abort = 1;
        }
        return;
    }

    /* Under the CONSERVATIVE protocol an event scheduled closer than the
     * lookahead is reported as an error. */
    if (g_tw_synchronization_protocol == CONSERVATIVE &&
        ts - tw_now(origin_lp) < g_tw_lookahead) {
        tw_error(TW_LOC, "Lookahead violation: decrease g_tw_lookahead");
    }

    if (event->out_msgs) {
        tw_error(TW_LOC, "It is an error to send an event with pre-loaded output message.");
    }

    link_causality(event, origin_pe->cur_event);

    /* Ask the LP's remote mapping function which PE owns the target. */
    target_peid = (*origin_lp->type->map) ((tw_lpid) event->dest_lp);

    if (target_peid != g_tw_mynode) {
        /* Slowest approach of all; this is not a local event.  It has
         * to go over the network to the other PE for processing. */
        origin_pe->stats.s_nsend_net_remote++;
        event->state.owner = TW_net_asend;
        tw_net_send(event);
    } else {
        event->dest_lp = tw_getlocal_lp((tw_lpid) event->dest_lp);
        target_pe = event->dest_lp->pe;

        if (origin_pe == target_pe &&
            event->dest_lp->kp->last_time <= ts) {
            /* Fast case, we are sending to our own PE and there is no
             * rollback caused by this send.  We cannot have any
             * transient messages on local sends so we can return. */
            tw_pq_enqueue(origin_pe->pq, event);
            return;
        }

        /* Slower, but still local send, so put into top of the
         * destination PE's event_q. */
        event->state.owner = TW_pe_event_q;
        tw_eventq_push(&target_pe->event_q, event);

        if (origin_pe != target_pe) {
            origin_pe->stats.s_nsend_loc_remote++;
        }
    }

    if (tw_gvt_inprogress(origin_pe)) {
        origin_pe->trans_msg_ts = ROSS_MIN(origin_pe->trans_msg_ts, ts);
    }
}
/**
 * Second step of the GVT computation for one PE.
 *
 * Does nothing unless me->gvt_status is TW_GVT_COMPUTE.  Otherwise:
 *   1. Repeatedly calls tw_net_read() (timed into stats.s_net_read) and
 *      sum-reduces (white sent - white received) across MPI_COMM_ROSS
 *      until the global sum is zero.
 *   2. Takes the minimum of the transient-message timestamp, the priority
 *      queue minimum and the network minimum, and min-reduces it across
 *      MPI_COMM_ROSS to obtain the new GVT.
 *   3. Tracks how many consecutive rounds left GVT unchanged and reports
 *      through tw_error() when the configured limit is reached, or when
 *      the new GVT is lower than the PE's current GVT.
 *   4. Resets the per-round counters, stores the new GVT, updates timing
 *      statistics and fossil collects under OPTIMISTIC and
 *      OPTIMISTIC_REALTIME.
 *   5. Runs the GVT-based instrumentation hooks, bumps g_tw_gvt_done and
 *      restarts the GVT interval cycle counter.
 *
 * @param me  PE whose GVT state is being updated.
 */
void
tw_gvt_step2(tw_pe *me)
{
    long long white_delta = 0;
    long long white_sum = 0;
    tw_stime queue_floor = DBL_MAX;
    tw_stime network_floor = DBL_MAX;
    tw_stime local_floor;
    tw_stime new_gvt;
    tw_clock read_begin;
    tw_clock fc_begin;
    tw_clock t_begin = tw_clock_read();
    int engine_gvt_sampling;
    int model_gvt_sampling;

    if (me->gvt_status != TW_GVT_COMPUTE) {
        return;
    }

    /* Exchange message counts until every white message sent anywhere
     * has also been received, i.e. the cut is consistent. */
    do {
        read_begin = tw_clock_read();
        tw_net_read(me);
        me->stats.s_net_read += tw_clock_read() - read_begin;

        white_delta = me->s_nwhite_sent - me->s_nwhite_recv;
        all_reduce_cnt++;

        if (MPI_Allreduce(&white_delta, &white_sum, 1, MPI_LONG_LONG,
                          MPI_SUM, MPI_COMM_ROSS) != MPI_SUCCESS) {
            tw_error(TW_LOC, "MPI_Allreduce for GVT failed");
        }
    } while (white_sum != 0);

    queue_floor = tw_pq_minimum(me->pq);
    network_floor = tw_net_minimum(me);

    /* Local virtual time: the smallest of the three candidates. */
    local_floor = me->trans_msg_ts;
    if (queue_floor < local_floor) {
        local_floor = queue_floor;
    }
    if (network_floor < local_floor) {
        local_floor = network_floor;
    }

    all_reduce_cnt++;
    if (MPI_Allreduce(&local_floor, &new_gvt, 1, MPI_DOUBLE,
                      MPI_MIN, MPI_COMM_ROSS) != MPI_SUCCESS) {
        tw_error(TW_LOC, "MPI_Allreduce for GVT failed");
    }

    new_gvt = ROSS_MIN(new_gvt, me->GVT_prev);

    if (new_gvt == me->GVT_prev) {
        g_tw_gvt_no_change++;
        if (g_tw_gvt_no_change >= g_tw_gvt_max_no_change) {
            tw_error(
                TW_LOC,
                "GVT computed %d times in a row"
                " without changing: GVT = %14.14lf, PREV %14.14lf"
                " -- GLOBAL SYNCH -- out of memory!",
                g_tw_gvt_no_change, new_gvt, me->GVT_prev);
        }
    } else {
        g_tw_gvt_no_change = 0;
    }

    if (new_gvt < me->GVT) {
        tw_error(TW_LOC, "PE %u GVT decreased %g -> %g",
                 me->id, me->GVT, new_gvt);
    }

    /* Progress output is produced by the master node only. */
    if (new_gvt / g_tw_ts_end > percent_complete) {
        if (g_tw_mynode == g_tw_masternode) {
            gvt_print(new_gvt);
        }
    }

    /* Reset the per-round state.  Note that GVT_prev is reset to DBL_MAX
     * here, not to the GVT value that was just computed. */
    me->gvt_status = TW_GVT_NORMAL;
    me->GVT = new_gvt;
    me->GVT_prev = DBL_MAX;
    me->trans_msg_ts = DBL_MAX;
    me->s_nwhite_recv = 0;
    me->s_nwhite_sent = 0;

    gvt_cnt = 0;

    /* Account the time spent in this GVT round. */
    me->stats.s_gvt += tw_clock_read() - t_begin;

    /* Fossil collect for OPTIMISTIC or REALTIME; not done in DEBUG mode. */
    if (g_tw_synchronization_protocol == OPTIMISTIC ||
        g_tw_synchronization_protocol == OPTIMISTIC_REALTIME) {
        fc_begin = tw_clock_read();
        tw_pe_fossil_collect(me);
        me->stats.s_fossil_collect += tw_clock_read() - fc_begin;
    }

    /* ---- instrumentation ---- */

    /* The sampling-interval modulo is evaluated only after the mode test
     * has passed, exactly as in a single short-circuit expression. */
    engine_gvt_sampling =
        (g_st_engine_stats == GVT_STATS || g_st_engine_stats == ALL_STATS);
    if (engine_gvt_sampling &&
        g_tw_gvt_done % g_st_num_gvt == 0 &&
        new_gvt <= g_tw_ts_end) {
#ifdef USE_DAMARIS
        if (!g_st_damaris_enabled) {
            st_collect_engine_data(me, GVT_COL);
        } else {
            st_damaris_expose_data(me, new_gvt, GVT_COL);
            st_damaris_end_iteration();
        }
#else
        st_collect_engine_data(me, GVT_COL);
#endif
    }

#ifdef USE_DAMARIS
    /* damaris_end_iteration must still be called when GVT-based engine
     * instrumentation is not the active mode.  If one PE enters this
     * branch, all of them have to; otherwise deadlock. */
    if (g_st_damaris_enabled &&
        (g_st_engine_stats == RT_STATS || g_st_engine_stats == VT_STATS)) {
        st_damaris_end_iteration();
    }
#endif

    model_gvt_sampling =
        (g_st_model_stats == GVT_STATS || g_st_model_stats == ALL_STATS);
    if (model_gvt_sampling && g_tw_gvt_done % g_st_num_gvt == 0) {
        st_collect_model_data(me,
                              (tw_stime)tw_clock_read() / g_tw_clock_rate,
                              GVT_STATS);
    }

    st_inst_dump();
    /* ---- end of instrumentation ---- */

    g_tw_gvt_done++;

    /* Reset for the next GVT round -- for use in realtime GVT mode only. */
    g_tw_gvt_interval_start_cycles = tw_clock_read();
}