/**
 * Drives the executive through freeze, run and stop requests and checks the
 * exec command (or terminate time) reported after each one.
 */
TEST_F(ExecutiveTest, Moding) {
    //req.add_requirement("411993330 2253353824 2874482372");

    // Value passed to stop() and expected back from exec_get_terminate_time().
    const double requested_stop_time = 123.0;
    const double time_tolerance = 0.000001;

    // freeze() followed by async_freeze_to_exec_command() must report FreezeCmd.
    exec.freeze();
    exec.async_freeze_to_exec_command();
    EXPECT_EQ(exec_get_exec_command(), FreezeCmd);

    // run() must report RunCmd.
    exec.run();
    EXPECT_EQ(exec_get_exec_command(), RunCmd);

    // stop() with no argument must report ExitCmd.
    exec.stop();
    EXPECT_EQ(exec_get_exec_command(), ExitCmd);

    // stop(t) must leave the terminate time at t (within tolerance).
    exec.stop(requested_stop_time);
    EXPECT_NEAR(exec_get_terminate_time(), requested_stop_time, time_tolerance);
}
int Trick::SlaveInfo::write_master_status() { /** @par Detailed Design: */ /** @li If the slave is an active synchronization partner (activated == true) */ /** @li and we are not currently waiting for slave to reconnect, */ if (( activated == true ) && (reconnect_count == 0)) { /** @li write the current time according to the master to the slave */ connection->write_time(exec_get_time_tics()) ; /** @li write the current exec_command according to the master to the slave */ connection->write_command((MS_SIM_COMMAND)exec_get_exec_command()) ; } if ((MS_SIM_COMMAND)exec_get_exec_command() == MS_ChkpntLoadBinCmd) { // dmtcp slave will exit, so stop writing status to slave until it reconnects // reconnect_count prevents us from writing status to slave, & is incremented every freeze cycle until we have reconnected reconnect_count = 1; } return(0) ; }
int Trick::Master::checkpoint() { /** @par Detailed Design: */ /** @li If chkpnt_dump_auto, tell slave to dump a checkpoint */ unsigned int ii ; // do not tell slave to dump if this is a pre_init, post_init, or end checkpoint // those are handled with flags sent to slave in init() if ((exec_get_mode() == Initialization) || (exec_get_mode() == ExitMode)) { return(0); } if (enabled) { // Use 2 loops to read all slave status before writing any status out. for ( ii = 0 ; ii < slaves.size() ; ii++ ) { slaves[ii]->read_slave_status() ; } SIM_COMMAND save_command = exec_get_exec_command() ; std::string full_path_name = checkpoint_get_output_file(); for ( ii = 0 ; ii < slaves.size() ; ii++ ) { if (slaves[ii]->chkpnt_dump_auto) { if (slaves[ii]->chkpnt_binary) { if (slaves[ii]->slave_type == "dmtcp") { exec_set_exec_command((SIM_COMMAND)MS_ChkpntDumpBinCmd) ; slaves[ii]->write_master_status() ; slaves[ii]->write_master_chkpnt_name(full_path_name) ; exec_set_exec_command(save_command) ; } else { message_publish(MSG_ERROR, "Slave is not running under dmtcp control so it cannot dump binary checkpoint.\n") ; slaves[ii]->write_master_status() ; } } else { // ascii exec_set_exec_command((SIM_COMMAND)MS_ChkpntDumpAsciiCmd) ; slaves[ii]->write_master_status() ; slaves[ii]->write_master_chkpnt_name(full_path_name) ; exec_set_exec_command(save_command) ; } } else { // no auto dump slaves[ii]->write_master_status() ; } } } return(0) ; }
int Trick::Master::preload_checkpoint() { /** @par Detailed Design: */ /** @li If chkpnt_load_auto, tell slave to load a checkpoint */ unsigned int ii ; if (enabled) { // Use 2 loops to read all slave status before writing any status out. for ( ii = 0 ; ii < slaves.size() ; ii++ ) { slaves[ii]->read_slave_status() ; } SIM_COMMAND save_command = exec_get_exec_command() ; std::string full_path_name = checkpoint_get_load_file(); for ( ii = 0 ; ii < slaves.size() ; ii++ ) { if (slaves[ii]->chkpnt_load_auto) { if (slaves[ii]->chkpnt_binary) { if (slaves[ii]->slave_type == "dmtcp") { exec_set_exec_command((SIM_COMMAND)MS_ChkpntLoadBinCmd) ; slaves[ii]->write_master_status() ; slaves[ii]->write_master_chkpnt_name(full_path_name) ; exec_set_exec_command(save_command) ; } else { message_publish(MSG_ERROR, "Slave is not running under dmtcp control so it cannot load binary checkpoint.\n") ; slaves[ii]->write_master_status() ; } } else { // ascii exec_set_exec_command((SIM_COMMAND)MS_ChkpntLoadAsciiCmd) ; slaves[ii]->write_master_status() ; slaves[ii]->write_master_chkpnt_name(full_path_name) ; exec_set_exec_command(save_command) ; } } else { // no auto load slaves[ii]->write_master_status() ; } } } return(0) ; }
int Trick::SlaveInfo::read_slave_status() { MS_SIM_COMMAND slave_command ; MS_SIM_COMMAND exec_command ; /** @par Detailed Design: */ /** @li If the slave is an active synchronization partner (activated == true) */ if (activated == true) { /** @li read the current slave exec_command */ slave_command = connection->read_command() ; //printf("DEBUG master read %d command from slave\n", slave_command);fflush(stdout); exec_command = (MS_SIM_COMMAND)exec_get_exec_command() ; // fixup: is it possible we won't get slave's Exit command over socket when it terminates?, set it here if that happens if (dynamic_cast<MSSocket*>(connection)) { if ((slave_command == MS_ErrorCmd) && (reconnect_wait_limit > 0.0) && (reconnect_count == 0)) { slave_command = MS_ExitCmd; } } /** @li If the master is not currently exiting, change modes if the slave is freezing/exiting or has an error */ if ( exec_command != MS_ExitCmd ) { switch ( slave_command ) { case (MS_ErrorCmd): /** @li if the user has set a reconnect_wait_limit, continue on if we are still under that limit, otherwise if the current slave mode cannot be read, exit the master if sync_error_terminate == true, otherwise set the activated flag to false */ if ( (reconnect_count * exec_get_freeze_frame()) < reconnect_wait_limit) { reconnect_count++; } else if (sync_error_terminate == true) { message_publish(MSG_ERROR, "Master lost sync with slave, so master is terminating.\n") ; exec_terminate_with_return(-1, __FILE__, __LINE__ , "Master lost sync with slave.") ; } else { message_publish(MSG_ERROR, "Master lost sync with slave, so slave is being deactivated.\n") ; activated = false ; return(0) ; } break ; case (MS_ExitCmd): /** @li if the current slave mode is exiting, exit the master if sync_error_terminate == true. otherwise wait for slave to reconnect. 
when wait limit is 0, set the activated flag to false */ if (sync_error_terminate == true){ message_publish(MSG_WARNING, "sync_error_terminate is true: Slave is exiting, so master is terminating.\n") ; exec_terminate_with_return(-1, __FILE__, __LINE__ , "Slave is exiting, so is the master.") ; } else { message_publish(MSG_WARNING, "Slave is exiting.\n") ; // if reconnect_wait_limit is set, master waits for slave to reconnect (e.g. dmtcp restarting) if (reconnect_wait_limit > 0.0) { message_publish(MSG_WARNING, "Master will wait %f seconds for slave to reconnect.\n", reconnect_wait_limit) ; // make reads (shared mem connection) return quickly so we don't overrun waiting for reconnect // TODO: for socket connection we will overrun in the accept call (see restart_dmtcp_slave) connection->set_sync_wait_limit(exec_get_freeze_frame()); if (chkpnt_binary) { restart_dmtcp_slave(); // restart the slave dmtcp executable } } else { message_publish(MSG_WARNING, "reconnect_wait_limit: 0.0 - Master will stop communicating with slave.\n") ; activated = false ; } return(0) ; } break ; case (MS_ChkpntLoadBinCmd): // slave has received our load command and is now sending us his dmtcp port and checkpoint file name dmtcp_port = connection->read_port() ; connection->read_name(chkpnt_name, sizeof(chkpnt_name)); // dir/filename message_publish(MSG_WARNING , "Master received DMTCP Port and Checkpoint Filename from slave.\n"); connection->write_command((MS_SIM_COMMAND)exec_get_exec_command()) ; // send this as an ack so slove can shut down break ; case (MS_FreezeCmd): /** @li if the current slave is freezing, freeze the master too */ message_publish(MSG_INFO, "Slave is freezing.\n") ; exec_set_exec_command(FreezeCmd) ; reconnect_count = 0; break ; case (MS_ReconnectCmd): // set the sync wait limit back to its default connection->set_sync_wait_limit(sync_wait_limit); message_publish(MSG_INFO, "Master has reconnected to slave.\n") ; reconnect_count = 0; break ; default: break ; } } } 
return(0) ; }