void dmtcp::ProcessInfo::restart() { JASSERT(mprotect((void*)_restoreBufAddr, _restoreBufLen, PROT_NONE) == 0) ((void*)_restoreBufAddr) (_restoreBufLen) (JASSERT_ERRNO); restoreHeap(); // Update the ckptDir string ckptDir = jalib::Filesystem::GetDeviceName(PROTECTED_CKPT_DIR_FD); JASSERT(ckptDir.length() > 0); _real_close(PROTECTED_CKPT_DIR_FD); updateCkptDirFileSubdir(ckptDir); if (_launchCWD != _ckptCWD) { dmtcp::string rpath = ""; size_t llen = _launchCWD.length(); if (Util::strStartsWith(_ckptCWD.c_str(), _launchCWD.c_str()) && _ckptCWD[llen] == '/') { // _launchCWD = "/A/B"; _ckptCWD = "/A/B/C" -> rpath = "./c" rpath = "./" + _ckptCWD.substr(llen + 1); if (chdir(rpath.c_str()) == 0) { JTRACE("Changed cwd") (_launchCWD) (_ckptCWD) (_launchCWD + rpath); } else { JWARNING(chdir(_ckptCWD.c_str()) == 0) (_ckptCWD) (_launchCWD) (JASSERT_ERRNO) .Text("Failed to change directory to _ckptCWD"); } } } }
/**
 * Emit the opening of the iteration loops for a scratch region.
 *
 * A single-call definition delegates to genLoopBegin(). Otherwise one
 * "int <size> = <end> - <begin>;" line is written per dimension, followed
 * by one loop header per variable running from 0 to that size.
 */
void petabricks::IterationDefinition::genScratchRegionLoopBegin(CodeGenerator& o){
  if(isSingleCall()){
    genLoopBegin(o);
    return;
  }

  o.comment("Iterate along all the directions");

  // Compute size
  for(size_t d=0; d<_size.size(); ++d){
    o.write("int " + _size[d]->toString() + " = " + _end[d]->toString() + " - " + _begin[d]->toString() + ";");
  }

  for(size_t i=0; i<_var.size(); ++i){
    FormulaPtr lower= new FormulaLiteral<int>(0);
    FormulaPtr upper=_size[i];
    FormulaPtr stride=_step[i];
    FormulaPtr loopVar=_var[i];

    //TODO: expand to reorder dimensions
    // Reverse iteration is used only when forward is impossible and
    // backward is possible; every other case goes forward.
    if(!_order.canIterateForward(i) && _order.canIterateBackward(i)){
      o.beginReverseFor(loopVar->toString(), lower, upper, stride);
      continue;
    }

    JWARNING(_order.canIterateForward(i))(_order).Text("couldn't find valid iteration order, assuming forward");
    o.beginFor(loopVar->toString(), lower, upper, stride);
  }
}
int sendKeyValPairToCoordinator(const char *id, const void *key, uint32_t key_len, const void *val, uint32_t val_len) { DmtcpMessage msg(DMT_REGISTER_NAME_SERVICE_DATA); JWARNING(strlen(id) < sizeof(msg.nsid)); strncpy(msg.nsid, id, sizeof msg.nsid); msg.keyLen = key_len; msg.valLen = val_len; msg.extraBytes = key_len + val_len; int sock = coordinatorSocket; if (dmtcp_is_running_state()) { if (nsSock == -1) { nsSock = createNewSocketToCoordinator(COORD_ANY); JASSERT(nsSock != -1); nsSock = Util::changeFd(nsSock, PROTECTED_NS_FD); sock = nsSock; DmtcpMessage m(DMT_NAME_SERVICE_WORKER); JASSERT(Util::writeAll(sock, &m, sizeof(m)) == sizeof(m)); } sock = nsSock; } JASSERT(Util::writeAll(sock, &msg, sizeof(msg)) == sizeof(msg)); JASSERT(Util::writeAll(sock, key, key_len) == key_len); JASSERT(Util::writeAll(sock, val, val_len) == val_len); return 1; }
// FIXME: Handle Virtual Pids static void restore_term_settings() { if (saved_termios_exists) { /* First check if we are in foreground. If not, skip this and print * warning. If we try to call tcsetattr in background, we will hang up. */ int foreground = (tcgetpgrp(STDIN_FILENO) == getpgrp()); JTRACE("restore terminal attributes, check foreground status first") (foreground); if (foreground) { if ((!isatty(STDIN_FILENO) || safe_tcsetattr(STDIN_FILENO, TCSANOW, &saved_termios) == -1)) { JWARNING(false).Text("failed to restore terminal"); } else { struct winsize cur_win; JTRACE("restored terminal"); ioctl(STDIN_FILENO, TIOCGWINSZ, (char *)&cur_win); /* ws_row/ws_col was probably not 0/0 prior to checkpoint. We change * it back to last known row/col prior to checkpoint, and then send a * SIGWINCH (see below) to notify process that window might have changed */ if (cur_win.ws_row == 0 && cur_win.ws_col == 0) { ioctl(STDIN_FILENO, TIOCSWINSZ, (char *)&win); } } } else { JWARNING(false) .Text(":skip restore terminal step -- we are in BACKGROUND"); } } /* * NOTE: * Apache, when running in debug mode (-X), uses SIGWINCH * as a signal for stopping gracefully. Please comment out * the next line to prevent DMTCP from sending a SIGWINCH * on restart when testing with Apache. * * TODO: * This should be done automatically by wrapping it in an ifdef * or if condition that disables the SIGWINCH using configure or * a runtime option (--no-sigwinch). */ if (kill(getpid(), SIGWINCH) == -1) {} /* No remedy if error */ }
int sendQueryAllToCoordinator(const char *id, void **buf, int *len) { DmtcpMessage msg(DMT_NAME_SERVICE_QUERY_ALL); JWARNING(strlen(id) < sizeof(msg.nsid)); strncpy(msg.nsid, id, sizeof msg.nsid); int sock = coordinatorSocket; if (dmtcp_is_running_state()) { if (nsSock == -1) { nsSock = createNewSocketToCoordinator(COORD_ANY); JASSERT(nsSock != -1); nsSock = Util::changeFd(nsSock, PROTECTED_NS_FD); JASSERT(nsSock == PROTECTED_NS_FD); DmtcpMessage m(DMT_NAME_SERVICE_WORKER); JASSERT(Util::writeAll(nsSock, &m, sizeof(m)) == sizeof(m)); } sock = nsSock; } JASSERT(Util::writeAll(sock, &msg, sizeof(msg)) == sizeof(msg)); msg.poison(); JASSERT(Util::readAll(sock, &msg, sizeof(msg)) == sizeof(msg)); msg.assertValid(); JASSERT(msg.type == DMT_NAME_SERVICE_QUERY_ALL_RESPONSE && msg.extraBytes == msg.valLen); /* * We can't assume anything about the size of the user-specified buffer, * so we read in in a safe, temporary buffer. This way there's no stale * data on the socket for the next reader. */ void *tmp = JALLOC_HELPER_MALLOC(msg.extraBytes); JASSERT (Util::readAll(sock, tmp, msg.extraBytes) == msg.extraBytes); if (*len > 0) { if ((size_t)*len < msg.extraBytes) { JALLOC_HELPER_FREE(tmp); errno = ERANGE; return -1; } else { memcpy(*buf, tmp, msg.extraBytes); *len = msg.extraBytes; JALLOC_HELPER_FREE(tmp); return 0; } } else if (*len == 0) { // Caller must free this buffer *buf = tmp; *len = msg.extraBytes; return 0; } JALLOC_HELPER_FREE(tmp); errno = EINVAL; return -1; }
/*
 * Signal handler invoked with signal `sig`; `si` and `uc` are unused.
 *
 * Emits a warning that the checkpoint took longer than expected and flushes
 * stdout. If g_action is PRINT_WARNING_AND_EXIT it then raises a failing
 * JASSERT (presumably terminating the process -- depends on the JASSERT
 * implementation). Finally it sets the disposition of `sig` to SIG_IGN.
 *
 * NOTE(review): JWARNING/JASSERT and fflush() are called from signal
 * context; confirm this is acceptable for the way the handler is installed.
 */
static void timeout_handler(int sig, siginfo_t *si, void *uc)
{
  JWARNING(false).Text("Checkpoint took longer than expected.");
  fflush(stdout);
  if (g_action == PRINT_WARNING_AND_EXIT) {
    JASSERT(false)("Killing the application.");
  }
  // Ignore any further delivery of this signal.
  signal(sig, SIG_IGN);
}
// shutdownMtcpEngineOnFork will dlclose the old libmtcp.so and will // dlopen a new libmtcp.so. DmtcpWorker constructor then calls // initializeMtcpEngine, which will then call mtcp_init. We must close // the old SIG_CKPT handler prior to this, so that MTCP and mtcp_init() // don't think someone else is using their SIG_CKPT signal. void dmtcp::shutdownMtcpEngineOnFork() { // Remove our signal handler from our SIG_CKPT errno = 0; JWARNING (SIG_ERR != _real_signal(dmtcp::DmtcpWorker::determineMtcpSignal(), SIG_DFL)) (dmtcp::DmtcpWorker::determineMtcpSignal()) (JASSERT_ERRNO) .Text("failed to reset child's checkpoint signal on fork"); get_mtcp_symbol ( REOPEN_MTCP ); }
/**
 * Shut the GPU manager down: join the manager thread and de-initialize
 * OpenCL. Does nothing when OpenCL is not in use or when shutdown has
 * already been performed.
 */
void GpuManager::shutdown()
{
  // Nothing to do without OpenCL, or if we already shut down.
  if(!_useOpenCL() || _shutdown) {
    return;
  }
  _shutdown = true;

  int rv = pthread_join(_thread, NULL);
  JWARNING(rv==0)(rv).Text("pthread_join failed");

  OpenCLUtil::deinit();

#ifdef GPU_TRACE
  std::cout << "pthead_join~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" << std::endl;
#endif
}
/*
 * Record `uniquePid` as the entry for child `pid` in _childTable.
 *
 * The table is accessed between _do_lock_tbl() and _do_unlock_tbl(). If an
 * entry for `pid` already exists a warning is emitted and the entry is
 * overwritten.
 */
void dmtcp::ProcessInfo::insertChild(pid_t pid, dmtcp::UniquePid uniquePid)
{
  _do_lock_tbl();
  iterator i = _childTable.find(pid);
  // NOTE(review): (i->second) is only valid when i != end(); this relies on
  // JWARNING evaluating its trailing arguments only when the condition
  // fails -- confirm against the macro definition.
  JWARNING(i == _childTable.end()) (pid) (uniquePid) (i->second)
    .Text("child pid already exists!");

  // Insert, or replace the existing entry.
  _childTable[pid] = uniquePid;
  _do_unlock_tbl();

  JTRACE("Creating new virtualPid -> realPid mapping.") (pid) (uniquePid);
}
int getUniqueIdFromCoordinator(const char *id, const void *key, uint32_t key_len, void *val, uint32_t *val_len, uint32_t offset /* = 1 */) { DmtcpMessage msg(DMT_NAME_SERVICE_GET_UNIQUE_ID); JWARNING(strlen(id) < sizeof(msg.nsid)); strncpy(msg.nsid, id, sizeof msg.nsid); msg.keyLen = key_len; msg.valLen = 0; msg.extraBytes = key_len; msg.uniqueIdOffset = offset; msg.valLen = *val_len; int sock = coordinatorSocket; if (key == NULL || key_len == 0 || val == NULL || val_len == 0) { return 0; } if (dmtcp_is_running_state()) { if (nsSock == -1) { nsSock = createNewSocketToCoordinator(COORD_ANY); JASSERT(nsSock != -1); nsSock = Util::changeFd(nsSock, PROTECTED_NS_FD); JASSERT(nsSock == PROTECTED_NS_FD); DmtcpMessage m(DMT_NAME_SERVICE_WORKER); JASSERT(Util::writeAll(nsSock, &m, sizeof(m)) == sizeof(m)); } sock = nsSock; } JASSERT(Util::writeAll(sock, &msg, sizeof(msg)) == sizeof(msg)); JASSERT(Util::writeAll(sock, key, key_len) == key_len); msg.poison(); JASSERT(Util::readAll(sock, &msg, sizeof(msg)) == sizeof(msg)); msg.assertValid(); JASSERT(msg.type == DMT_NAME_SERVICE_GET_UNIQUE_ID_RESPONSE && msg.extraBytes == msg.valLen); JASSERT(*val_len >= msg.valLen); *val_len = msg.valLen; JASSERT(Util::readAll(sock, val, *val_len) == *val_len); return *val_len; }
/*
 * Post-checkpoint processing for all known connections.
 *
 * Refills the drained sockets, then calls postCheckpoint() on every
 * connection in the ConnectionList that has at least one fd recorded in
 * _conToFds. Connections without fds are reported with a warning and
 * skipped.
 */
void dmtcp::ConnectionState::postCheckpoint(bool isRestart)
{
  _drain.refillAllSockets();

  ConnectionList& connections = ConnectionList::instance();
  for (ConnectionList::iterator i = connections.begin();
       i != connections.end();
       ++i) {
    if (_conToFds[i->first].size() == 0) {
      // No fds are associated with this connection: warn and move on.
      JWARNING(false) (i->first.conId())
        .Text("WARNING:: stale connections should be gone by now");
      continue;
    }

    (i->second)->postCheckpoint(_conToFds[i->first], isRestart);
  }
}
void dmtcp::ProcessInfo::restoreProcessGroupInfo() { // Restore group assignment if (dmtcp_virtual_to_real_pid && dmtcp_virtual_to_real_pid(_gid) != _gid) { pid_t cgid = getpgid(0); // Group ID is known inside checkpointed processes if (_gid != cgid) { JTRACE("Restore Group Assignment") (_gid) (_fgid) (cgid) (_pid) (_ppid) (getppid()); JWARNING(setpgid(0, _gid) == 0) (_gid) (JASSERT_ERRNO) .Text("Cannot change group information"); } else { JTRACE("Group is already assigned") (_gid) (cgid); } } else { JTRACE("SKIP Group information, GID unknown"); } }
static bool initialize_and_start_perf_attr(struct perf_event_attr *pes, int i, __u32 type, __u64 config) { JASSERT(pes); pes->type = type; pes->size = sizeof(struct perf_event_attr); pes->config = config; pes->disabled = 1; pes->exclude_kernel = 1; pes->exclude_hv = 1; fd[i] = perf_event_open1(pes, 0, -1, -1, 0); if (fd[i] < 0) { JWARNING(false)("Error opening leader\n")(pes->config); ioctl(fd[i], PERF_EVENT_IOC_DISABLE, 0); return false; } ioctl(fd[i], PERF_EVENT_IOC_RESET, 0); ioctl(fd[i], PERF_EVENT_IOC_ENABLE, 0); return true; }
/*
 * Restore the process-group assignment recorded at checkpoint time.
 *
 * The whole implementation is currently compiled out (#if 0), so this
 * function does nothing. The disabled code is kept for reference.
 */
void dmtcp::ProcessInfo::restoreProcessGroupInfo()
{
  // FIXME: This needs to be fixed
#if 0
#ifdef PID_VIRTUALIZATION
  // Restore group assignment
  if( VirtualPidTable::instance().pidExists(_gid) ){
    pid_t cgid = getpgid(0);
    // Group ID is known inside checkpointed processes
    if( _gid != cgid && _pid != _gid ){
      JTRACE("Restore Group Assignment")
        ( _gid ) ( _fgid ) ( cgid ) ( _pid ) ( _ppid ) ( getppid() );
      JWARNING( setpgid(0,_gid) == 0 ) (_gid) (JASSERT_ERRNO)
        .Text("Cannot change group information");
    }else{
      JTRACE("Group is already assigned")(_gid)(cgid);
    }
  }else{
    JTRACE("SKIP Group information, GID unknown");
  }
#endif
#endif
}
static int ptsname_r_work(int fd, char * buf, size_t buflen) { JTRACE("Calling ptsname_r"); dmtcp::Connection* c = &dmtcp::KernelDeviceToConnection::instance().retrieve(fd); dmtcp::PtyConnection* ptyCon =(dmtcp::PtyConnection*) c; dmtcp::string uniquePtsName = ptyCon->uniquePtsName(); JTRACE("ptsname_r") (uniquePtsName); if (uniquePtsName.length() >= buflen) { JWARNING(false) (uniquePtsName) (uniquePtsName.length()) (buflen) .Text("fake ptsname() too long for user buffer"); errno = ERANGE; return -1; } strcpy(buf, uniquePtsName.c_str()); return 0; }
/**
 * Emit the opening of the iteration loops.
 *
 * For a single call, opens a block and declares every iteration variable as
 * a constant equal to its begin value. Otherwise emits one loop header per
 * variable, from begin to end with the given step, iterating in reverse
 * only when forward iteration is impossible and backward is possible.
 */
void petabricks::IterationDefinition::genLoopBegin(CodeGenerator& o){
  if(isSingleCall()){
    o.write("{");
    for(size_t d=0; d<_var.size(); ++d){
      o.varDecl("const IndexT "+_var[d]->toString()+" = "+_begin[d]->toString());
    }
    return;
  }

  o.comment("Iterate along all the directions");
  for(size_t i=0; i<_var.size(); ++i){
    FormulaPtr lower=_begin[i];
    FormulaPtr upper=_end[i];
    FormulaPtr stride=_step[i];
    FormulaPtr loopVar=_var[i];

    //TODO: expand to reorder dimensions
    if(!_order.canIterateForward(i) && _order.canIterateBackward(i)){
      o.beginReverseFor(loopVar->toString(), lower, upper, stride);
      continue;
    }

    // Forward is either valid, or assumed because no direction is valid.
    JWARNING(_order.canIterateForward(i))(_order).Text("couldn't find valid iteration order, assuming forward");
    o.beginFor(loopVar->toString(), lower, upper, stride);
  }
}
void dmtcp::ConnectionState::postRestart() { ConnectionList& connections = ConnectionList::instance(); // Two part restoreOptions. See the comments in doReconnect() // Part 1: Restore options for all but Pseudo-terminal slaves ConnectionList::iterator i; for (i= connections.begin(); i != connections.end(); ++i) { JWARNING(_conToFds[i->first].size() > 0) .Text("stale connections should be gone by now"); if (_conToFds[i->first].size() == 0) continue; if ((i->second)->conType() == Connection::PTY && (((PtyConnection*) (i->second))->ptyType() == PtyConnection::PTY_SLAVE || ((PtyConnection*) (i->second))->ptyType() == PtyConnection::PTY_BSD_SLAVE)) { } else { (i->second)->restoreOptions(_conToFds[i->first]); } } // Part 2: Restore options for all Pseudo-terminal slaves for (i= connections.begin(); i != connections.end(); ++i) { if (_conToFds[i->first].size() == 0) continue; if ((i->second)->conType() == Connection::PTY && (((PtyConnection*) (i->second))->ptyType() == PtyConnection::PTY_SLAVE || ((PtyConnection*) (i->second))->ptyType() == PtyConnection::PTY_BSD_SLAVE)) { (i->second)->restoreOptions(_conToFds[i->first]); } } KernelDeviceToConnection::instance().dbgSpamFds(); //fix our device table to match the new world order KernelDeviceToConnection::instance() = KernelDeviceToConnection(_conToFds); }
/*
 * Signal handler that sets up the perf counters via setup_perf_ctr() and
 * emits a warning if that call reports failure. None of the handler
 * arguments (sig, si, uc) are used.
 */
void startCtrsSignalHandler(int sig, siginfo_t *si, void *uc)
{
  JWARNING(setup_perf_ctr()).Text("Error setting up perf ctrs.");
}
/*
 * DMTCP plugin event hook for the perf-counter statistics plugin.
 *
 * Behaviour depends on two environment variables:
 *   DMTCP_START_CTRS_ON_RESTART_STRATEGY  when set, counters are set up on
 *       restart and written to the STATFILE at the next checkpoint after a
 *       restart; when unset, the handlers are installed at init time.
 *   DMTCP_KILL_ON_RESUME_STRATEGY         when set, the process exits on
 *       the resume event.
 *
 * The event is always forwarded with DMTCP_NEXT_EVENT_HOOK (except when the
 * process exits on resume).
 */
void dmtcp_event_hook(DmtcpEvent_t event, DmtcpEventData_t *data)
{
  static char *filename = NULL;
  static bool restartingFromCkpt = false;
  static FILE *outfp = NULL;

  switch (event) {
  case DMTCP_EVENT_INIT:
    {
      if (getenv("DMTCP_START_CTRS_ON_RESTART_STRATEGY")) {
        break;
      }
      setup_handlers();
      filename = getStatsFilename(getenv("STATFILE"));
      JWARNING(filename != NULL).Text("Could not get the stats filename in the init event.");
      JTRACE("Filename: ")(filename);
    }
    break;

  case DMTCP_EVENT_WRITE_CKPT:
    {
      JTRACE("CHKP");
      if (!getenv("DMTCP_START_CTRS_ON_RESTART_STRATEGY")) {
        break;
      }
      filename = getenv("STATFILE");
      // Counters are only dumped at the first checkpoint after a restart.
      if (!restartingFromCkpt) {
        break;
      }
      JTRACE("WRITE CHKP");
      JASSERT(filename);
      outfp = fopen(filename, "w+");
      if (!outfp) {
        perror("Error opening stats file in w+ mode");
        JASSERT(false);
      }
      read_ctrs(outfp);
      fclose(outfp);
      restartingFromCkpt = false;
    }
    break;

  case DMTCP_EVENT_RESUME:
    {
      if (getenv("DMTCP_KILL_ON_RESUME_STRATEGY")) {
        exit(0);
      }
    }
    break;

  case DMTCP_EVENT_RESTART:
    {
      if (!getenv("DMTCP_START_CTRS_ON_RESTART_STRATEGY")) {
        break;
      }
      restartingFromCkpt = true;
      filename = getStatsFilename(getenv("STATFILE"));
      JWARNING(filename != NULL).Text("Could not get the stats filename in the restart event.");
      JTRACE("Filename: ")(filename);
      JWARNING(setup_perf_ctr()).Text("Error setting up perf ctrs.");
    }
    break;

  case DMTCP_EVENT_RESUME_USER_THREAD:
    {
      if (!getenv("DMTCP_START_CTRS_ON_RESTART_STRATEGY")) {
        break;
      }
      filename = getStatsFilename(getenv("STATFILE"));
      JWARNING(filename != NULL).Text("Could not get the stats filename in the resume_user_thread event.");
      JTRACE("Filename: ")(filename);
    }
    break;

  default:
    break;
  }

  DMTCP_NEXT_EVENT_HOOK(event, data);
}
string writeScript(const string& ckptDir, bool uniqueCkptFilenames, const time_t& ckptTimeStamp, const uint32_t theCheckpointInterval, const int thePort, const UniquePid& compId, const map<string, vector<string> >& restartFilenames) { ostringstream o; string uniqueFilename; o << string(ckptDir) << "/" << RESTART_SCRIPT_BASENAME << "_" << compId; if (uniqueCkptFilenames) { o << "_" << std::setw(5) << std::setfill('0') << compId.computationGeneration(); } o << "." << RESTART_SCRIPT_EXT; uniqueFilename = o.str(); const bool isSingleHost = (restartFilenames.size() == 1); map< string, vector<string> >::const_iterator host; size_t numPeers; for (host = restartFilenames.begin(); host != restartFilenames.end(); host++) { numPeers += host->second.size(); } vector<string>::const_iterator file; char hostname[80]; char timestamp[80]; gethostname ( hostname, 80 ); JTRACE ( "writing restart script" ) ( uniqueFilename ); FILE* fp = fopen ( uniqueFilename.c_str(),"w" ); JASSERT ( fp!=0 )(JASSERT_ERRNO)( uniqueFilename ) .Text ( "failed to open file" ); fprintf ( fp, "%s", header ); fprintf ( fp, "%s", checkLocal ); fprintf ( fp, "%s", slurmHelperContactFunction ); fprintf ( fp, "%s", usage ); ctime_r(&ckptTimeStamp, timestamp); // Remove the trailing '\n' timestamp[strlen(timestamp) - 1] = '\0'; fprintf ( fp, "ckpt_timestamp=\"%s\"\n\n", timestamp ); fprintf ( fp, "coord_host=$" ENV_VAR_NAME_HOST "\n" "if test -z \"$" ENV_VAR_NAME_HOST "\"; then\n" " coord_host=%s\nfi\n\n" "coord_port=$" ENV_VAR_NAME_PORT "\n" "if test -z \"$" ENV_VAR_NAME_PORT "\"; then\n" " coord_port=%d\nfi\n\n" "checkpoint_interval=$" ENV_VAR_CKPT_INTR "\n" "if test -z \"$" ENV_VAR_CKPT_INTR "\"; then\n" " checkpoint_interval=%d\nfi\n" "export DMTCP_CHECKPOINT_INTERVAL=${checkpoint_interval}\n\n", hostname, thePort, theCheckpointInterval ); fprintf ( fp, "%s", cmdlineArgHandler ); fprintf ( fp, "dmt_rstr_cmd=%s/" DMTCP_RESTART_CMD "\n" "which $dmt_rstr_cmd > /dev/null 2>&1" " || dmt_rstr_cmd=" 
DMTCP_RESTART_CMD "\n" "which $dmt_rstr_cmd > /dev/null 2>&1" " || echo \"$0: $dmt_rstr_cmd not found\"\n" "which $dmt_rstr_cmd > /dev/null 2>&1 || exit 1\n\n", jalib::Filesystem::GetProgramDir().c_str()); fprintf ( fp, "# Number of hosts in the computation = %zu\n" "# Number of processes in the computation = %zu\n\n", restartFilenames.size(), numPeers ); if ( isSingleHost ) { JTRACE ( "Single HOST" ); host=restartFilenames.begin(); ostringstream o; for ( file=host->second.begin(); file!=host->second.end(); ++file ) { o << " " << *file; } fprintf ( fp, "given_ckpt_files=\"%s\"\n\n", o.str().c_str()); fprintf ( fp, "%s", singleHostProcessing ); } else { fprintf ( fp, "%s", "# SYNTAX:\n" "# :: <HOST> :<MODE>: <CHECKPOINT_IMAGE> ...\n" "# Host names and filenames must not include \':\'\n" "# At most one fg (foreground) mode allowed; it must be last.\n" "# \'maybexterm\' and \'maybebg\' are set from <MODE>.\n"); fprintf ( fp, "%s", "worker_ckpts=\'" ); for ( host=restartFilenames.begin(); host!=restartFilenames.end(); ++host ) { fprintf ( fp, "\n :: %s :bg:", host->first.c_str() ); for ( file=host->second.begin(); file!=host->second.end(); ++file ) { fprintf ( fp," %s", file->c_str() ); } } fprintf ( fp, "%s", "\n\'\n\n" ); fprintf( fp, "# Check for resource manager\n" "ibrun_path=$(which ibrun 2> /dev/null)\n" "if [ ! 
-n \"$ibrun_path\" ]; then\n" " discover_rm_path=$(which dmtcp_discover_rm)\n" " if [ -n \"$discover_rm_path\" ]; then\n" " eval $(dmtcp_discover_rm -t)\n" " srun_path=$(which srun 2> /dev/null)\n" " llaunch=`which dmtcp_rm_loclaunch`\n" " if [ $RES_MANAGER = \"SLURM\" ] && [ -n \"$srun_path\" ]; then\n" " eval $(dmtcp_discover_rm -n \"$worker_ckpts\")\n" " if [ -n \"$DMTCP_DISCOVER_RM_ERROR\" ]; then\n" " echo \"Restart error: $DMTCP_DISCOVER_RM_ERROR\"\n" " echo \"Allocated resources: $manager_resources\"\n" " exit 0\n" " fi\n" " export DMTCP_REMLAUNCH_NODES=$DMTCP_REMLAUNCH_NODES\n" " bound=$(($DMTCP_REMLAUNCH_NODES - 1))\n" " for i in $(seq 0 $bound); do\n" " eval \"val=\\${DMTCP_REMLAUNCH_${i}_SLOTS}\"\n" " export DMTCP_REMLAUNCH_${i}_SLOTS=\"$val\"\n" " bound2=$(($val - 1))\n" " for j in $(seq 0 $bound2); do\n" " eval \"ckpts=\\${DMTCP_REMLAUNCH_${i}_${j}}\"\n" " export DMTCP_REMLAUNCH_${i}_${j}=\"$ckpts\"\n" " done\n" " done\n" " if [ \"$DMTCP_DISCOVER_PM_TYPE\" = \"HYDRA\" ]; then\n" " export DMTCP_SRUN_HELPER_SYNCFILE=`mktemp ./tmp.XXXXXXXXXX`\n" " rm $DMTCP_SRUN_HELPER_SYNCFILE\n" " dmtcp_srun_helper -r $srun_path \"$llaunch\"\n" " if [ ! -f $DMTCP_SRUN_HELPER_SYNCFILE ]; then\n" " echo \"Error launching application\"\n" " exit 1\n" " fi\n" " # export helper contact info\n" " . 
$DMTCP_SRUN_HELPER_SYNCFILE\n" " pass_slurm_helper_contact \"$DMTCP_LAUNCH_CKPTS\"\n" " rm $DMTCP_SRUN_HELPER_SYNCFILE\n" " dmtcp_restart --join --coord-host $DMTCP_COORD_HOST" " --coord-port $DMTCP_COORD_PORT" " $DMTCP_LAUNCH_CKPTS\n" " else\n" " DMTCP_REMLAUNCH_0_0=\"$DMTCP_REMLAUNCH_0_0" " $DMTCP_LAUNCH_CKPTS\"\n" " $srun_path \"$llaunch\"\n" " fi\n" " exit 0\n" " elif [ $RES_MANAGER = \"TORQUE\" ]; then\n" " #eval $(dmtcp_discover_rm \"$worker_ckpts\")\n" " #if [ -n \"$new_worker_ckpts\" ]; then\n" " # worker_ckpts=\"$new_worker_ckpts\"\n" " #fi\n" " eval $(dmtcp_discover_rm -n \"$worker_ckpts\")\n" " if [ -n \"$DMTCP_DISCOVER_RM_ERROR\" ]; then\n" " echo \"Restart error: $DMTCP_DISCOVER_RM_ERROR\"\n" " echo \"Allocated resources: $manager_resources\"\n" " exit 0\n" " fi\n" " arguments=\"PATH=$PATH DMTCP_COORD_HOST=$DMTCP_COORD_HOST" " DMTCP_COORD_PORT=$DMTCP_COORD_PORT\"\n" " arguments=$arguments\" DMTCP_CHECKPOINT_INTERVAL=$DMTCP_CHECKPOINT_INTERVAL\"\n" " arguments=$arguments\" DMTCP_TMPDIR=$DMTCP_TMPDIR\"\n" " arguments=$arguments\" DMTCP_REMLAUNCH_NODES=$DMTCP_REMLAUNCH_NODES\"\n" " bound=$(($DMTCP_REMLAUNCH_NODES - 1))\n" " for i in $(seq 0 $bound); do\n" " eval \"val=\\${DMTCP_REMLAUNCH_${i}_SLOTS}\"\n" " arguments=$arguments\" DMTCP_REMLAUNCH_${i}_SLOTS=\\\"$val\\\"\"\n" " bound2=$(($val - 1))\n" " for j in $(seq 0 $bound2); do\n" " eval \"ckpts=\\${DMTCP_REMLAUNCH_${i}_${j}}\"\n" " arguments=$arguments\" DMTCP_REMLAUNCH_${i}_${j}=\\\"$ckpts\\\"\"\n" " done\n" " done\n" " pbsdsh -u \"$llaunch\" \"$arguments\"\n" " exit 0\n" " fi\n" " fi\n" "fi\n" "\n\n" ); fprintf ( fp, "%s", multiHostProcessing ); } fclose ( fp ); { string filename = RESTART_SCRIPT_BASENAME "." RESTART_SCRIPT_EXT; string dirname = jalib::Filesystem::DirName(uniqueFilename); int dirfd = open(dirname.c_str(), O_DIRECTORY | O_RDONLY); JASSERT(dirfd != -1) (dirname) (JASSERT_ERRNO); /* Set execute permission for user. 
*/ struct stat buf; JASSERT(::stat(uniqueFilename.c_str(), &buf) == 0); JASSERT(chmod(uniqueFilename.c_str(), buf.st_mode | S_IXUSR) == 0); // Create a symlink from // dmtcp_restart_script.sh -> dmtcp_restart_script_<curCompId>.sh unlink(filename.c_str()); JTRACE("linking \"dmtcp_restart_script.sh\" filename to uniqueFilename") (filename) (dirname) (uniqueFilename); // FIXME: Handle error case of symlink() JWARNING(symlinkat(basename(uniqueFilename.c_str()), dirfd, filename.c_str()) == 0) (JASSERT_ERRNO); JASSERT(close(dirfd) == 0); } return uniqueFilename; }