/*
 * Set up a virtual coordinator. It is part of the running process (i.e., no
 * separate process is created).
 *
 * Creates a listening socket on the port obtained from
 * getCoordHostAndPort(COORD_NONE, ...), moves its descriptor to
 * PROTECTED_COORD_FD, records the port via setCoordPort(), and fills in
 * *coordInfo (id, timeStamp, interval; addr/addrLen are zeroed). *localIP is
 * zeroed as well.
 *
 * FIXME: This is the only place in this file where we use JSocket. Maybe get
 *        rid of it here too?
 */
void
setupVirtualCoordinator(CoordinatorInfo *coordInfo, struct in_addr *localIP)
{
  string host = "";
  int port;
  getCoordHostAndPort(COORD_NONE, host, &port);

  // Create the listener and wrap its descriptor in a JSocket.
  jalib::JSocket sock =
    jalib::JServerSocket(jalib::JSockAddr::ANY, port).sockfd();
  JASSERT(sock.isValid()) (port) (JASSERT_ERRNO)
    .Text("Failed to create listen socket.");

  // Relocate the listener to the protected coordinator descriptor.
  Util::changeFd(sock.sockfd(), PROTECTED_COORD_FD);
  JASSERT(Util::isValidFd(coordinatorSocket));

  setCoordPort(sock.port());

  // The parent pid is used both as the virtual and the real pid argument.
  const pid_t parentPid = getppid();
  Util::setVirtualPidEnvVar(INITIAL_VIRTUAL_PID, parentPid, parentPid);

  UniquePid coordId(INITIAL_VIRTUAL_PID,
                    UniquePid::ThisProcess().hostid(),
                    UniquePid::ThisProcess().time());

  coordInfo->id = coordId.upid();
  coordInfo->timeStamp = coordId.time();
  coordInfo->addrLen = 0;

  // Checkpoint interval comes from the environment; 0 when it is not set.
  const char *intervalStr = getenv(ENV_VAR_CKPT_INTR);
  coordInfo->interval =
    (intervalStr != NULL)
      ? static_cast<uint32_t>(strtol(intervalStr, NULL, 0))
      : 0;

  memset(&coordInfo->addr, 0, sizeof(coordInfo->addr));
  memset(localIP, 0, sizeof(*localIP));
}
/*
 * Return the computation id as a C string.
 *
 * The string is built from SharedData::getCompId() on the first call and
 * cached in a heap-allocated string that this function never deletes; later
 * calls return the same cached buffer.
 */
EXTERNC const char*
dmtcp_get_computation_id_str(void)
{
  static string *cachedCompId = NULL;

  // Fast path: already computed on an earlier call.
  if (cachedCompId != NULL) {
    return cachedCompId->c_str();
  }

  UniquePid id = SharedData::getCompId();
  cachedCompId = new string(id.toString());
  return cachedCompId->c_str();
}
string writeScript(const string& ckptDir, bool uniqueCkptFilenames, const time_t& ckptTimeStamp, const uint32_t theCheckpointInterval, const int thePort, const UniquePid& compId, const map<string, vector<string> >& restartFilenames) { ostringstream o; string uniqueFilename; o << string(ckptDir) << "/" << RESTART_SCRIPT_BASENAME << "_" << compId; if (uniqueCkptFilenames) { o << "_" << std::setw(5) << std::setfill('0') << compId.computationGeneration(); } o << "." << RESTART_SCRIPT_EXT; uniqueFilename = o.str(); const bool isSingleHost = (restartFilenames.size() == 1); map< string, vector<string> >::const_iterator host; size_t numPeers; for (host = restartFilenames.begin(); host != restartFilenames.end(); host++) { numPeers += host->second.size(); } vector<string>::const_iterator file; char hostname[80]; char timestamp[80]; gethostname ( hostname, 80 ); JTRACE ( "writing restart script" ) ( uniqueFilename ); FILE* fp = fopen ( uniqueFilename.c_str(),"w" ); JASSERT ( fp!=0 )(JASSERT_ERRNO)( uniqueFilename ) .Text ( "failed to open file" ); fprintf ( fp, "%s", header ); fprintf ( fp, "%s", checkLocal ); fprintf ( fp, "%s", slurmHelperContactFunction ); fprintf ( fp, "%s", usage ); ctime_r(&ckptTimeStamp, timestamp); // Remove the trailing '\n' timestamp[strlen(timestamp) - 1] = '\0'; fprintf ( fp, "ckpt_timestamp=\"%s\"\n\n", timestamp ); fprintf ( fp, "coord_host=$" ENV_VAR_NAME_HOST "\n" "if test -z \"$" ENV_VAR_NAME_HOST "\"; then\n" " coord_host=%s\nfi\n\n" "coord_port=$" ENV_VAR_NAME_PORT "\n" "if test -z \"$" ENV_VAR_NAME_PORT "\"; then\n" " coord_port=%d\nfi\n\n" "checkpoint_interval=$" ENV_VAR_CKPT_INTR "\n" "if test -z \"$" ENV_VAR_CKPT_INTR "\"; then\n" " checkpoint_interval=%d\nfi\n" "export DMTCP_CHECKPOINT_INTERVAL=${checkpoint_interval}\n\n", hostname, thePort, theCheckpointInterval ); fprintf ( fp, "%s", cmdlineArgHandler ); fprintf ( fp, "dmt_rstr_cmd=%s/" DMTCP_RESTART_CMD "\n" "which $dmt_rstr_cmd > /dev/null 2>&1" " || dmt_rstr_cmd=" 
DMTCP_RESTART_CMD "\n" "which $dmt_rstr_cmd > /dev/null 2>&1" " || echo \"$0: $dmt_rstr_cmd not found\"\n" "which $dmt_rstr_cmd > /dev/null 2>&1 || exit 1\n\n", jalib::Filesystem::GetProgramDir().c_str()); fprintf ( fp, "# Number of hosts in the computation = %zu\n" "# Number of processes in the computation = %zu\n\n", restartFilenames.size(), numPeers ); if ( isSingleHost ) { JTRACE ( "Single HOST" ); host=restartFilenames.begin(); ostringstream o; for ( file=host->second.begin(); file!=host->second.end(); ++file ) { o << " " << *file; } fprintf ( fp, "given_ckpt_files=\"%s\"\n\n", o.str().c_str()); fprintf ( fp, "%s", singleHostProcessing ); } else { fprintf ( fp, "%s", "# SYNTAX:\n" "# :: <HOST> :<MODE>: <CHECKPOINT_IMAGE> ...\n" "# Host names and filenames must not include \':\'\n" "# At most one fg (foreground) mode allowed; it must be last.\n" "# \'maybexterm\' and \'maybebg\' are set from <MODE>.\n"); fprintf ( fp, "%s", "worker_ckpts=\'" ); for ( host=restartFilenames.begin(); host!=restartFilenames.end(); ++host ) { fprintf ( fp, "\n :: %s :bg:", host->first.c_str() ); for ( file=host->second.begin(); file!=host->second.end(); ++file ) { fprintf ( fp," %s", file->c_str() ); } } fprintf ( fp, "%s", "\n\'\n\n" ); fprintf( fp, "# Check for resource manager\n" "ibrun_path=$(which ibrun 2> /dev/null)\n" "if [ ! 
-n \"$ibrun_path\" ]; then\n" " discover_rm_path=$(which dmtcp_discover_rm)\n" " if [ -n \"$discover_rm_path\" ]; then\n" " eval $(dmtcp_discover_rm -t)\n" " srun_path=$(which srun 2> /dev/null)\n" " llaunch=`which dmtcp_rm_loclaunch`\n" " if [ $RES_MANAGER = \"SLURM\" ] && [ -n \"$srun_path\" ]; then\n" " eval $(dmtcp_discover_rm -n \"$worker_ckpts\")\n" " if [ -n \"$DMTCP_DISCOVER_RM_ERROR\" ]; then\n" " echo \"Restart error: $DMTCP_DISCOVER_RM_ERROR\"\n" " echo \"Allocated resources: $manager_resources\"\n" " exit 0\n" " fi\n" " export DMTCP_REMLAUNCH_NODES=$DMTCP_REMLAUNCH_NODES\n" " bound=$(($DMTCP_REMLAUNCH_NODES - 1))\n" " for i in $(seq 0 $bound); do\n" " eval \"val=\\${DMTCP_REMLAUNCH_${i}_SLOTS}\"\n" " export DMTCP_REMLAUNCH_${i}_SLOTS=\"$val\"\n" " bound2=$(($val - 1))\n" " for j in $(seq 0 $bound2); do\n" " eval \"ckpts=\\${DMTCP_REMLAUNCH_${i}_${j}}\"\n" " export DMTCP_REMLAUNCH_${i}_${j}=\"$ckpts\"\n" " done\n" " done\n" " if [ \"$DMTCP_DISCOVER_PM_TYPE\" = \"HYDRA\" ]; then\n" " export DMTCP_SRUN_HELPER_SYNCFILE=`mktemp ./tmp.XXXXXXXXXX`\n" " rm $DMTCP_SRUN_HELPER_SYNCFILE\n" " dmtcp_srun_helper -r $srun_path \"$llaunch\"\n" " if [ ! -f $DMTCP_SRUN_HELPER_SYNCFILE ]; then\n" " echo \"Error launching application\"\n" " exit 1\n" " fi\n" " # export helper contact info\n" " . 
$DMTCP_SRUN_HELPER_SYNCFILE\n" " pass_slurm_helper_contact \"$DMTCP_LAUNCH_CKPTS\"\n" " rm $DMTCP_SRUN_HELPER_SYNCFILE\n" " dmtcp_restart --join --coord-host $DMTCP_COORD_HOST" " --coord-port $DMTCP_COORD_PORT" " $DMTCP_LAUNCH_CKPTS\n" " else\n" " DMTCP_REMLAUNCH_0_0=\"$DMTCP_REMLAUNCH_0_0" " $DMTCP_LAUNCH_CKPTS\"\n" " $srun_path \"$llaunch\"\n" " fi\n" " exit 0\n" " elif [ $RES_MANAGER = \"TORQUE\" ]; then\n" " #eval $(dmtcp_discover_rm \"$worker_ckpts\")\n" " #if [ -n \"$new_worker_ckpts\" ]; then\n" " # worker_ckpts=\"$new_worker_ckpts\"\n" " #fi\n" " eval $(dmtcp_discover_rm -n \"$worker_ckpts\")\n" " if [ -n \"$DMTCP_DISCOVER_RM_ERROR\" ]; then\n" " echo \"Restart error: $DMTCP_DISCOVER_RM_ERROR\"\n" " echo \"Allocated resources: $manager_resources\"\n" " exit 0\n" " fi\n" " arguments=\"PATH=$PATH DMTCP_COORD_HOST=$DMTCP_COORD_HOST" " DMTCP_COORD_PORT=$DMTCP_COORD_PORT\"\n" " arguments=$arguments\" DMTCP_CHECKPOINT_INTERVAL=$DMTCP_CHECKPOINT_INTERVAL\"\n" " arguments=$arguments\" DMTCP_TMPDIR=$DMTCP_TMPDIR\"\n" " arguments=$arguments\" DMTCP_REMLAUNCH_NODES=$DMTCP_REMLAUNCH_NODES\"\n" " bound=$(($DMTCP_REMLAUNCH_NODES - 1))\n" " for i in $(seq 0 $bound); do\n" " eval \"val=\\${DMTCP_REMLAUNCH_${i}_SLOTS}\"\n" " arguments=$arguments\" DMTCP_REMLAUNCH_${i}_SLOTS=\\\"$val\\\"\"\n" " bound2=$(($val - 1))\n" " for j in $(seq 0 $bound2); do\n" " eval \"ckpts=\\${DMTCP_REMLAUNCH_${i}_${j}}\"\n" " arguments=$arguments\" DMTCP_REMLAUNCH_${i}_${j}=\\\"$ckpts\\\"\"\n" " done\n" " done\n" " pbsdsh -u \"$llaunch\" \"$arguments\"\n" " exit 0\n" " fi\n" " fi\n" "fi\n" "\n\n" ); fprintf ( fp, "%s", multiHostProcessing ); } fclose ( fp ); { string filename = RESTART_SCRIPT_BASENAME "." RESTART_SCRIPT_EXT; string dirname = jalib::Filesystem::DirName(uniqueFilename); int dirfd = open(dirname.c_str(), O_DIRECTORY | O_RDONLY); JASSERT(dirfd != -1) (dirname) (JASSERT_ERRNO); /* Set execute permission for user. 
*/ struct stat buf; JASSERT(::stat(uniqueFilename.c_str(), &buf) == 0); JASSERT(chmod(uniqueFilename.c_str(), buf.st_mode | S_IXUSR) == 0); // Create a symlink from // dmtcp_restart_script.sh -> dmtcp_restart_script_<curCompId>.sh unlink(filename.c_str()); JTRACE("linking \"dmtcp_restart_script.sh\" filename to uniqueFilename") (filename) (dirname) (uniqueFilename); // FIXME: Handle error case of symlink() JWARNING(symlinkat(basename(uniqueFilename.c_str()), dirfd, filename.c_str()) == 0) (JASSERT_ERRNO); JASSERT(close(dirfd) == 0); } return uniqueFilename; }
// Equality: two UniquePids match when host id, pid and time are all equal.
// Fields are compared in that order and the first mismatch ends the check.
bool
UniquePid::operator==(const UniquePid &that) const
{
  if (!(_hostid == that.hostid())) {
    return false;
  }
  if (!(_pid == that.pid())) {
    return false;
  }
  return _time == that.time();
}