Esempio n. 1
0
/*
 * Post-restart fix-ups for this process: revoke access to the restore
 * buffer, restore the heap, refresh the checkpoint directory from the
 * protected directory fd, and try to move back into the working directory
 * recorded in _ckptCWD when it lies below _launchCWD.
 */
void dmtcp::ProcessInfo::restart()
{
  // PROT_NONE: no access of any kind is allowed to the restore buffer region.
  JASSERT(mprotect((void*)_restoreBufAddr, _restoreBufLen, PROT_NONE) == 0)
    ((void*)_restoreBufAddr) (_restoreBufLen) (JASSERT_ERRNO);

  restoreHeap();

  // Update the ckptDir, then release the protected descriptor it came from.
  string ckptDir = jalib::Filesystem::GetDeviceName(PROTECTED_CKPT_DIR_FD);
  JASSERT(ckptDir.length() > 0);
  _real_close(PROTECTED_CKPT_DIR_FD);
  updateCkptDirFileSubdir(ckptDir);

  // Nothing to adjust when the two working directories already agree.
  if (_launchCWD == _ckptCWD) {
    return;
  }

  // Only handle the case where _ckptCWD is a proper sub-directory of
  // _launchCWD (prefix match followed by a path separator).
  size_t llen = _launchCWD.length();
  const bool ckptCwdBelowLaunchCwd =
    Util::strStartsWith(_ckptCWD.c_str(), _launchCWD.c_str()) &&
    _ckptCWD[llen] == '/';
  if (!ckptCwdBelowLaunchCwd) {
    return;
  }

  // _launchCWD = "/A/B"; _ckptCWD = "/A/B/C" -> rpath = "./C"
  dmtcp::string rpath = "./" + _ckptCWD.substr(llen + 1);
  if (chdir(rpath.c_str()) != 0) {
    // The relative path did not work; fall back to the absolute one.
    JWARNING(chdir(_ckptCWD.c_str()) == 0) (_ckptCWD) (_launchCWD)
      (JASSERT_ERRNO) .Text("Failed to change directory to _ckptCWD");
    return;
  }

  JTRACE("Changed cwd") (_launchCWD) (_ckptCWD) (_launchCWD + rpath);
}
Esempio n. 2
0
/**
 * Emit the opening of the iteration loops for a scratch region.
 *
 * For a single call this simply delegates to genLoopBegin(). Otherwise it
 * first emits one "int <size> = <end> - <begin>;" declaration per entry of
 * _size, and then opens one loop per iteration variable that runs from 0 to
 * that size with the matching step, forward or reverse depending on _order.
 */
void petabricks::IterationDefinition::genScratchRegionLoopBegin(CodeGenerator& o){
  if(isSingleCall()){
    genLoopBegin(o);
    return;
  }

  o.comment("Iterate along all the directions");

  // Compute size
  for(size_t dim=0; dim<_size.size(); ++dim){
    o.write("int " + _size[dim]->toString() + " = " + _end[dim]->toString() +
            " - " + _begin[dim]->toString() + ";");
  }

  // NOTE: the index must stay named 'i' because JWARNING prints its
  // condition text verbatim.
  for(size_t i=0; i<_var.size(); ++i){
    FormulaPtr lower  = new FormulaLiteral<int>(0);
    FormulaPtr upper  = _size[i];
    FormulaPtr stride = _step[i];
    FormulaPtr loopVar = _var[i];
    //TODO: expand to reorder dimensions
    const bool reverseOnly =
      !_order.canIterateForward(i) && _order.canIterateBackward(i);
    if(reverseOnly){
      o.beginReverseFor(loopVar->toString(), lower, upper, stride);
    }else{
      // Either forward is legal, or neither direction is; warn in the latter case.
      JWARNING(_order.canIterateForward(i))(_order).Text("couldn't find valid iteration order, assuming forward");
      o.beginFor(loopVar->toString(), lower, upper, stride);
    }
  }
}
Esempio n. 3
0
int
/*
 * Send a (key, value) pair for the name-service namespace 'id' to the
 * coordinator.
 *
 * id      - NUL-terminated namespace id; a warning is logged when it does
 *           not fit in msg.nsid, and it is then truncated.
 * key     - pointer to key_len bytes of key data.
 * val     - pointer to val_len bytes of value data.
 *
 * While dmtcp_is_running_state() is true the message goes over the dedicated
 * name-service socket (created on first use and moved to PROTECTED_NS_FD);
 * otherwise it goes over coordinatorSocket.
 *
 * Returns 1. Any socket failure trips a JASSERT.
 */
int
sendKeyValPairToCoordinator(const char *id,
                            const void *key,
                            uint32_t key_len,
                            const void *val,
                            uint32_t val_len)
{
  DmtcpMessage msg(DMT_REGISTER_NAME_SERVICE_DATA);

  JWARNING(strlen(id) < sizeof(msg.nsid));
  strncpy(msg.nsid, id, sizeof msg.nsid);
  // strncpy() leaves the destination unterminated when the source is at
  // least as long as the buffer; always terminate so readers of nsid cannot
  // run past the end of the field.
  msg.nsid[sizeof(msg.nsid) - 1] = '\0';
  msg.keyLen = key_len;
  msg.valLen = val_len;
  msg.extraBytes = key_len + val_len;
  int sock = coordinatorSocket;
  if (dmtcp_is_running_state()) {
    if (nsSock == -1) {
      nsSock = createNewSocketToCoordinator(COORD_ANY);
      JASSERT(nsSock != -1);
      nsSock = Util::changeFd(nsSock, PROTECTED_NS_FD);
      // Same sanity check as the other name-service senders in this file.
      JASSERT(nsSock == PROTECTED_NS_FD);
      DmtcpMessage m(DMT_NAME_SERVICE_WORKER);
      JASSERT(Util::writeAll(nsSock, &m, sizeof(m)) == sizeof(m));
    }
    sock = nsSock;
  }

  // Header first, then the key bytes, then the value bytes.
  JASSERT(Util::writeAll(sock, &msg, sizeof(msg)) == sizeof(msg));
  JASSERT(Util::writeAll(sock, key, key_len) == key_len);
  JASSERT(Util::writeAll(sock, val, val_len) == val_len);

  return 1;
}
Esempio n. 4
0
// FIXME: Handle Virtual Pids
/*
 * Re-apply the terminal attributes held in saved_termios to stdin (only when
 * saved_termios_exists is set and this process group is in the foreground),
 * restore the saved window size if the current one reads as 0x0, and finally
 * send SIGWINCH to this process.
 */
static void
restore_term_settings()
{
  if (saved_termios_exists) {
    /* First check if we are in foreground. If not, skip this and print
     *   warning.  If we try to call tcsetattr in background, we will hang up.
     */
    int foreground = (tcgetpgrp(STDIN_FILENO) == getpgrp());
    JTRACE("restore terminal attributes, check foreground status first")
      (foreground);
    if (foreground) {
      if ((!isatty(STDIN_FILENO)
           || safe_tcsetattr(STDIN_FILENO, TCSANOW, &saved_termios) == -1)) {
        JWARNING(false).Text("failed to restore terminal");
      } else {
        struct winsize cur_win;
        JTRACE("restored terminal");

        /* Only trust cur_win when the query succeeded; on failure its
         * contents are indeterminate and must not be inspected.
         */
        const bool haveCurWin =
          (ioctl(STDIN_FILENO, TIOCGWINSZ, (char *)&cur_win) == 0);

        /* ws_row/ws_col was probably not 0/0 prior to checkpoint.  We change
         * it back to last known row/col prior to checkpoint, and then send a
         * SIGWINCH (see below) to notify process that window might have changed
         */
        if (haveCurWin && cur_win.ws_row == 0 && cur_win.ws_col == 0) {
          ioctl(STDIN_FILENO, TIOCSWINSZ, (char *)&win);
        }
      }
    } else {
      JWARNING(false)
      .Text(":skip restore terminal step -- we are in BACKGROUND");
    }
  }

  /*
   * NOTE:
   * Apache, when running in debug mode (-X), uses SIGWINCH
   * as a signal for stopping gracefully. Please comment out
   * the next line to prevent DMTCP from sending a SIGWINCH
   * on restart when testing with Apache.
   *
   * TODO:
   * This should be done automatically by wrapping it in an ifdef
   * or if condition that disables the SIGWINCH using configure or
   * a runtime option (--no-sigwinch).
   */
  if (kill(getpid(), SIGWINCH) == -1) {}  /* No remedy if error */
}
Esempio n. 5
0
int
/*
 * Ask the coordinator for all name-service data stored under 'id'.
 *
 * id   - NUL-terminated namespace id; a warning is logged when it does not
 *        fit in msg.nsid, and it is then truncated.
 * buf  - in/out buffer pointer (see 'len').
 * len  - in/out length:
 *          *len  > 0 : *buf is a caller buffer of *len bytes; the reply is
 *                      copied into it and *len is set to the reply size.
 *          *len == 0 : *buf is set to a newly allocated buffer holding the
 *                      reply (the caller must free it) and *len to its size.
 *          *len  < 0 : invalid.
 *
 * Returns 0 on success. Returns -1 with errno = ERANGE when the caller
 * buffer is too small, or errno = EINVAL for invalid arguments. Socket or
 * protocol failures trip a JASSERT.
 */
int
sendQueryAllToCoordinator(const char *id, void **buf, int *len)
{
  // Reject missing out-parameters before any traffic is put on the socket.
  if (buf == NULL || len == NULL) {
    errno = EINVAL;
    return -1;
  }

  DmtcpMessage msg(DMT_NAME_SERVICE_QUERY_ALL);

  JWARNING(strlen(id) < sizeof(msg.nsid));
  strncpy(msg.nsid, id, sizeof msg.nsid);
  // strncpy() leaves the destination unterminated when the source is at
  // least as long as the buffer; always terminate so readers of nsid cannot
  // run past the end of the field.
  msg.nsid[sizeof(msg.nsid) - 1] = '\0';
  int sock = coordinatorSocket;
  if (dmtcp_is_running_state()) {
    if (nsSock == -1) {
      nsSock = createNewSocketToCoordinator(COORD_ANY);
      JASSERT(nsSock != -1);
      nsSock = Util::changeFd(nsSock, PROTECTED_NS_FD);
      JASSERT(nsSock == PROTECTED_NS_FD);
      DmtcpMessage m(DMT_NAME_SERVICE_WORKER);
      JASSERT(Util::writeAll(nsSock, &m, sizeof(m)) == sizeof(m));
    }
    sock = nsSock;
  }

  JASSERT(Util::writeAll(sock, &msg, sizeof(msg)) == sizeof(msg));
  msg.poison();

  JASSERT(Util::readAll(sock, &msg, sizeof(msg)) == sizeof(msg));
  msg.assertValid();

  JASSERT(msg.type == DMT_NAME_SERVICE_QUERY_ALL_RESPONSE &&
          msg.extraBytes == msg.valLen);

  /*
   * We can't assume anything about the size of the user-specified buffer,
   * so we read into a safe, temporary buffer. This way there's no stale
   * data on the socket for the next reader.
   */
  void *tmp = JALLOC_HELPER_MALLOC(msg.extraBytes);
  JASSERT (Util::readAll(sock, tmp, msg.extraBytes) == msg.extraBytes);

  if (*len > 0) {
    if ((size_t)*len < msg.extraBytes) {
      // Caller buffer too small for the reply.
      JALLOC_HELPER_FREE(tmp);
      errno = ERANGE;
      return -1;
    } else {
      memcpy(*buf, tmp, msg.extraBytes);
      *len = msg.extraBytes;
      JALLOC_HELPER_FREE(tmp);
      return 0;
    }
  } else if (*len == 0) {
    // Caller must free this buffer
    *buf = tmp;
    *len = msg.extraBytes;
    return 0;
  }

  // Negative length.
  JALLOC_HELPER_FREE(tmp);
  errno = EINVAL;
  return -1;
}
Esempio n. 6
0
/*
 * Signal handler (three-argument form) invoked when the checkpoint timer
 * fires: logs a warning, flushes stdout, raises a failing JASSERT when
 * g_action is PRINT_WARNING_AND_EXIT, and then sets the disposition of 'sig'
 * to SIG_IGN. The 'si' and 'uc' arguments are not used.
 */
static void
timeout_handler(int sig, siginfo_t *si, void *uc)
{
  JWARNING(false).Text("Checkpoint took longer than expected.");
  fflush(stdout);

  const bool killRequested = (PRINT_WARNING_AND_EXIT == g_action);
  if (killRequested) {
    JASSERT(false)("Killing the application.");
  }

  // Ignore any further deliveries of this signal.
  signal(sig, SIG_IGN);
}
Esempio n. 7
0
// shutdownMtcpEngineOnFork will dlclose the old libmtcp.so and will
//   dlopen a new libmtcp.so.  DmtcpWorker constructor then calls
//   initializeMtcpEngine, which will then call mtcp_init.  We must close
//   the old SIG_CKPT handler prior to this, so that MTCP and mtcp_init()
//   don't think someone else is using their SIG_CKPT signal.
//
// Steps performed here:
//   1. Reset the disposition of the checkpoint signal (as reported by
//      DmtcpWorker::determineMtcpSignal()) to SIG_DFL; a failure is only
//      logged as a warning.
//   2. Call get_mtcp_symbol(REOPEN_MTCP).
//      NOTE(review): presumably this is what performs the dlclose/dlopen
//      described above -- confirm against get_mtcp_symbol().
void dmtcp::shutdownMtcpEngineOnFork()
{
    // Remove our signal handler from our SIG_CKPT
    // errno is cleared first so that the value reported through JASSERT_ERRNO
    // is not a stale one from an earlier call.
    errno = 0;
    JWARNING (SIG_ERR != _real_signal(dmtcp::DmtcpWorker::determineMtcpSignal(),
                                      SIG_DFL))
    (dmtcp::DmtcpWorker::determineMtcpSignal())
    (JASSERT_ERRNO)
    .Text("failed to reset child's checkpoint signal on fork");
    get_mtcp_symbol ( REOPEN_MTCP );
}
Esempio n. 8
0
/**
 * Shut the GPU manager down: join _thread and de-initialise OpenCL.
 *
 * Does nothing when _useOpenCL() is false or when _shutdown is already set;
 * otherwise sets _shutdown before joining. A failed pthread_join is only
 * reported as a warning.
 */
void GpuManager::shutdown() {
  // Skip when OpenCL is not in use, or when shutdown has already been run.
  if(!_useOpenCL() || _shutdown) {
    return;
  }

  _shutdown = true;

  int rv = pthread_join(_thread, NULL);
  JWARNING(rv==0)(rv).Text("pthread_join failed");

  OpenCLUtil::deinit();

  #ifdef GPU_TRACE
  std::cout << "pthead_join~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" << std::endl;
  #endif
}
Esempio n. 9
0
/*
 * Record (or overwrite) the entry pid -> uniquePid in _childTable.
 *
 * The table is accessed between _do_lock_tbl() and _do_unlock_tbl(). If an
 * entry for 'pid' already exists a warning is logged and the entry is then
 * replaced by 'uniquePid'.
 */
void dmtcp::ProcessInfo::insertChild(pid_t pid, dmtcp::UniquePid uniquePid)
{
  _do_lock_tbl();
  iterator i = _childTable.find(pid);
  // NOTE(review): (i->second) dereferences the iterator. That is only valid
  // if JWARNING evaluates its trailing arguments solely when the condition
  // is false (i.e. when i != end()) -- confirm against the macro definition.
  JWARNING(i == _childTable.end()) (pid) (uniquePid) (i->second)
    .Text("child pid already exists!");

  // Insert, or overwrite the existing mapping for this pid.
  _childTable[pid] = uniquePid;
  _do_unlock_tbl();

  JTRACE("Creating new virtualPid -> realPid mapping.") (pid) (uniquePid);
}
Esempio n. 10
0
/*
 * Ask the coordinator for the unique id associated with 'key' in the
 * name-service namespace 'id'.
 *
 * id       - NUL-terminated namespace id; a warning is logged when it does
 *            not fit in msg.nsid, and it is then truncated.
 * key      - pointer to key_len bytes of key data.
 * val      - caller buffer that receives the reply.
 * val_len  - in: capacity of 'val' in bytes; out: number of bytes written.
 * offset   - forwarded to the coordinator as msg.uniqueIdOffset.
 *
 * Returns 0 without contacting the coordinator when key, val or val_len is
 * NULL or key_len is 0; otherwise returns the number of bytes stored in
 * 'val'. Socket or protocol failures, and a reply larger than *val_len,
 * trip a JASSERT.
 */
int getUniqueIdFromCoordinator(const char *id,
                               const void *key,
                               uint32_t key_len,
                               void *val,
                               uint32_t *val_len,
                               uint32_t offset /* = 1 */)
{
  DmtcpMessage msg(DMT_NAME_SERVICE_GET_UNIQUE_ID);

  JWARNING(strlen(id) < sizeof(msg.nsid));
  strncpy(msg.nsid, id, sizeof msg.nsid);
  // strncpy() leaves the destination unterminated when the source is at
  // least as long as the buffer; always terminate so readers of nsid cannot
  // run past the end of the field.
  msg.nsid[sizeof(msg.nsid) - 1] = '\0';
  msg.keyLen = key_len;
  msg.extraBytes = key_len;
  msg.uniqueIdOffset = offset;

  // Validate the arguments BEFORE val_len is dereferenced below.
  if (key == NULL || key_len == 0 || val == NULL || val_len == NULL) {
    return 0;
  }

  msg.valLen = *val_len;
  int sock = coordinatorSocket;

  if (dmtcp_is_running_state()) {
    if (nsSock == -1) {
      nsSock = createNewSocketToCoordinator(COORD_ANY);
      JASSERT(nsSock != -1);
      nsSock = Util::changeFd(nsSock, PROTECTED_NS_FD);
      JASSERT(nsSock == PROTECTED_NS_FD);
      DmtcpMessage m(DMT_NAME_SERVICE_WORKER);
      JASSERT(Util::writeAll(nsSock, &m, sizeof(m)) == sizeof(m));
    }
    sock = nsSock;
  }

  // Request: header followed by the key bytes.
  JASSERT(Util::writeAll(sock, &msg, sizeof(msg)) == sizeof(msg));
  JASSERT(Util::writeAll(sock, key, key_len) == key_len);

  msg.poison();

  // Reply: header followed by msg.valLen bytes of value data.
  JASSERT(Util::readAll(sock, &msg, sizeof(msg)) == sizeof(msg));
  msg.assertValid();
  JASSERT(msg.type == DMT_NAME_SERVICE_GET_UNIQUE_ID_RESPONSE &&
          msg.extraBytes == msg.valLen);

  // The reply must fit into the caller's buffer.
  JASSERT(*val_len >= msg.valLen);
  *val_len = msg.valLen;
  JASSERT(Util::readAll(sock, val, *val_len) == *val_len);

  return *val_len;
}
Esempio n. 11
0
/*
 * Post-checkpoint processing of all connections.
 *
 * Refills the drained sockets, then calls postCheckpoint() on every
 * connection in the ConnectionList that has an entry with at least one fd
 * in _conToFds. Connections without fds are reported with a warning and
 * skipped.
 *
 * isRestart - forwarded unchanged to each connection's postCheckpoint().
 */
void dmtcp::ConnectionState::postCheckpoint(bool isRestart)
{
  _drain.refillAllSockets();

  ConnectionList& connections = ConnectionList::instance();
  for (ConnectionList::iterator i = connections.begin();
       i != connections.end();
       ++i) {
    if (_conToFds[i->first].size() == 0) {
      // No fds recorded for this connection: warn and move on.
      JWARNING(false) (i->first.conId())
        .Text("WARNING:: stale connections should be gone by now");
    } else {
      (i->second)->postCheckpoint(_conToFds[i->first], isRestart);
    }
  }
}
Esempio n. 12
0
/*
 * Put this process back into the process group recorded in _gid.
 *
 * This is attempted only when dmtcp_virtual_to_real_pid is available and
 * maps _gid to a different value; otherwise the step is skipped. When the
 * current process group already equals _gid nothing is changed. A failing
 * setpgid() is reported as a warning only.
 */
void dmtcp::ProcessInfo::restoreProcessGroupInfo()
{
  // Restore group assignment
  const bool gidIsRemapped =
    dmtcp_virtual_to_real_pid && dmtcp_virtual_to_real_pid(_gid) != _gid;
  if (!gidIsRemapped) {
    JTRACE("SKIP Group information, GID unknown");
    return;
  }

  // Group ID is known inside checkpointed processes
  pid_t cgid = getpgid(0);
  if (_gid == cgid) {
    JTRACE("Group is already assigned") (_gid) (cgid);
    return;
  }

  JTRACE("Restore Group Assignment")
    (_gid) (_fgid) (cgid) (_pid) (_ppid) (getppid());
  JWARNING(setpgid(0, _gid) == 0) (_gid) (JASSERT_ERRNO)
    .Text("Cannot change group information");
}
Esempio n. 13
0
static bool
initialize_and_start_perf_attr(struct perf_event_attr *pes, int i, __u32 type, __u64 config)
{
  JASSERT(pes);
  pes->type = type;
  pes->size = sizeof(struct perf_event_attr);
  pes->config = config;
  pes->disabled = 1;
  pes->exclude_kernel = 1;
  pes->exclude_hv = 1;
  fd[i] = perf_event_open1(pes, 0, -1, -1, 0);
  if (fd[i] < 0) {
    JWARNING(false)("Error opening leader\n")(pes->config);
    ioctl(fd[i], PERF_EVENT_IOC_DISABLE, 0);
    return false;
  }
  ioctl(fd[i], PERF_EVENT_IOC_RESET, 0);
  ioctl(fd[i], PERF_EVENT_IOC_ENABLE, 0);
  return true;
}
Esempio n. 14
0
/*
 * Restore the process-group assignment of this process.
 *
 * NOTE: the entire body is wrapped in "#if 0", so as written this function
 * compiles to a no-op. The disabled code would move the process into group
 * _gid via setpgid() when that group exists in the VirtualPidTable and
 * differs from the current group.
 */
void dmtcp::ProcessInfo::restoreProcessGroupInfo()
{
  // FIXME: This needs to be fixed
#if 0
#ifdef PID_VIRTUALIZATION
  // Restore group assignment
  if( VirtualPidTable::instance().pidExists(_gid) ){
    pid_t cgid = getpgid(0);
    // Group ID is known inside checkpointed processes
    if( _gid != cgid && _pid != _gid ){
      JTRACE("Restore Group Assignment")
        ( _gid ) ( _fgid ) ( cgid ) ( _pid ) ( _ppid ) ( getppid() );
      JWARNING( setpgid(0,_gid) == 0 ) (_gid) (JASSERT_ERRNO)
        .Text("Cannot change group information");
    }else{
      JTRACE("Group is already assigned")(_gid)(cgid);
    }
  }else{
    JTRACE("SKIP Group information, GID unknown");
  }
#endif
#endif
}
Esempio n. 15
0
/*
 * Copy the unique pts name of the connection registered for 'fd' into the
 * caller's buffer.
 *
 * fd     - descriptor looked up in KernelDeviceToConnection.
 * buf    - destination buffer.
 * buflen - size of 'buf' in bytes, including room for the terminating NUL.
 *
 * Returns 0 on success. When the name (plus terminator) does not fit,
 * logs a warning, sets errno to ERANGE and returns -1.
 *
 * NOTE(review): the connection is cast to PtyConnection without a type
 * check -- presumably callers only pass pty descriptors; confirm.
 */
static int ptsname_r_work(int fd, char * buf, size_t buflen)
{
  JTRACE("Calling ptsname_r");

  dmtcp::PtyConnection* ptyCon = (dmtcp::PtyConnection*)
    &dmtcp::KernelDeviceToConnection::instance().retrieve(fd);

  dmtcp::string uniquePtsName = ptyCon->uniquePtsName();

  JTRACE("ptsname_r") (uniquePtsName);

  // Success path: the name and its terminating NUL fit into the buffer.
  if (uniquePtsName.length() < buflen)
  {
    strcpy(buf, uniquePtsName.c_str());
    return 0;
  }

  JWARNING(false) (uniquePtsName) (uniquePtsName.length()) (buflen)
    .Text("fake ptsname() too long for user buffer");
  errno = ERANGE;
  return -1;
}
Esempio n. 16
0
/**
 * Emit the opening of the iteration loops.
 *
 * For a single call this opens a plain "{" scope and declares every
 * iteration variable as a "const IndexT" initialised to its begin value.
 * Otherwise it opens one loop per iteration variable over [begin, end) with
 * the matching step, forward or reverse depending on _order.
 */
void petabricks::IterationDefinition::genLoopBegin(CodeGenerator& o){
  if(isSingleCall()){
    o.write("{");
    for(size_t dim=0; dim<_var.size(); ++dim){
      o.varDecl("const IndexT "+_var[dim]->toString()+" = "+_begin[dim]->toString());
    }
    return;
  }

  o.comment("Iterate along all the directions");

  // NOTE: the index must stay named 'i' because JWARNING prints its
  // condition text verbatim.
  for(size_t i=0; i<_var.size(); ++i){
    FormulaPtr lower   = _begin[i];
    FormulaPtr upper   = _end[i];
    FormulaPtr stride  = _step[i];
    FormulaPtr loopVar = _var[i];
    //TODO: expand to reorder dimensions
    const bool reverseOnly =
      !_order.canIterateForward(i) && _order.canIterateBackward(i);
    if(reverseOnly){
      o.beginReverseFor(loopVar->toString(), lower, upper, stride);
    }else{
      // Either forward is legal, or neither direction is; warn in the latter case.
      JWARNING(_order.canIterateForward(i))(_order).Text("couldn't find valid iteration order, assuming forward");
      o.beginFor(loopVar->toString(), lower, upper, stride);
    }
  }
}
Esempio n. 17
0
/*
 * Restore connection options after a restart, in two passes over the
 * ConnectionList, and then rebuild the KernelDeviceToConnection table from
 * _conToFds.
 *
 * Pass 1 handles every connection except pseudo-terminal slaves
 * (PTY_SLAVE / PTY_BSD_SLAVE); pass 2 handles exactly those slaves.
 * Connections with no fds in _conToFds are skipped in both passes (and
 * reported with a warning in the first).
 */
void dmtcp::ConnectionState::postRestart()
{
  ConnectionList& connections = ConnectionList::instance();
  ConnectionList::iterator i;

  // Two part restoreOptions. See the comments in doReconnect()
  // Part 1: Restore options for all but Pseudo-terminal slaves
  for (i = connections.begin(); i != connections.end(); ++i) {
    JWARNING(_conToFds[i->first].size() > 0)
      .Text("stale connections should be gone by now");
    if (_conToFds[i->first].size() == 0) {
      continue;
    }

    const bool isPtySlave =
      (i->second)->conType() == Connection::PTY &&
      (((PtyConnection*) (i->second))->ptyType() == PtyConnection::PTY_SLAVE ||
       ((PtyConnection*) (i->second))->ptyType() == PtyConnection::PTY_BSD_SLAVE);
    if (!isPtySlave) {
      (i->second)->restoreOptions(_conToFds[i->first]);
    }
  }

  // Part 2: Restore options for all Pseudo-terminal slaves
  for (i = connections.begin(); i != connections.end(); ++i) {
    if (_conToFds[i->first].size() == 0) {
      continue;
    }

    const bool isPtySlave =
      (i->second)->conType() == Connection::PTY &&
      (((PtyConnection*) (i->second))->ptyType() == PtyConnection::PTY_SLAVE ||
       ((PtyConnection*) (i->second))->ptyType() == PtyConnection::PTY_BSD_SLAVE);
    if (isPtySlave) {
      (i->second)->restoreOptions(_conToFds[i->first]);
    }
  }

  KernelDeviceToConnection::instance().dbgSpamFds();

  //fix our device table to match the new world order
  KernelDeviceToConnection::instance() = KernelDeviceToConnection(_conToFds);
}
Esempio n. 18
0
/*
 * Signal handler (three-argument form) that calls setup_perf_ctr() and logs
 * a warning when that call returns a false value. None of the handler
 * arguments (sig, si, uc) are used.
 */
void startCtrsSignalHandler(int sig, siginfo_t *si, void *uc)
{
  JWARNING(setup_perf_ctr()).Text("Error setting up perf ctrs.");
}
Esempio n. 19
0
/*
 * DMTCP plugin event hook for the perf-counter statistics plugin.
 *
 * Behaviour depends on two environment variables:
 *   DMTCP_START_CTRS_ON_RESTART_STRATEGY - when set, counters are set up on
 *       restart and written out at the next checkpoint after a restart;
 *       when unset, the handlers are installed at init time instead.
 *   DMTCP_KILL_ON_RESUME_STRATEGY - when set, the process exits on resume.
 * The stats file name comes from the STATFILE environment variable.
 *
 * event - the event being delivered.
 * data  - event payload; only forwarded to the next hook.
 */
void dmtcp_event_hook(DmtcpEvent_t event, DmtcpEventData_t *data)
{
  static char *filename = NULL;
  static bool restartingFromCkpt = false;
  static FILE *outfp = NULL;

  // Evaluated once per event; each case below consults it at most once.
  const bool startCtrsOnRestart =
    (getenv("DMTCP_START_CTRS_ON_RESTART_STRATEGY") != NULL);

  switch (event) {
    case DMTCP_EVENT_INIT:
      if (!startCtrsOnRestart) {
        setup_handlers();
        filename = getStatsFilename(getenv("STATFILE"));
        JWARNING(filename != NULL).Text("Could not get the stats filename in the init event.");
        JTRACE("Filename: ")(filename);
      }
      break;

    case DMTCP_EVENT_WRITE_CKPT:
      JTRACE("CHKP");
      if (!startCtrsOnRestart) {
        break;
      }
      filename = getenv("STATFILE");
      if (!restartingFromCkpt) {
        break;
      }
      // First checkpoint after a restart: dump the counters to the file.
      JTRACE("WRITE CHKP");
      JASSERT(filename);
      outfp = fopen(filename, "w+");
      if (!outfp) {
        perror("Error opening stats file in w+ mode");
        JASSERT(false);
      }
      read_ctrs(outfp);
      fclose(outfp);
      restartingFromCkpt = false;
      break;

    case DMTCP_EVENT_RESUME:
      if (getenv("DMTCP_KILL_ON_RESUME_STRATEGY")) {
        exit(0);
      }
      break;

    case DMTCP_EVENT_RESTART:
      if (startCtrsOnRestart) {
        restartingFromCkpt = true;
        filename = getStatsFilename(getenv("STATFILE"));
        JWARNING(filename != NULL).Text("Could not get the stats filename in the restart event.");
        JTRACE("Filename: ")(filename);
        JWARNING(setup_perf_ctr()).Text("Error setting up perf ctrs.");
      }
      break;

    case DMTCP_EVENT_RESUME_USER_THREAD:
      if (startCtrsOnRestart) {
        filename = getStatsFilename(getenv("STATFILE"));
        JWARNING(filename != NULL).Text("Could not get the stats filename in the resume_user_thread event.");
        JTRACE("Filename: ")(filename);
      }
      break;

    default:
      break;
  }

  // Always hand the event on to the next plugin in the chain.
  DMTCP_NEXT_EVENT_HOOK(event, data);
}
Esempio n. 20
0
/*
 * Generate the shell script that restarts a checkpointed computation and
 * point the generic "<RESTART_SCRIPT_BASENAME>.<RESTART_SCRIPT_EXT>" symlink
 * in the same directory at it.
 *
 * ckptDir               - directory in which the script is created.
 * uniqueCkptFilenames   - when true, the computation generation is appended
 *                         to the script name.
 * ckptTimeStamp         - checkpoint time, written into the script.
 * theCheckpointInterval - default checkpoint interval written into the script.
 * thePort               - default coordinator port written into the script.
 * compId                - computation id used in the script name.
 * restartFilenames      - map from host name to the checkpoint images that
 *                         belong to that host.
 *
 * Returns the full path of the generated script. Failure to create the
 * script, to open its directory, or to stat/chmod it trips a JASSERT; a
 * failing symlink is only reported as a warning.
 */
string writeScript(const string& ckptDir,
                   bool uniqueCkptFilenames,
                   const time_t& ckptTimeStamp,
                   const uint32_t theCheckpointInterval,
                   const int thePort,
                   const UniquePid& compId,
                   const map<string, vector<string> >& restartFilenames)
{
  ostringstream o;
  string uniqueFilename;

  o << string(ckptDir) << "/"
    << RESTART_SCRIPT_BASENAME << "_" << compId;
  if (uniqueCkptFilenames) {
    o << "_" << std::setw(5) << std::setfill('0') << compId.computationGeneration();
  }
  o << "." << RESTART_SCRIPT_EXT;
  uniqueFilename = o.str();

  const bool isSingleHost = (restartFilenames.size() == 1);

  map< string, vector<string> >::const_iterator host;

  // Must start at zero: it is accumulated with += below.
  size_t numPeers = 0;
  for (host = restartFilenames.begin();
       host != restartFilenames.end();
       ++host) {
    numPeers += host->second.size();
  }

  vector<string>::const_iterator file;

  // Zero-initialised, and gethostname() is given one byte less than the
  // buffer, so the name is NUL-terminated even if it is truncated or the
  // call fails.
  char hostname[80] = {0};
  char timestamp[80] = {0};
  gethostname ( hostname, sizeof(hostname) - 1 );

  JTRACE ( "writing restart script" ) ( uniqueFilename );

  FILE* fp = fopen ( uniqueFilename.c_str(),"w" );
  JASSERT ( fp!=0 )(JASSERT_ERRNO)( uniqueFilename )
    .Text ( "failed to open file" );

  fprintf ( fp, "%s", header );
  fprintf ( fp, "%s", checkLocal );
  fprintf ( fp, "%s", slurmHelperContactFunction );
  fprintf ( fp, "%s", usage );

  if (ctime_r(&ckptTimeStamp, timestamp) != NULL) {
    // Remove the trailing '\n'
    const size_t tsLen = strlen(timestamp);
    if (tsLen > 0 && timestamp[tsLen - 1] == '\n') {
      timestamp[tsLen - 1] = '\0';
    }
  } else {
    // Leave an empty timestamp rather than reading an indeterminate buffer.
    timestamp[0] = '\0';
  }
  fprintf ( fp, "ckpt_timestamp=\"%s\"\n\n", timestamp );

  // theCheckpointInterval is unsigned, so it is printed with %u.
  fprintf ( fp, "coord_host=$" ENV_VAR_NAME_HOST "\n"
                "if test -z \"$" ENV_VAR_NAME_HOST "\"; then\n"
                "  coord_host=%s\nfi\n\n"
                "coord_port=$" ENV_VAR_NAME_PORT "\n"
                "if test -z \"$" ENV_VAR_NAME_PORT "\"; then\n"
                "  coord_port=%d\nfi\n\n"
                "checkpoint_interval=$" ENV_VAR_CKPT_INTR "\n"
                "if test -z \"$" ENV_VAR_CKPT_INTR "\"; then\n"
                "  checkpoint_interval=%u\nfi\n"
                "export DMTCP_CHECKPOINT_INTERVAL=${checkpoint_interval}\n\n",
                hostname, thePort,
                static_cast<unsigned int>(theCheckpointInterval) );

  fprintf ( fp, "%s", cmdlineArgHandler );

  fprintf ( fp, "dmt_rstr_cmd=%s/" DMTCP_RESTART_CMD "\n"
                "which $dmt_rstr_cmd > /dev/null 2>&1"
                " || dmt_rstr_cmd=" DMTCP_RESTART_CMD "\n"
                "which $dmt_rstr_cmd > /dev/null 2>&1"
                " || echo \"$0: $dmt_rstr_cmd not found\"\n"
                "which $dmt_rstr_cmd > /dev/null 2>&1 || exit 1\n\n",
                jalib::Filesystem::GetProgramDir().c_str());

  fprintf ( fp, "# Number of hosts in the computation = %zu\n"
                "# Number of processes in the computation = %zu\n\n",
                restartFilenames.size(), numPeers );

  if ( isSingleHost ) {
    JTRACE ( "Single HOST" );

    host=restartFilenames.begin();
    // Separate stream: do not shadow the outer 'o' used for the file name.
    ostringstream ckptFiles;
    for ( file=host->second.begin(); file!=host->second.end(); ++file ) {
      ckptFiles << " " << *file;
    }
    fprintf ( fp, "given_ckpt_files=\"%s\"\n\n", ckptFiles.str().c_str());

    fprintf ( fp, "%s", singleHostProcessing );
  }
  else
  {
    fprintf ( fp, "%s",
              "# SYNTAX:\n"
              "#  :: <HOST> :<MODE>: <CHECKPOINT_IMAGE> ...\n"
              "# Host names and filenames must not include \':\'\n"
              "# At most one fg (foreground) mode allowed; it must be last.\n"
              "# \'maybexterm\' and \'maybebg\' are set from <MODE>.\n");

    fprintf ( fp, "%s", "worker_ckpts=\'" );
    for ( host=restartFilenames.begin(); host!=restartFilenames.end(); ++host ) {
      fprintf ( fp, "\n :: %s :bg:", host->first.c_str() );
      for ( file=host->second.begin(); file!=host->second.end(); ++file ) {
        fprintf ( fp," %s", file->c_str() );
      }
    }
    fprintf ( fp, "%s", "\n\'\n\n" );

    fprintf( fp,  "# Check for resource manager\n"
                  "ibrun_path=$(which ibrun 2> /dev/null)\n"
                  "if [ ! -n \"$ibrun_path\" ]; then\n"
                  "  discover_rm_path=$(which dmtcp_discover_rm)\n"
                  "  if [ -n \"$discover_rm_path\" ]; then\n"
                  "    eval $(dmtcp_discover_rm -t)\n"
                  "    srun_path=$(which srun 2> /dev/null)\n"
                  "    llaunch=`which dmtcp_rm_loclaunch`\n"
                  "    if [ $RES_MANAGER = \"SLURM\" ] && [ -n \"$srun_path\" ]; then\n"
                  "      eval $(dmtcp_discover_rm -n \"$worker_ckpts\")\n"
                  "      if [ -n \"$DMTCP_DISCOVER_RM_ERROR\" ]; then\n"
                  "          echo \"Restart error: $DMTCP_DISCOVER_RM_ERROR\"\n"
                  "          echo \"Allocated resources: $manager_resources\"\n"
                  "          exit 0\n"
                  "      fi\n"
                  "      export DMTCP_REMLAUNCH_NODES=$DMTCP_REMLAUNCH_NODES\n"
                  "      bound=$(($DMTCP_REMLAUNCH_NODES - 1))\n"
                  "      for i in $(seq 0 $bound); do\n"
                  "        eval \"val=\\${DMTCP_REMLAUNCH_${i}_SLOTS}\"\n"
                  "        export DMTCP_REMLAUNCH_${i}_SLOTS=\"$val\"\n"
                  "        bound2=$(($val - 1))\n"
                  "        for j in $(seq 0 $bound2); do\n"
                  "          eval \"ckpts=\\${DMTCP_REMLAUNCH_${i}_${j}}\"\n"
                  "          export DMTCP_REMLAUNCH_${i}_${j}=\"$ckpts\"\n"
                  "        done\n"
                  "      done\n"
                  "      if [ \"$DMTCP_DISCOVER_PM_TYPE\" = \"HYDRA\" ]; then\n"
                  "        export DMTCP_SRUN_HELPER_SYNCFILE=`mktemp ./tmp.XXXXXXXXXX`\n"
                  "        rm $DMTCP_SRUN_HELPER_SYNCFILE\n"
                  "        dmtcp_srun_helper -r $srun_path \"$llaunch\"\n"
                  "        if [ ! -f $DMTCP_SRUN_HELPER_SYNCFILE ]; then\n"
                  "          echo \"Error launching application\"\n"
                  "          exit 1\n"
                  "        fi\n"
                  "        # export helper contact info\n"
                  "        . $DMTCP_SRUN_HELPER_SYNCFILE\n"
                  "        pass_slurm_helper_contact \"$DMTCP_LAUNCH_CKPTS\"\n"
                  "        rm $DMTCP_SRUN_HELPER_SYNCFILE\n"
                  "        dmtcp_restart --join --coord-host $DMTCP_COORD_HOST"
                              " --coord-port $DMTCP_COORD_PORT"
                              " $DMTCP_LAUNCH_CKPTS\n"
                  "      else\n"
                  "        DMTCP_REMLAUNCH_0_0=\"$DMTCP_REMLAUNCH_0_0"
                                                     " $DMTCP_LAUNCH_CKPTS\"\n"
                  "        $srun_path \"$llaunch\"\n"
                  "      fi\n"
                  "      exit 0\n"
                  "    elif [ $RES_MANAGER = \"TORQUE\" ]; then\n"
                  "      #eval $(dmtcp_discover_rm \"$worker_ckpts\")\n"
                  "      #if [ -n \"$new_worker_ckpts\" ]; then\n"
                  "      #  worker_ckpts=\"$new_worker_ckpts\"\n"
                  "      #fi\n"
                  "      eval $(dmtcp_discover_rm -n \"$worker_ckpts\")\n"
                  "      if [ -n \"$DMTCP_DISCOVER_RM_ERROR\" ]; then\n"
                  "          echo \"Restart error: $DMTCP_DISCOVER_RM_ERROR\"\n"
                  "          echo \"Allocated resources: $manager_resources\"\n"
                  "          exit 0\n"
                  "      fi\n"
                  "      arguments=\"PATH=$PATH DMTCP_COORD_HOST=$DMTCP_COORD_HOST"
                                      " DMTCP_COORD_PORT=$DMTCP_COORD_PORT\"\n"
                  "      arguments=$arguments\" DMTCP_CHECKPOINT_INTERVAL=$DMTCP_CHECKPOINT_INTERVAL\"\n"
                  "      arguments=$arguments\" DMTCP_TMPDIR=$DMTCP_TMPDIR\"\n"
                  "      arguments=$arguments\" DMTCP_REMLAUNCH_NODES=$DMTCP_REMLAUNCH_NODES\"\n"
                  "      bound=$(($DMTCP_REMLAUNCH_NODES - 1))\n"
                  "      for i in $(seq 0 $bound); do\n"
                  "        eval \"val=\\${DMTCP_REMLAUNCH_${i}_SLOTS}\"\n"
                  "        arguments=$arguments\" DMTCP_REMLAUNCH_${i}_SLOTS=\\\"$val\\\"\"\n"
                  "        bound2=$(($val - 1))\n"
                  "        for j in $(seq 0 $bound2); do\n"
                  "          eval \"ckpts=\\${DMTCP_REMLAUNCH_${i}_${j}}\"\n"
                  "          arguments=$arguments\" DMTCP_REMLAUNCH_${i}_${j}=\\\"$ckpts\\\"\"\n"
                  "        done\n"
                  "      done\n"
                  "      pbsdsh -u \"$llaunch\" \"$arguments\"\n"
                  "      exit 0\n"
                  "    fi\n"
                  "  fi\n"
                  "fi\n"
                  "\n\n"
             );

    fprintf ( fp, "%s", multiHostProcessing );
  }

  fclose ( fp );
  {
    string filename = RESTART_SCRIPT_BASENAME "." RESTART_SCRIPT_EXT;
    string dirname = jalib::Filesystem::DirName(uniqueFilename);
    int dirfd = open(dirname.c_str(), O_DIRECTORY | O_RDONLY);
    JASSERT(dirfd != -1) (dirname) (JASSERT_ERRNO);

    /* Set execute permission for user. */
    struct stat buf;
    JASSERT(::stat(uniqueFilename.c_str(), &buf) == 0);
    JASSERT(chmod(uniqueFilename.c_str(), buf.st_mode | S_IXUSR) == 0);
    // Create a symlink from
    //   dmtcp_restart_script.sh -> dmtcp_restart_script_<curCompId>.sh
    // The stale link is removed relative to dirfd, i.e. in the same
    // directory in which symlinkat() below creates the new one. A plain
    // unlink() would act on the current working directory instead.
    // Failure (e.g. no previous link) is deliberately ignored.
    unlinkat(dirfd, filename.c_str(), 0);
    JTRACE("linking \"dmtcp_restart_script.sh\" filename to uniqueFilename")
      (filename) (dirname) (uniqueFilename);
    // FIXME:  Handle error case of symlink()
    JWARNING(symlinkat(basename(uniqueFilename.c_str()), dirfd, filename.c_str()) == 0) (JASSERT_ERRNO);
    JASSERT(close(dirfd) == 0);
  }
  return uniqueFilename;
}