Beispiel #1
0
/*
 * Setup a virtual coordinator. It's part of the running process (i.e., no
 * separate process is created).
 *
 * FIXME: This is the only place in this file where we use JSocket. May be get
 * rid of it here too?
 */
void
setupVirtualCoordinator(CoordinatorInfo *coordInfo, struct in_addr *localIP)
{
  string host = "";
  int port;
  getCoordHostAndPort(COORD_NONE, host, &port);
  jalib::JSocket sock =
    jalib::JServerSocket(jalib::JSockAddr::ANY, port).sockfd();
  JASSERT(sock.isValid()) (port) (JASSERT_ERRNO)
    .Text("Failed to create listen socket.");

  Util::changeFd(sock.sockfd(), PROTECTED_COORD_FD);
  JASSERT(Util::isValidFd(coordinatorSocket));

  setCoordPort(sock.port());

  pid_t ppid = getppid();
  Util::setVirtualPidEnvVar(INITIAL_VIRTUAL_PID, ppid, ppid);

  UniquePid coordId = UniquePid(INITIAL_VIRTUAL_PID,
                                UniquePid::ThisProcess().hostid(),
                                UniquePid::ThisProcess().time());

  coordInfo->id = coordId.upid();
  coordInfo->timeStamp = coordId.time();
  coordInfo->addrLen = 0;
  if (getenv(ENV_VAR_CKPT_INTR) != NULL) {
    coordInfo->interval = (uint32_t)strtol(getenv(ENV_VAR_CKPT_INTR), NULL, 0);
  } else {
    coordInfo->interval = 0;
  }
  memset(&coordInfo->addr, 0, sizeof(coordInfo->addr));
  memset(localIP, 0, sizeof(*localIP));
}
Beispiel #2
0
EXTERNC const char* dmtcp_get_computation_id_str(void)
{
  static string *compid_str = NULL;
  if (compid_str == NULL) {
    UniquePid compId = SharedData::getCompId();
    compid_str = new string(compId.toString());
  }
  return compid_str->c_str();
}
Beispiel #3
0
string writeScript(const string& ckptDir,
                   bool uniqueCkptFilenames,
                   const time_t& ckptTimeStamp,
                   const uint32_t theCheckpointInterval,
                   const int thePort,
                   const UniquePid& compId,
                   const map<string, vector<string> >& restartFilenames)
{
  ostringstream o;
  string uniqueFilename;

  o << string(ckptDir) << "/"
    << RESTART_SCRIPT_BASENAME << "_" << compId;
  if (uniqueCkptFilenames) {
    o << "_" << std::setw(5) << std::setfill('0') << compId.computationGeneration();
  }
  o << "." << RESTART_SCRIPT_EXT;
  uniqueFilename = o.str();

  const bool isSingleHost = (restartFilenames.size() == 1);

  map< string, vector<string> >::const_iterator host;

  size_t numPeers;
  for (host = restartFilenames.begin();
       host != restartFilenames.end();
       host++) {
    numPeers += host->second.size();
  }

  vector<string>::const_iterator file;

  char hostname[80];
  char timestamp[80];
  gethostname ( hostname, 80 );

  JTRACE ( "writing restart script" ) ( uniqueFilename );

  FILE* fp = fopen ( uniqueFilename.c_str(),"w" );
  JASSERT ( fp!=0 )(JASSERT_ERRNO)( uniqueFilename )
    .Text ( "failed to open file" );

  fprintf ( fp, "%s", header );
  fprintf ( fp, "%s", checkLocal );
  fprintf ( fp, "%s", slurmHelperContactFunction );
  fprintf ( fp, "%s", usage );

  ctime_r(&ckptTimeStamp, timestamp);
  // Remove the trailing '\n'
  timestamp[strlen(timestamp) - 1] = '\0';
  fprintf ( fp, "ckpt_timestamp=\"%s\"\n\n", timestamp );

  fprintf ( fp, "coord_host=$" ENV_VAR_NAME_HOST "\n"
                "if test -z \"$" ENV_VAR_NAME_HOST "\"; then\n"
                "  coord_host=%s\nfi\n\n"
                "coord_port=$" ENV_VAR_NAME_PORT "\n"
                "if test -z \"$" ENV_VAR_NAME_PORT "\"; then\n"
                "  coord_port=%d\nfi\n\n"
                "checkpoint_interval=$" ENV_VAR_CKPT_INTR "\n"
                "if test -z \"$" ENV_VAR_CKPT_INTR "\"; then\n"
                "  checkpoint_interval=%d\nfi\n"
                "export DMTCP_CHECKPOINT_INTERVAL=${checkpoint_interval}\n\n",
                hostname, thePort, theCheckpointInterval );

  fprintf ( fp, "%s", cmdlineArgHandler );

  fprintf ( fp, "dmt_rstr_cmd=%s/" DMTCP_RESTART_CMD "\n"
                "which $dmt_rstr_cmd > /dev/null 2>&1"
                " || dmt_rstr_cmd=" DMTCP_RESTART_CMD "\n"
                "which $dmt_rstr_cmd > /dev/null 2>&1"
                " || echo \"$0: $dmt_rstr_cmd not found\"\n"
                "which $dmt_rstr_cmd > /dev/null 2>&1 || exit 1\n\n",
                jalib::Filesystem::GetProgramDir().c_str());

  fprintf ( fp, "# Number of hosts in the computation = %zu\n"
                "# Number of processes in the computation = %zu\n\n",
                restartFilenames.size(), numPeers );

  if ( isSingleHost ) {
    JTRACE ( "Single HOST" );

    host=restartFilenames.begin();
    ostringstream o;
    for ( file=host->second.begin(); file!=host->second.end(); ++file ) {
      o << " " << *file;
    }
    fprintf ( fp, "given_ckpt_files=\"%s\"\n\n", o.str().c_str());

    fprintf ( fp, "%s", singleHostProcessing );
  }
  else
  {
    fprintf ( fp, "%s",
              "# SYNTAX:\n"
              "#  :: <HOST> :<MODE>: <CHECKPOINT_IMAGE> ...\n"
              "# Host names and filenames must not include \':\'\n"
              "# At most one fg (foreground) mode allowed; it must be last.\n"
              "# \'maybexterm\' and \'maybebg\' are set from <MODE>.\n");

    fprintf ( fp, "%s", "worker_ckpts=\'" );
    for ( host=restartFilenames.begin(); host!=restartFilenames.end(); ++host ) {
      fprintf ( fp, "\n :: %s :bg:", host->first.c_str() );
      for ( file=host->second.begin(); file!=host->second.end(); ++file ) {
        fprintf ( fp," %s", file->c_str() );
      }
    }
    fprintf ( fp, "%s", "\n\'\n\n" );

    fprintf( fp,  "# Check for resource manager\n"
                  "ibrun_path=$(which ibrun 2> /dev/null)\n"
                  "if [ ! -n \"$ibrun_path\" ]; then\n"
                  "  discover_rm_path=$(which dmtcp_discover_rm)\n"
                  "  if [ -n \"$discover_rm_path\" ]; then\n"
                  "    eval $(dmtcp_discover_rm -t)\n"
                  "    srun_path=$(which srun 2> /dev/null)\n"
                  "    llaunch=`which dmtcp_rm_loclaunch`\n"
                  "    if [ $RES_MANAGER = \"SLURM\" ] && [ -n \"$srun_path\" ]; then\n"
                  "      eval $(dmtcp_discover_rm -n \"$worker_ckpts\")\n"
                  "      if [ -n \"$DMTCP_DISCOVER_RM_ERROR\" ]; then\n"
                  "          echo \"Restart error: $DMTCP_DISCOVER_RM_ERROR\"\n"
                  "          echo \"Allocated resources: $manager_resources\"\n"
                  "          exit 0\n"
                  "      fi\n"
                  "      export DMTCP_REMLAUNCH_NODES=$DMTCP_REMLAUNCH_NODES\n"
                  "      bound=$(($DMTCP_REMLAUNCH_NODES - 1))\n"
                  "      for i in $(seq 0 $bound); do\n"
                  "        eval \"val=\\${DMTCP_REMLAUNCH_${i}_SLOTS}\"\n"
                  "        export DMTCP_REMLAUNCH_${i}_SLOTS=\"$val\"\n"
                  "        bound2=$(($val - 1))\n"
                  "        for j in $(seq 0 $bound2); do\n"
                  "          eval \"ckpts=\\${DMTCP_REMLAUNCH_${i}_${j}}\"\n"
                  "          export DMTCP_REMLAUNCH_${i}_${j}=\"$ckpts\"\n"
                  "        done\n"
                  "      done\n"
                  "      if [ \"$DMTCP_DISCOVER_PM_TYPE\" = \"HYDRA\" ]; then\n"
                  "        export DMTCP_SRUN_HELPER_SYNCFILE=`mktemp ./tmp.XXXXXXXXXX`\n"
                  "        rm $DMTCP_SRUN_HELPER_SYNCFILE\n"
                  "        dmtcp_srun_helper -r $srun_path \"$llaunch\"\n"
                  "        if [ ! -f $DMTCP_SRUN_HELPER_SYNCFILE ]; then\n"
                  "          echo \"Error launching application\"\n"
                  "          exit 1\n"
                  "        fi\n"
                  "        # export helper contact info\n"
                  "        . $DMTCP_SRUN_HELPER_SYNCFILE\n"
                  "        pass_slurm_helper_contact \"$DMTCP_LAUNCH_CKPTS\"\n"
                  "        rm $DMTCP_SRUN_HELPER_SYNCFILE\n"
                  "        dmtcp_restart --join --coord-host $DMTCP_COORD_HOST"
                              " --coord-port $DMTCP_COORD_PORT"
                              " $DMTCP_LAUNCH_CKPTS\n"
                  "      else\n"
                  "        DMTCP_REMLAUNCH_0_0=\"$DMTCP_REMLAUNCH_0_0"
                                                     " $DMTCP_LAUNCH_CKPTS\"\n"
                  "        $srun_path \"$llaunch\"\n"
                  "      fi\n"
                  "      exit 0\n"
                  "    elif [ $RES_MANAGER = \"TORQUE\" ]; then\n"
                  "      #eval $(dmtcp_discover_rm \"$worker_ckpts\")\n"
                  "      #if [ -n \"$new_worker_ckpts\" ]; then\n"
                  "      #  worker_ckpts=\"$new_worker_ckpts\"\n"
                  "      #fi\n"
                  "      eval $(dmtcp_discover_rm -n \"$worker_ckpts\")\n"
                  "      if [ -n \"$DMTCP_DISCOVER_RM_ERROR\" ]; then\n"
                  "          echo \"Restart error: $DMTCP_DISCOVER_RM_ERROR\"\n"
                  "          echo \"Allocated resources: $manager_resources\"\n"
                  "          exit 0\n"
                  "      fi\n"
                  "      arguments=\"PATH=$PATH DMTCP_COORD_HOST=$DMTCP_COORD_HOST"
                                      " DMTCP_COORD_PORT=$DMTCP_COORD_PORT\"\n"
                  "      arguments=$arguments\" DMTCP_CHECKPOINT_INTERVAL=$DMTCP_CHECKPOINT_INTERVAL\"\n"
                  "      arguments=$arguments\" DMTCP_TMPDIR=$DMTCP_TMPDIR\"\n"
                  "      arguments=$arguments\" DMTCP_REMLAUNCH_NODES=$DMTCP_REMLAUNCH_NODES\"\n"
                  "      bound=$(($DMTCP_REMLAUNCH_NODES - 1))\n"
                  "      for i in $(seq 0 $bound); do\n"
                  "        eval \"val=\\${DMTCP_REMLAUNCH_${i}_SLOTS}\"\n"
                  "        arguments=$arguments\" DMTCP_REMLAUNCH_${i}_SLOTS=\\\"$val\\\"\"\n"
                  "        bound2=$(($val - 1))\n"
                  "        for j in $(seq 0 $bound2); do\n"
                  "          eval \"ckpts=\\${DMTCP_REMLAUNCH_${i}_${j}}\"\n"
                  "          arguments=$arguments\" DMTCP_REMLAUNCH_${i}_${j}=\\\"$ckpts\\\"\"\n"
                  "        done\n"
                  "      done\n"
                  "      pbsdsh -u \"$llaunch\" \"$arguments\"\n"
                  "      exit 0\n"
                  "    fi\n"
                  "  fi\n"
                  "fi\n"
                  "\n\n"
             );

    fprintf ( fp, "%s", multiHostProcessing );
  }

  fclose ( fp );
  {
    string filename = RESTART_SCRIPT_BASENAME "." RESTART_SCRIPT_EXT;
    string dirname = jalib::Filesystem::DirName(uniqueFilename);
    int dirfd = open(dirname.c_str(), O_DIRECTORY | O_RDONLY);
    JASSERT(dirfd != -1) (dirname) (JASSERT_ERRNO);

    /* Set execute permission for user. */
    struct stat buf;
    JASSERT(::stat(uniqueFilename.c_str(), &buf) == 0);
    JASSERT(chmod(uniqueFilename.c_str(), buf.st_mode | S_IXUSR) == 0);
    // Create a symlink from
    //   dmtcp_restart_script.sh -> dmtcp_restart_script_<curCompId>.sh
    unlink(filename.c_str());
    JTRACE("linking \"dmtcp_restart_script.sh\" filename to uniqueFilename")
      (filename) (dirname) (uniqueFilename);
    // FIXME:  Handle error case of symlink()
    JWARNING(symlinkat(basename(uniqueFilename.c_str()), dirfd, filename.c_str()) == 0) (JASSERT_ERRNO);
    JASSERT(close(dirfd) == 0);
  }
  return uniqueFilename;
}
Beispiel #4
0
bool UniquePid::operator== ( const UniquePid& that ) const
{
  return _hostid==that.hostid()
         && _pid==that.pid()
         && _time==that.time();
}