Example #1
int getAprunArg(aprun_arg_t argt) {
  switch (argt) {
  case aprun_cc:
    return -1; // string arg/user provides this
  case aprun_n:
    return -1; // user provides this
  case aprun_d:
    return getCoresPerLocale();
  case aprun_N:
    return getLocalesPerNode();
  case aprun_j:
    return getCPUsPerCU();
  default:
    return -1;
  }
}
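For orientation, here is a minimal sketch (not part of the launcher source) of how a caller might stitch these values into aprun flags; the aprun_arg_t constants and getAprunArg are taken from the example above, while appendComputedAprunFlags itself is hypothetical.

#include <stdio.h>

// Hypothetical helper: format the flags the launcher can compute on its own
// (-d, -N, -j); -cc and -n are left to the user, matching the -1 returns in
// the switch above.
static int appendComputedAprunFlags(char* buf, size_t bufSize) {
  return snprintf(buf, bufSize, "-d%d -N%d -j%d",
                  getAprunArg(aprun_d),
                  getAprunArg(aprun_N),
                  getAprunArg(aprun_j));
}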
Example #2
// create the command that will actually launch the program and 
// create any files needed for the launch like the batch script 
static char* chpl_launch_create_command(int argc, char* argv[], 
                                        int32_t numLocales) {
  int i;
  int size;
  char baseCommand[256];
  char* command;
  FILE* slurmFile, *expectFile;
  char* account = getenv("CHPL_LAUNCHER_ACCOUNT");
  char* constraint = getenv("CHPL_LAUNCHER_CONSTRAINT");
  char* outputfn = getenv("CHPL_LAUNCHER_SLURM_OUTPUT_FILENAME");
  char* basenamePtr = strrchr(argv[0], '/');
  pid_t mypid;

  // command line walltime takes precedence over env var
  if (!walltime) {
    walltime = getenv("CHPL_LAUNCHER_WALLTIME");
  }

  // command line nodelist takes precedence over env var
  if (!nodelist) {
    nodelist = getenv("CHPL_LAUNCHER_NODELIST");
  }

  if (basenamePtr == NULL) {
      basenamePtr = argv[0];
  } else {
      basenamePtr++;
  }
  chpl_compute_real_binary_name(argv[0]);

  if (debug) {
    mypid = 0;
  } else {
    mypid = getpid();
  }

  // set the filenames
  sprintf(expectFilename, "%s%d", baseExpectFilename, (int)mypid);
  sprintf(slurmFilename, "%s%d", baseSBATCHFilename, (int)mypid);

  // if we're running a batch job
  if (getenv("CHPL_LAUNCHER_USE_SBATCH") != NULL || generate_sbatch_script) {
    // open the batch file and create the header 
    slurmFile = fopen(slurmFilename, "w");
    fprintf(slurmFile, "#!/bin/sh\n\n");
    
    // set the job name 
    fprintf(slurmFile, "#SBATCH --job-name=Chpl-%.10s\n", basenamePtr);
 
    // suppress informational messages, will still display errors 
    fprintf(slurmFile, "#SBATCH --quiet\n");
    
    // request one node per locale, one task per node, and the number of cores
    // as cpus-per-task. We probably don't need both --nodes and --ntasks
    // specified, since one task per node with --ntasks=n implies n nodes
    fprintf(slurmFile, "#SBATCH --nodes=%d\n", numLocales);
    fprintf(slurmFile, "#SBATCH --ntasks=%d\n", numLocales);
    fprintf(slurmFile, "#SBATCH --ntasks-per-node=%d\n", procsPerNode);
    fprintf(slurmFile, "#SBATCH --cpus-per-task=%d\n", getCoresPerLocale());
    
    // request exclusive access to nodes
    fprintf(slurmFile, "#SBATCH --exclusive\n");

    // Set the walltime if it was specified 
    if (walltime) { 
      fprintf(slurmFile, "#SBATCH --time=%s\n", walltime);
    }

    // Set the nodelist if it was specified
    if (nodelist) {
      fprintf(slurmFile, "#SBATCH --nodelist=%s\n", nodelist);
    }

    // If needed, a constraint can be specified with the env var CHPL_LAUNCHER_CONSTRAINT
    if (constraint) {
      fprintf(slurmFile, "#SBATCH --constraint=%s\n", constraint);
    }

    // set the account name if one was provided  
    if (account && strlen(account) > 0) {
      fprintf(slurmFile, "#SBATCH --account=%s\n", account);
    }
 
    // set the output file name to either the user-specified name
    // or to binaryName.<jobID>.out if none was specified
    if (outputfn!=NULL) {
      fprintf(slurmFile, "#SBATCH --output=%s\n", outputfn);
    }
    else {
      fprintf(slurmFile, "#SBATCH --output=%s.%%j.out\n", argv[0]);
    }

    // add the srun command 
    fprintf(slurmFile, "srun %s ", chpl_get_real_binary_name());
    
    // add any arguments passed to the launcher to the binary 
    for (i=1; i<argc; i++) {
      fprintf(slurmFile, " '%s'", argv[i]);
    }
    fprintf(slurmFile, "\n");

    // close the batch file and change permissions 
    fclose(slurmFile);
    chmod(slurmFilename, 0755);

    if (generate_sbatch_script) {
      fprintf(stdout, "SBATCH script written to '%s'\n", slurmFilename);
    }

    // the baseCommand is what will call the batch file
    // that was just created 
    sprintf(baseCommand, "sbatch %s\n", slurmFilename);
  }
  // else we're running an interactive job 
  else {
    // expect is used to launch an interactive job 
    // create the file and set some things for expect 
    expectFile = fopen(expectFilename, "w");
    fprintf(expectFile, "set timeout -1\n");
    fprintf(expectFile, "set prompt \"(%%|#|\\\\$|>) $\"\n");
    
    // spawn a silent srun command
    fprintf(expectFile, "spawn -noecho srun ");

    // set the job name 
    fprintf(expectFile, "--job-name=CHPL-%.10s ",basenamePtr);  
    
    // suppress informational messages, will still display errors 
    fprintf(expectFile, "--quiet ");

    // request one node per locale, one task per node, and the number of cores
    // as cpus-per-task. We probably don't need both --nodes and --ntasks
    // specified, since one task per node with --ntasks=n implies n nodes
    fprintf(expectFile, "--nodes=%d ",numLocales); 
    fprintf(expectFile, "--ntasks=%d ", numLocales); 
    fprintf(expectFile, "--ntasks-per-node=%d ", procsPerNode); 
    fprintf(expectFile, "--cpus-per-task=%d ", getCoresPerLocale()); 
    
    // request exclusive access
    fprintf(expectFile, "--exclusive ");
    
    // Set the walltime if it was specified
    if (walltime) {
      fprintf(expectFile, "--time=%s ", walltime); 
    }

    // Set the nodelist if it was specified
    if (nodelist) {
      fprintf(expectFile, "--nodelist=%s ", nodelist);
    }

    // set any constraints 
    if (constraint) {
      fprintf(expectFile, " --constraint=%s ", constraint);
    }
    
    // set the account name if one was provided  
    if (account && strlen(account) > 0) {
      fprintf(expectFile, "--account=%s ", account);
    }

    // the actual srun command 
    fprintf(expectFile, "%s", chpl_get_real_binary_name());
    
    // add any arguments passed to the launcher to the binary 
    for (i=1; i<argc; i++) {
      fprintf(expectFile, " %s", argv[i]);
    }
    fprintf(expectFile, "\n\n");
   
    // add the interact command expect needs, then close the file
    fprintf(expectFile, "interact -o -re $prompt {return}\n");
    fclose(expectFile);
    
    // the baseCommand is what will call the expect file 
    sprintf(baseCommand, "expect %s", expectFilename);
  }

  // copy baseCommand into command and return it 
  size = strlen(baseCommand) + 1;
  command = chpl_mem_allocMany(size, sizeof(char), CHPL_RT_MD_COMMAND_BUFFER, -1, "");
  sprintf(command, "%s", baseCommand);
  if (strlen(command)+1 > size) {
    chpl_internal_error("buffer overflow");
  }
  
  return command;
}
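To make the batch branch above concrete, here is roughly what the generated batch file would contain for hypothetical inputs: numLocales=2, procsPerNode=1, getCoresPerLocale() returning 24, no walltime, nodelist, constraint, or account set, the binary launched as ./hello --n=100, and chpl_get_real_binary_name() yielding ./hello_real. All concrete values here are illustrative, not taken from a real run.

#!/bin/sh

#SBATCH --job-name=Chpl-hello
#SBATCH --quiet
#SBATCH --nodes=2
#SBATCH --ntasks=2
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=24
#SBATCH --exclusive
#SBATCH --output=./hello.%j.out
srun ./hello_real '--n=100'

The returned command is then just "sbatch <slurmFilename>"; the interactive branch instead writes an expect script whose spawned srun line carries the same options.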
Example #3
// create the command that will actually launch the program and
// create any files needed for the launch like the batch script
static char* chpl_launch_create_command(int argc, char* argv[],
                                        int32_t numLocales) {
    int i;
    int size;
    char baseCommand[MAX_COM_LEN];
    char* command;
    FILE* slurmFile;
    char* account = getenv("CHPL_LAUNCHER_ACCOUNT");
    char* constraint = getenv("CHPL_LAUNCHER_CONSTRAINT");
    char* outputfn = getenv("CHPL_LAUNCHER_SLURM_OUTPUT_FILENAME");
    char* basenamePtr = strrchr(argv[0], '/');
    pid_t mypid;

    // For programs with large amounts of output, a lot of time can be
    // spent syncing the stdout buffer to the output file. This can cause
    // tests to run extremely slow and can cause stdout and stderr to
    // become mixed in odd ways since stdout is buffered but stderr isn't.
    // To alleviate this problem (and to allow accurate external timings
    // of tests) this allows the output to be "buffered" to <tmpDir> and
    // copied once the job is done.
    //
    // Note that this should work even for multi-locale tests since all
    // the output is piped through a single node.
    //
    // The *NoFmt versions are the same as the regular version, except
    // that instead of using slurms output formatters, they use the
    // corresponding env var. e.g. you have to use '--output=%j.out' to
    // have the output file be <jobid>.out, but when we copy the tmp file
    // to the real output file, the %j and other formatters aren't
    // available so we have to use the equivalent slurm env var
    // (SLURM_JOB_ID). The env vars can't be used when specifying --output
    // because they haven't been initialized yet.
    char* bufferStdout    = getenv("CHPL_LAUNCHER_SLURM_BUFFER_STDOUT");
    const char* tmpDir    = getTmpDir();
    char stdoutFile         [MAX_COM_LEN];
    char stdoutFileNoFmt    [MAX_COM_LEN];
    char tmpStdoutFile      [MAX_COM_LEN];
    char tmpStdoutFileNoFmt [MAX_COM_LEN];

    // command line walltime takes precedence over env var
    if (!walltime) {
        walltime = getenv("CHPL_LAUNCHER_WALLTIME");
    }

    // command line nodelist takes precedence over env var
    if (!nodelist) {
        nodelist = getenv("CHPL_LAUNCHER_NODELIST");
    }

    // command line partition takes precedence over env var
    if (!partition) {
        partition = getenv("CHPL_LAUNCHER_PARTITION");
    }

    // command line exclude takes precedence over env var
    if (!exclude) {
        exclude = getenv("CHPL_LAUNCHER_EXCLUDE");
    }

    if (basenamePtr == NULL) {
        basenamePtr = argv[0];
    } else {
        basenamePtr++;
    }

    chpl_compute_real_binary_name(argv[0]);

    if (debug) {
        mypid = 0;
    } else {
        mypid = getpid();
    }

    // Elliot, 12/02/14: TODO we have a bunch of similar commands to build up the
    // interactive and batch versions. It would be nicer to build up the commands
    // and postprocess depending on interactive vs batch. As in build up "--quiet
    // --nodes ..." and afterwards split on ' ' and then add #SBATCH and a
    // newline for batch mode and leave it as is for interactive.

    // if we're running a batch job
    if (getenv("CHPL_LAUNCHER_USE_SBATCH") != NULL || generate_sbatch_script) {
        // set the sbatch filename
        sprintf(slurmFilename, "%s%d", baseSBATCHFilename, (int)mypid);

        // open the batch file and create the header
        slurmFile = fopen(slurmFilename, "w");
        fprintf(slurmFile, "#!/bin/sh\n\n");

        // set the job name
        fprintf(slurmFile, "#SBATCH --job-name=Chpl-%.10s\n", basenamePtr);

        // suppress informational messages, will still display errors
        fprintf(slurmFile, "#SBATCH --quiet\n");

        // request one node per locale, one task per node, and the number of cores
        // as cpus-per-task. We probably don't need both --nodes and --ntasks
        // specified, since one task per node with --ntasks=n implies n nodes
        fprintf(slurmFile, "#SBATCH --nodes=%d\n", numLocales);
        fprintf(slurmFile, "#SBATCH --ntasks=%d\n", numLocales);
        fprintf(slurmFile, "#SBATCH --ntasks-per-node=%d\n", procsPerNode);
        fprintf(slurmFile, "#SBATCH --cpus-per-task=%d\n", getCoresPerLocale());

        // request exclusive access to nodes
        fprintf(slurmFile, "#SBATCH --exclusive\n");

        // Set the walltime if it was specified
        if (walltime) {
            fprintf(slurmFile, "#SBATCH --time=%s\n", walltime);
        }

        // Set the nodelist if it was specified
        if (nodelist) {
            fprintf(slurmFile, "#SBATCH --nodelist=%s\n", nodelist);
        }

        // Set the partition if it was specified
        if (partition) {
            fprintf(slurmFile, "#SBATCH --partition=%s\n", partition);
        }

        // Set the exclude list if it was specified
        if (exclude) {
            fprintf(slurmFile, "#SBATCH --exclude=%s\n", exclude);
        }

        // If needed, a constraint can be specified with the env var CHPL_LAUNCHER_CONSTRAINT
        if (constraint) {
            fprintf(slurmFile, "#SBATCH --constraint=%s\n", constraint);
        }

        // set the account name if one was provided
        if (account && strlen(account) > 0) {
            fprintf(slurmFile, "#SBATCH --account=%s\n", account);
        }

        // set the output file name to either the user-specified name
        // or to binaryName.<jobID>.out if none was specified
        if (outputfn != NULL) {
            sprintf(stdoutFile,      "%s", outputfn);
            sprintf(stdoutFileNoFmt, "%s", outputfn);
        }
        else {
            sprintf(stdoutFile,      "%s.%s.out", argv[0], "%j");
            sprintf(stdoutFileNoFmt, "%s.%s.out", argv[0], "$SLURM_JOB_ID");
        }
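
        // For example (hypothetical values): with no user-specified filename
        // and argv[0] == "./hello", the pair above comes out as
        //   stdoutFile      == "./hello.%j.out"            (slurm expands %j)
        //   stdoutFileNoFmt == "./hello.$SLURM_JOB_ID.out"  (the shell expands
        //                                                    this after the run)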

        // We have slurm use the real output file to capture slurm errors/timeouts
        // We only redirect the program output to the tmp file
        fprintf(slurmFile, "#SBATCH --output=%s\n", stdoutFile);

        // If we're buffering the output, set the temp output file name.
        // It's always <tmpDir>/binaryName.<jobID>.out.
        if (bufferStdout != NULL) {
            sprintf(tmpStdoutFile,      "%s/%s.%s.out", tmpDir, argv[0], "%j");
            sprintf(tmpStdoutFileNoFmt, "%s/%s.%s.out", tmpDir, argv[0], "$SLURM_JOB_ID");
        }

        // add the srun command and the (possibly wrapped) binary name.
        fprintf(slurmFile, "srun --kill-on-bad-exit %s %s ",
                chpl_get_real_binary_wrapper(), chpl_get_real_binary_name());

        // add any arguments passed to the launcher to the binary
        for (i=1; i<argc; i++) {
            fprintf(slurmFile, "'%s' ", argv[i]);
        }

        // buffer program output to the tmp stdout file
        if (bufferStdout != NULL) {
            fprintf(slurmFile, "&> %s", tmpStdoutFileNoFmt);
        }
        fprintf(slurmFile, "\n");

        // After the job is run, if we buffered stdout to <tmpDir>, we need
        // to copy the output to the actual output file. The <tmpDir> output
        // will only exist on one node, ignore failures on the other nodes
        if (bufferStdout != NULL) {
            fprintf(slurmFile, "cat %s >> %s\n", tmpStdoutFileNoFmt, stdoutFileNoFmt);
            fprintf(slurmFile, "rm  %s &> /dev/null\n", tmpStdoutFileNoFmt);
        }

        // close the batch file and change permissions
        fclose(slurmFile);
        chmod(slurmFilename, 0755);

        if (generate_sbatch_script) {
            fprintf(stdout, "SBATCH script written to '%s'\n", slurmFilename);
        }

        // the baseCommand is what will call the batch file
        // that was just created
        sprintf(baseCommand, "sbatch %s\n", slurmFilename);
    }
    // else we're running an interactive job
    else {
        char iCom[1024];
        int len;

        len = 0;

        // set the job name
        len += sprintf(iCom+len, "--job-name=CHPL-%.10s ",basenamePtr);

        // suppress informational messages, will still display errors
        len += sprintf(iCom+len, "--quiet ");

        // request one node per locale, one task per node, and the number of cores
        // as cpus-per-task. We probably don't need both --nodes and --ntasks
        // specified, since one task per node with --ntasks=n implies n nodes
        len += sprintf(iCom+len, "--nodes=%d ",numLocales);
        len += sprintf(iCom+len, "--ntasks=%d ", numLocales);
        len += sprintf(iCom+len, "--ntasks-per-node=%d ", procsPerNode);
        len += sprintf(iCom+len, "--cpus-per-task=%d ", getCoresPerLocale());

        // request exclusive access
        len += sprintf(iCom+len, "--exclusive ");

        // kill the job if any program instance halts with non-zero exit status
        len += sprintf(iCom+len, "--kill-on-bad-exit ");

        // Set the walltime if it was specified
        if (walltime) {
            len += sprintf(iCom+len, "--time=%s ",walltime);
        }

        // Set the nodelist if it was specified
        if (nodelist) {
            len += sprintf(iCom+len, "--nodelist=%s ", nodelist);
        }

        // Set the partition if it was specified
        if (partition) {
            len += sprintf(iCom+len, "--partition=%s ", partition);
        }

        // Set the exclude list if it was specified
        if (exclude) {
            len += sprintf(iCom+len, "--exclude=%s ", exclude);
        }

        // set any constraints
        if (constraint) {
            len += sprintf(iCom+len, " --constraint=%s ", constraint);
        }

        // set the account name if one was provided
        if (account && strlen(account) > 0) {
            len += sprintf(iCom+len, "--account=%s ", account);
        }

        // add the (possibly wrapped) binary name
        len += sprintf(iCom+len, "%s %s ",
                       chpl_get_real_binary_wrapper(), chpl_get_real_binary_name());

        // add any arguments passed to the launcher to the binary
        for (i=1; i<argc; i++) {
            len += sprintf(iCom+len, "%s ", argv[i]);
        }

        // launch the job using srun
        sprintf(baseCommand, "srun %s ", iCom);
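
        // For example (hypothetical values): with numLocales=2, procsPerNode=1,
        // getCoresPerLocale() returning 24, none of the optional settings
        // present, an empty chpl_get_real_binary_wrapper(), and a real binary
        // name of "./hello_real", baseCommand comes out roughly as
        //   srun --job-name=CHPL-hello --quiet --nodes=2 --ntasks=2
        //        --ntasks-per-node=1 --cpus-per-task=24 --exclusive
        //        --kill-on-bad-exit ./hello_real <launcher args>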
    }

    // copy baseCommand into command and return it
    size = strlen(baseCommand) + 1;
    command = chpl_mem_allocMany(size, sizeof(char), CHPL_RT_MD_COMMAND_BUFFER, -1, 0);
    sprintf(command, "%s", baseCommand);
    if (strlen(command)+1 > size) {
        chpl_internal_error("buffer overflow");
    }

    return command;
}
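A closing note on the copy into command in both launcher examples: the overflow check runs only after sprintf has already written into the freshly allocated buffer, so it reports the problem after the fact (and, since size was just computed from strlen(baseCommand), it cannot actually fire). Below is a hedged sketch of a variant that lets snprintf flag truncation at write time; it assumes the chpl_mem_allocMany, chpl_internal_error, and CHPL_RT_MD_COMMAND_BUFFER declarations from the Chapel runtime headers are in scope, and it is not taken from the Chapel sources.

#include <stdio.h>
#include <string.h>

// Hypothetical variant of the tail of chpl_launch_create_command: allocate
// exactly what baseCommand needs and let snprintf report truncation up front.
static char* copyCommand(const char* baseCommand) {
  size_t size = strlen(baseCommand) + 1;
  char* command = chpl_mem_allocMany(size, sizeof(char),
                                     CHPL_RT_MD_COMMAND_BUFFER, -1, 0);
  if ((size_t)snprintf(command, size, "%s", baseCommand) >= size) {
    chpl_internal_error("buffer overflow");  // truncation caught before the command is used
  }
  return command;
}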