// Return the integer value associated with an aprun argument kind.
// Yields -1 for kinds whose value the user supplies (aprun_cc, aprun_n)
// and for any kind not listed below.
int getAprunArg(aprun_arg_t argt) {
  int value = -1;

  switch (argt) {
    case aprun_d:
      value = getCoresPerLocale();
      break;
    case aprun_N:
      value = getLocalesPerNode();
      break;
    case aprun_j:
      value = getCPUsPerCU();
      break;
    case aprun_cc:  // string arg/user provides this
    case aprun_n:   // user provides this
    default:
      break;
  }

  return value;
}
// create the command that will actually launch the program and // create any files needed for the launch like the batch script static char* chpl_launch_create_command(int argc, char* argv[], int32_t numLocales) { int i; int size; char baseCommand[256]; char* command; FILE* slurmFile, *expectFile; char* account = getenv("CHPL_LAUNCHER_ACCOUNT"); char* constraint = getenv("CHPL_LAUNCHER_CONSTRAINT"); char* outputfn = getenv("CHPL_LAUNCHER_SLURM_OUTPUT_FILENAME"); char* basenamePtr = strrchr(argv[0], '/'); pid_t mypid; // command line walltime takes precedence over env var if (!walltime) { walltime = getenv("CHPL_LAUNCHER_WALLTIME"); } // command line nodelist takes precedence over env var if (!nodelist) { nodelist = getenv("CHPL_LAUNCHER_NODELIST"); } if (basenamePtr == NULL) { basenamePtr = argv[0]; } else { basenamePtr++; } chpl_compute_real_binary_name(argv[0]); if (debug) { mypid = 0; } else { mypid = getpid(); } // set the filenames sprintf(expectFilename, "%s%d", baseExpectFilename, (int)mypid); sprintf(slurmFilename, "%s%d", baseSBATCHFilename, (int)mypid); // if were running a batch job if (getenv("CHPL_LAUNCHER_USE_SBATCH") != NULL || generate_sbatch_script) { // open the batch file and create the header slurmFile = fopen(slurmFilename, "w"); fprintf(slurmFile, "#!/bin/sh\n\n"); // set the job name fprintf(slurmFile, "#SBATCH --job-name=Chpl-%.10s\n", basenamePtr); // suppress informational messages, will still display errors fprintf(slurmFile, "#SBATCH --quiet\n"); // request the number of locales, with 1 task per node, and number of cores // cpus-per-task. 
We probably don't need --nodes and --ntasks specified // since 1 task-per-node with n --tasks implies -n nodes fprintf(slurmFile, "#SBATCH --nodes=%d\n", numLocales); fprintf(slurmFile, "#SBATCH --ntasks=%d\n", numLocales); fprintf(slurmFile, "#SBATCH --ntasks-per-node=%d\n", procsPerNode); fprintf(slurmFile, "#SBATCH --cpus-per-task=%d\n", getCoresPerLocale()); //request exclusive access to nodes fprintf(slurmFile, "#SBATCH --exclusive\n"); // Set the walltime if it was specified if (walltime) { fprintf(slurmFile, "#SBATCH --time=%s\n", walltime); } // Set the nodelist if it was specified if (nodelist) { fprintf(slurmFile, "#SBATCH --nodelist=%s\n", nodelist); } // If needed a constraint can be specified with the env var CHPL_LAUNCHER_CONSTRAINT if (constraint) { fprintf(slurmFile, "#SBATCH --constraint=%s\n", constraint); } // set the account name if one was provided if (account && strlen(account) > 0) { fprintf(slurmFile, "#SBATCH --account=%s\n", account); } // set the output name to either the user specified // or to the binaryName.<jobID>.out if none specified if (outputfn!=NULL) { fprintf(slurmFile, "#SBATCH --output=%s\n", outputfn); } else { fprintf(slurmFile, "#SBATCH --output=%s.%%j.out\n", argv[0]); } // add the srun command fprintf(slurmFile, "srun %s ", chpl_get_real_binary_name()); // add any arguments passed to the launcher to the binary for (i=1; i<argc; i++) { fprintf(slurmFile, " '%s'", argv[i]); } fprintf(slurmFile, "\n"); // close the batch file and change permissions fclose(slurmFile); chmod(slurmFilename, 0755); if (generate_sbatch_script) { fprintf(stdout, "SBATCH script written to '%s'\n", slurmFilename); } // the baseCommand is what will call the batch file // that was just created sprintf(baseCommand, "sbatch %s\n", slurmFilename); } // else we're running an interactive job else { // expect is used to launch an interactive job // create the file and set some things for expect expectFile = fopen(expectFilename, "w"); fprintf(expectFile, 
"set timeout -1\n"); fprintf(expectFile, "set prompt \"(%%|#|\\\\$|>) $\"\n"); // create a silent salloc command fprintf(expectFile, "spawn -noecho srun "); // set the job name fprintf(expectFile, "--job-name=CHPL-%.10s ",basenamePtr); // suppress informational messages, will still display errors fprintf(expectFile, "--quiet "); // request the number of locales, with 1 task per node, and number of cores // cpus-per-task. We probably don't need --nodes and --ntasks specified // since 1 task-per-node with n --tasks implies -n nodes fprintf(expectFile, "--nodes=%d ",numLocales); fprintf(expectFile, "--ntasks=%d ", numLocales); fprintf(expectFile, "--ntasks-per-node=%d ", procsPerNode); fprintf(expectFile, "--cpus-per-task=%d ", getCoresPerLocale()); // request exclusive access fprintf(expectFile, "--exclusive "); // Set the walltime if i was specified if (walltime) { fprintf(expectFile, "--time=%s ", walltime); } // Set the walltime if it was specified if (nodelist) { fprintf(expectFile, "--nodelist=%s ", nodelist); } // set any constraints if (constraint) { fprintf(expectFile, " --constraint=%s ", constraint); } // set the account name if one was provided if (account && strlen(account) > 0) { fprintf(expectFile, "--account=%s ", account); } // the actual srun command fprintf(expectFile, "%s", chpl_get_real_binary_name()); // add any arguments passed to the launcher to the binary for (i=1; i<argc; i++) { fprintf(expectFile, " %s", argv[i]); } fprintf(expectFile, "\n\n"); // do some things required for expect and close the file fprintf(expectFile, "interact -o -re $prompt {return}\n"); fclose(expectFile); // the baseCommand is what will call the expect file sprintf(baseCommand, "expect %s", expectFilename); } // copy baseCommand into command and return it size = strlen(baseCommand) + 1; command = chpl_mem_allocMany(size, sizeof(char), CHPL_RT_MD_COMMAND_BUFFER, -1, ""); sprintf(command, "%s", baseCommand); if (strlen(command)+1 > size) { chpl_internal_error("buffer 
overflow"); } return command; }
// create the command that will actually launch the program and // create any files needed for the launch like the batch script static char* chpl_launch_create_command(int argc, char* argv[], int32_t numLocales) { int i; int size; char baseCommand[MAX_COM_LEN]; char* command; FILE* slurmFile; char* account = getenv("CHPL_LAUNCHER_ACCOUNT"); char* constraint = getenv("CHPL_LAUNCHER_CONSTRAINT"); char* outputfn = getenv("CHPL_LAUNCHER_SLURM_OUTPUT_FILENAME"); char* basenamePtr = strrchr(argv[0], '/'); pid_t mypid; // For programs with large amounts of output, a lot of time can be // spent syncing the stdout buffer to the output file. This can cause // tests to run extremely slow and can cause stdout and stderr to // become mixed in odd ways since stdout is buffered but stderr isn't. // To alleviate this problem (and to allow accurate external timings // of tests) this allows the output to be "buffered" to <tmpDir> and // copied once the job is done. // // Note that this should work even for multi-locale tests since all // the output is piped through a single node. // // The *NoFmt versions are the same as the regular version, except // that instead of using slurms output formatters, they use the // corresponding env var. e.g. you have to use '--output=%j.out to // have the output file be <jobid>.out, but when we copy the tmp file // to the real output file, the %j and other formatters aren't // available so we have to use the equivalent slurm env var // (SLURM_JOB_ID.) 
The env vars can't be used when specifying --output // because they haven't been initialized yet char* bufferStdout = getenv("CHPL_LAUNCHER_SLURM_BUFFER_STDOUT"); const char* tmpDir = getTmpDir(); char stdoutFile [MAX_COM_LEN]; char stdoutFileNoFmt [MAX_COM_LEN]; char tmpStdoutFile [MAX_COM_LEN]; char tmpStdoutFileNoFmt [MAX_COM_LEN]; // command line walltime takes precedence over env var if (!walltime) { walltime = getenv("CHPL_LAUNCHER_WALLTIME"); } // command line nodelist takes precedence over env var if (!nodelist) { nodelist = getenv("CHPL_LAUNCHER_NODELIST"); } // command line partition takes precedence over env var if (!partition) { partition = getenv("CHPL_LAUNCHER_PARTITION"); } // command line exclude takes precedence over env var if (!exclude) { exclude = getenv("CHPL_LAUNCHER_EXCLUDE"); } if (basenamePtr == NULL) { basenamePtr = argv[0]; } else { basenamePtr++; } chpl_compute_real_binary_name(argv[0]); if (debug) { mypid = 0; } else { mypid = getpid(); } // Elliot, 12/02/14: TODO we have a bunch of similar commands to build up the // interactive and batch versions. It would be nicer to build up the commands // and postprocess depending on interactive vs batch. As in build up "--quiet // --nodes ..." and afterwards split on ' ' and then add #SBATCH and a // newline for batch mode and leave it as is for interactive" // if were running a batch job if (getenv("CHPL_LAUNCHER_USE_SBATCH") != NULL || generate_sbatch_script) { // set the sbatch filename sprintf(slurmFilename, "%s%d", baseSBATCHFilename, (int)mypid); // open the batch file and create the header slurmFile = fopen(slurmFilename, "w"); fprintf(slurmFile, "#!/bin/sh\n\n"); // set the job name fprintf(slurmFile, "#SBATCH --job-name=Chpl-%.10s\n", basenamePtr); // suppress informational messages, will still display errors fprintf(slurmFile, "#SBATCH --quiet\n"); // request the number of locales, with 1 task per node, and number of cores // cpus-per-task. 
We probably don't need --nodes and --ntasks specified // since 1 task-per-node with n --tasks implies -n nodes fprintf(slurmFile, "#SBATCH --nodes=%d\n", numLocales); fprintf(slurmFile, "#SBATCH --ntasks=%d\n", numLocales); fprintf(slurmFile, "#SBATCH --ntasks-per-node=%d\n", procsPerNode); fprintf(slurmFile, "#SBATCH --cpus-per-task=%d\n", getCoresPerLocale()); //request exclusive access to nodes fprintf(slurmFile, "#SBATCH --exclusive\n"); // Set the walltime if it was specified if (walltime) { fprintf(slurmFile, "#SBATCH --time=%s\n", walltime); } // Set the nodelist if it was specified if (nodelist) { fprintf(slurmFile, "#SBATCH --nodelist=%s\n", nodelist); } // Set the partition if it was specified if (partition) { fprintf(slurmFile, "#SBATCH --partition=%s\n", partition); } // Set the exclude list if it was specified if (exclude) { fprintf(slurmFile, "#SBATCH --exclude=%s\n", exclude); } // If needed a constraint can be specified with the env var CHPL_LAUNCHER_CONSTRAINT if (constraint) { fprintf(slurmFile, "#SBATCH --constraint=%s\n", constraint); } // set the account name if one was provided if (account && strlen(account) > 0) { fprintf(slurmFile, "#SBATCH --account=%s\n", account); } // set the output file name to either the user specified // name or to the binaryName.<jobID>.out if none specified if (outputfn != NULL) { sprintf(stdoutFile, "%s", outputfn); sprintf(stdoutFileNoFmt, "%s", outputfn); } else { sprintf(stdoutFile, "%s.%s.out", argv[0], "%j"); sprintf(stdoutFileNoFmt, "%s.%s.out", argv[0], "$SLURM_JOB_ID"); } // We have slurm use the real output file to capture slurm errors/timeouts // We only redirect the program output to the tmp file fprintf(slurmFile, "#SBATCH --output=%s\n", stdoutFile); // If we're buffering the output, set the temp output file name. // It's always <tmpDir>/binaryName.<jobID>.out. 
if (bufferStdout != NULL) { sprintf(tmpStdoutFile, "%s/%s.%s.out", tmpDir, argv[0], "%j"); sprintf(tmpStdoutFileNoFmt, "%s/%s.%s.out", tmpDir, argv[0], "$SLURM_JOB_ID"); } // add the srun command and the (possibly wrapped) binary name. fprintf(slurmFile, "srun --kill-on-bad-exit %s %s ", chpl_get_real_binary_wrapper(), chpl_get_real_binary_name()); // add any arguments passed to the launcher to the binary for (i=1; i<argc; i++) { fprintf(slurmFile, "'%s' ", argv[i]); } // buffer program output to the tmp stdout file if (bufferStdout != NULL) { fprintf(slurmFile, "&> %s", tmpStdoutFileNoFmt); } fprintf(slurmFile, "\n"); // After the job is run, if we buffered stdout to <tmpDir>, we need // to copy the output to the actual output file. The <tmpDir> output // will only exist on one node, ignore failures on the other nodes if (bufferStdout != NULL) { fprintf(slurmFile, "cat %s >> %s\n", tmpStdoutFileNoFmt, stdoutFileNoFmt); fprintf(slurmFile, "rm %s &> /dev/null\n", tmpStdoutFileNoFmt); } // close the batch file and change permissions fclose(slurmFile); chmod(slurmFilename, 0755); if (generate_sbatch_script) { fprintf(stdout, "SBATCH script written to '%s'\n", slurmFilename); } // the baseCommand is what will call the batch file // that was just created sprintf(baseCommand, "sbatch %s\n", slurmFilename); } // else we're running an interactive job else { char iCom[1024]; int len; len = 0; // set the job name len += sprintf(iCom+len, "--job-name=CHPL-%.10s ",basenamePtr); // suppress informational messages, will still display errors len += sprintf(iCom+len, "--quiet "); // request the number of locales, with 1 task per node, and number of cores // cpus-per-task. 
We probably don't need --nodes and --ntasks specified // since 1 task-per-node with n --tasks implies -n nodes len += sprintf(iCom+len, "--nodes=%d ",numLocales); len += sprintf(iCom+len, "--ntasks=%d ", numLocales); len += sprintf(iCom+len, "--ntasks-per-node=%d ", procsPerNode); len += sprintf(iCom+len, "--cpus-per-task=%d ", getCoresPerLocale()); // request exclusive access len += sprintf(iCom+len, "--exclusive "); // kill the job if any program instance halts with non-zero exit status len += sprintf(iCom+len, "--kill-on-bad-exit "); // Set the walltime if it was specified if (walltime) { len += sprintf(iCom+len, "--time=%s ",walltime); } // Set the nodelist if it was specified if (nodelist) { len += sprintf(iCom+len, "--nodelist=%s ", nodelist); } // Set the partition if it was specified if (partition) { len += sprintf(iCom+len, "--partition=%s ", partition); } // Set the exclude list if it was specified if (exclude) { len += sprintf(iCom+len, "--exclude=%s ", exclude); } // set any constraints if (constraint) { len += sprintf(iCom+len, " --constraint=%s ", constraint); } // set the account name if one was provided if (account && strlen(account) > 0) { len += sprintf(iCom+len, "--account=%s ", account); } // add the (possibly wrapped) binary name len += sprintf(iCom+len, "%s %s ", chpl_get_real_binary_wrapper(), chpl_get_real_binary_name()); // add any arguments passed to the launcher to the binary for (i=1; i<argc; i++) { len += sprintf(iCom+len, "%s ", argv[i]); } // launch the job using srun sprintf(baseCommand, "srun %s ", iCom); } // copy baseCommand into command and return it size = strlen(baseCommand) + 1; command = chpl_mem_allocMany(size, sizeof(char), CHPL_RT_MD_COMMAND_BUFFER, -1, 0); sprintf(command, "%s", baseCommand); if (strlen(command)+1 > size) { chpl_internal_error("buffer overflow"); } return command; }