// Do sanity checks just before launching.
//
// Currently the only check is that the "_real" binary reported by
// chpl_get_real_binary_name() can be stat()ed; if it cannot, the problem
// is reported through chpl_error().
//
// argv0 is accepted for interface compatibility and is not used here.
static void chpl_launch_sanity_checks(const char* argv0) {
  struct stat statBuf;

  // Make sure the _real binary exists
  // (this should be called after someone has called
  // chpl_compute_real_binary_name() )
  if (stat(chpl_get_real_binary_name(), &statBuf) != 0) {
    char errorMsg[256];
    // snprintf() instead of sprintf(): the binary's path can be longer
    // than the message buffer, which used to overflow the stack.
    int wanted_to_write = snprintf(errorMsg, sizeof(errorMsg),
                                   "unable to locate file: %s",
                                   chpl_get_real_binary_name());
    if (wanted_to_write < 0) {
      // Formatting itself failed; fall back to a fixed message.
      snprintf(errorMsg, sizeof(errorMsg), "%s",
               "character encoding error in name of executable to be launched");
    } else if ((size_t)wanted_to_write >= sizeof(errorMsg)) {
      // The message was truncated; mark that with a trailing ellipsis
      // (3 dots + NUL fit exactly in the last 4 bytes).
      snprintf(&errorMsg[sizeof(errorMsg) - 4], 4, "...");
    }
    chpl_error(errorMsg, 0, 0);
  }
}
// Build the command line for the dummy launcher: the real binary name
// followed by every launcher argument wrapped in single quotes.
//
// argc/argv  - the launcher's own command line; argv[0] is handed to
//              chpl_compute_real_binary_name().
// numLocales - must be 1; anything else is reported via chpl_error().
//
// Returns a buffer obtained from chpl_mem_allocMany(); this function
// does not free it.
static char* chpl_launch_create_command(int argc, char* argv[],
                                        int32_t numLocales) {
  int i;
  size_t size;
  const char* realName;
  char* command;
  char* cursor;

  if (numLocales != 1) {
    chpl_error("dummy launcher only supports numLocales==1", 0, "<command-line>");
  }

  chpl_compute_real_binary_name(argv[0]);
  realName = chpl_get_real_binary_name();

  // Size the buffer from the real name itself rather than staging it in
  // a fixed 256-byte array, which overflowed for long paths.
  size = strlen(realName) + 1;
  for (i=1; i<argc; i++) {
    size += strlen(argv[i]) + 3;   // space + two quotes per argument
  }

  command = chpl_mem_allocMany(size, sizeof(char), CHPL_RT_MD_COMMAND_BUFFER, -1, "");

  cursor = command + sprintf(command, "%s", realName);
  for (i=1; i<argc; i++) {
    cursor += sprintf(cursor, " '%s'", argv[i]);
  }

  // Internal consistency check on the size computation above.
  if (strlen(command)+1 > size) {
    chpl_internal_error("buffer overflow");
  }

  return command;
}
// Build the mpirun command line:
//   MPIRUN_PATH/mpirun -np <numLocales> MPIRUN_XTRA_OPTS <real binary> 'arg'...
//
// argc/argv  - the launcher's own command line; argv[0] is handed to
//              chpl_compute_real_binary_name(), argv[1..] are appended,
//              each wrapped in single quotes.
// numLocales - value passed to mpirun's -np option.
//
// Returns a buffer obtained from chpl_mem_allocMany(); this function
// does not free it.
static char* chpl_launch_create_command(int argc, char* argv[],
                                        int32_t numLocales) {
  int i;
  int baseLen;
  size_t size;
  const char* realName;
  char* command;
  char* cursor;

  chpl_compute_real_binary_name(argv[0]);
  realName = chpl_get_real_binary_name();

  // Measure the fixed part of the command instead of formatting it into
  // a 256-byte stack buffer, which overflowed for long binary paths or
  // long MPIRUN_XTRA_OPTS.
  baseLen = snprintf(NULL, 0, "%s/mpirun -np %d %s %s",
                     MPIRUN_PATH, numLocales, MPIRUN_XTRA_OPTS, realName);
  if (baseLen < 0) {
    chpl_internal_error("unable to format mpirun command");
    return NULL;  // defensive; chpl_internal_error() is not expected to return
  }

  size = (size_t)baseLen + 1;
  for (i=1; i<argc; i++) {
    size += strlen(argv[i]) + 3;   // space + two quotes per argument
  }

  command = chpl_mem_allocMany(size, sizeof(char), CHPL_RT_MD_COMMAND_BUFFER, -1, "");

  cursor = command + sprintf(command, "%s/mpirun -np %d %s %s",
                             MPIRUN_PATH, numLocales, MPIRUN_XTRA_OPTS,
                             realName);
  for (i=1; i<argc; i++) {
    cursor += sprintf(cursor, " '%s'", argv[i]);
  }

  // Internal consistency check on the size computation above.
  if (strlen(command)+1 > size) {
    chpl_internal_error("buffer overflow");
  }

  return command;
}
static char* chpl_launch_create_command(int argc, char* argv[], int32_t numLocales) { int i; int size; char baseCommand[256]; char* command; FILE* llFile, *expectFile; char* projectString = getenv(launcherAccountEnvvar); char* basenamePtr = strrchr(argv[0], '/'); pid_t mypid; if (basenamePtr == NULL) { basenamePtr = argv[0]; } else { basenamePtr++; } chpl_compute_real_binary_name(argv[0]); #ifndef DEBUG_LAUNCH mypid = getpid(); #else mypid = 0; #endif sprintf(expectFilename, "%s%d", baseExpectFilename, (int)mypid); sprintf(llFilename, "%s%d", baseLLFilename, (int)mypid); llFile = fopen(llFilename, "w"); fprintf(llFile, "# @ wall_clock_limit = 00:10:00\n"); fprintf(llFile, "# @ job_type = parallel\n"); fprintf(llFile, "# @ node = %d\n", numLocales); fprintf(llFile, "# @ tasks_per_node = 1\n"); if (projectString && strlen(projectString) > 0) fprintf(llFile, "# @ class = %s\n", projectString); fprintf(llFile, "# @ output = out.$(jobid)\n"); fprintf(llFile, "# @ error = err.$(jobid)\n"); fprintf(llFile, "# @ queue\n"); fprintf(llFile, "\n"); fprintf(llFile, "%s", chpl_get_real_binary_name()); for (i=1; i<argc; i++) { fprintf(llFile, " '%s'", argv[i]); } fprintf(llFile, "\n"); fclose(llFile); sprintf(baseCommand, "llsubmit %s", llFilename); size = strlen(baseCommand) + 1; command = chpl_mem_allocMany(size, sizeof(char), CHPL_RT_MD_COMMAND_BUFFER, -1, ""); sprintf(command, "%s", baseCommand); if (strlen(command)+1 > size) { chpl_internal_error("buffer overflow"); } return command; }
// Pre-launch sanity check: the "_real" binary must be stat()-able.
//
// Expected to run after chpl_compute_real_binary_name() has been called.
// On failure an "unable to locate file" message is built in a 256-byte
// buffer and passed to chpl_error(); a message that does not fit ends in
// "...", and a formatting failure is replaced by a fixed fallback text.
//
// argv0 is not used.
static void chpl_launch_sanity_checks(const char* argv0) {
  struct stat realBinaryInfo;
  char msg[256];
  int needed;

  // Nothing to report when the binary is there.
  if (stat(chpl_get_real_binary_name(), &realBinaryInfo) == 0) {
    return;
  }

  needed = snprintf(msg, sizeof(msg), "unable to locate file: %s",
                    chpl_get_real_binary_name());

  if (needed < 0) {
    // snprintf() reported an error; use the canned message instead.
    strcpy(msg,
           "character encoding error in name of executable to be launched");
  } else if ((size_t)needed >= sizeof(msg)) {
    // Truncated: overwrite the tail (3 dots + NUL) to show it.
    strcpy(msg + sizeof(msg) - 4, "...");
  }

  chpl_error(msg, 0, 0);
}
// create the command that will actually launch the program and // create any files needed for the launch like the batch script static char* chpl_launch_create_command(int argc, char* argv[], int32_t numLocales) { int i; int size; char baseCommand[256]; char* command; FILE* slurmFile, *expectFile; char* account = getenv("CHPL_LAUNCHER_ACCOUNT"); char* constraint = getenv("CHPL_LAUNCHER_CONSTRAINT"); char* outputfn = getenv("CHPL_LAUNCHER_SLURM_OUTPUT_FILENAME"); char* basenamePtr = strrchr(argv[0], '/'); pid_t mypid; // command line walltime takes precedence over env var if (!walltime) { walltime = getenv("CHPL_LAUNCHER_WALLTIME"); } // command line nodelist takes precedence over env var if (!nodelist) { nodelist = getenv("CHPL_LAUNCHER_NODELIST"); } if (basenamePtr == NULL) { basenamePtr = argv[0]; } else { basenamePtr++; } chpl_compute_real_binary_name(argv[0]); if (debug) { mypid = 0; } else { mypid = getpid(); } // set the filenames sprintf(expectFilename, "%s%d", baseExpectFilename, (int)mypid); sprintf(slurmFilename, "%s%d", baseSBATCHFilename, (int)mypid); // if were running a batch job if (getenv("CHPL_LAUNCHER_USE_SBATCH") != NULL || generate_sbatch_script) { // open the batch file and create the header slurmFile = fopen(slurmFilename, "w"); fprintf(slurmFile, "#!/bin/sh\n\n"); // set the job name fprintf(slurmFile, "#SBATCH --job-name=Chpl-%.10s\n", basenamePtr); // suppress informational messages, will still display errors fprintf(slurmFile, "#SBATCH --quiet\n"); // request the number of locales, with 1 task per node, and number of cores // cpus-per-task. 
We probably don't need --nodes and --ntasks specified // since 1 task-per-node with n --tasks implies -n nodes fprintf(slurmFile, "#SBATCH --nodes=%d\n", numLocales); fprintf(slurmFile, "#SBATCH --ntasks=%d\n", numLocales); fprintf(slurmFile, "#SBATCH --ntasks-per-node=%d\n", procsPerNode); fprintf(slurmFile, "#SBATCH --cpus-per-task=%d\n", getCoresPerLocale()); //request exclusive access to nodes fprintf(slurmFile, "#SBATCH --exclusive\n"); // Set the walltime if it was specified if (walltime) { fprintf(slurmFile, "#SBATCH --time=%s\n", walltime); } // Set the nodelist if it was specified if (nodelist) { fprintf(slurmFile, "#SBATCH --nodelist=%s\n", nodelist); } // If needed a constraint can be specified with the env var CHPL_LAUNCHER_CONSTRAINT if (constraint) { fprintf(slurmFile, "#SBATCH --constraint=%s\n", constraint); } // set the account name if one was provided if (account && strlen(account) > 0) { fprintf(slurmFile, "#SBATCH --account=%s\n", account); } // set the output name to either the user specified // or to the binaryName.<jobID>.out if none specified if (outputfn!=NULL) { fprintf(slurmFile, "#SBATCH --output=%s\n", outputfn); } else { fprintf(slurmFile, "#SBATCH --output=%s.%%j.out\n", argv[0]); } // add the srun command fprintf(slurmFile, "srun %s ", chpl_get_real_binary_name()); // add any arguments passed to the launcher to the binary for (i=1; i<argc; i++) { fprintf(slurmFile, " '%s'", argv[i]); } fprintf(slurmFile, "\n"); // close the batch file and change permissions fclose(slurmFile); chmod(slurmFilename, 0755); if (generate_sbatch_script) { fprintf(stdout, "SBATCH script written to '%s'\n", slurmFilename); } // the baseCommand is what will call the batch file // that was just created sprintf(baseCommand, "sbatch %s\n", slurmFilename); } // else we're running an interactive job else { // expect is used to launch an interactive job // create the file and set some things for expect expectFile = fopen(expectFilename, "w"); fprintf(expectFile, 
"set timeout -1\n"); fprintf(expectFile, "set prompt \"(%%|#|\\\\$|>) $\"\n"); // create a silent salloc command fprintf(expectFile, "spawn -noecho srun "); // set the job name fprintf(expectFile, "--job-name=CHPL-%.10s ",basenamePtr); // suppress informational messages, will still display errors fprintf(expectFile, "--quiet "); // request the number of locales, with 1 task per node, and number of cores // cpus-per-task. We probably don't need --nodes and --ntasks specified // since 1 task-per-node with n --tasks implies -n nodes fprintf(expectFile, "--nodes=%d ",numLocales); fprintf(expectFile, "--ntasks=%d ", numLocales); fprintf(expectFile, "--ntasks-per-node=%d ", procsPerNode); fprintf(expectFile, "--cpus-per-task=%d ", getCoresPerLocale()); // request exclusive access fprintf(expectFile, "--exclusive "); // Set the walltime if i was specified if (walltime) { fprintf(expectFile, "--time=%s ", walltime); } // Set the walltime if it was specified if (nodelist) { fprintf(expectFile, "--nodelist=%s ", nodelist); } // set any constraints if (constraint) { fprintf(expectFile, " --constraint=%s ", constraint); } // set the account name if one was provided if (account && strlen(account) > 0) { fprintf(expectFile, "--account=%s ", account); } // the actual srun command fprintf(expectFile, "%s", chpl_get_real_binary_name()); // add any arguments passed to the launcher to the binary for (i=1; i<argc; i++) { fprintf(expectFile, " %s", argv[i]); } fprintf(expectFile, "\n\n"); // do some things required for expect and close the file fprintf(expectFile, "interact -o -re $prompt {return}\n"); fclose(expectFile); // the baseCommand is what will call the expect file sprintf(baseCommand, "expect %s", expectFilename); } // copy baseCommand into command and return it size = strlen(baseCommand) + 1; command = chpl_mem_allocMany(size, sizeof(char), CHPL_RT_MD_COMMAND_BUFFER, -1, ""); sprintf(command, "%s", baseCommand); if (strlen(command)+1 > size) { chpl_internal_error("buffer 
overflow"); } return command; }
static char* chpl_launch_create_command(int argc, char* argv[], int32_t numLocales) { int i; int size; char baseCommand[256]; char* command; FILE* slurmFile, *expectFile; char* projectString = getenv(launcherAccountEnvvar); char* constraint = getenv("CHPL_LAUNCHER_CONSTRAINT"); char* walltime = getenv("CHPL_LAUNCHER_WALLTIME"); char* outputfn = getenv("CHPL_LAUNCHER_SLURM_OUTPUT_FILENAME"); char* basenamePtr = strrchr(argv[0], '/'); pid_t mypid; if (basenamePtr == NULL) { basenamePtr = argv[0]; } else { basenamePtr++; } chpl_compute_real_binary_name(argv[0]); #ifndef DEBUG_LAUNCH mypid = getpid(); #else mypid = 0; #endif sprintf(sysFilename, "%s%d", baseSysFilename, (int)mypid); sprintf(expectFilename, "%s%d", baseExpectFilename, (int)mypid); sprintf(slurmFilename, "%s%d", baseSBATCHFilename, (int)mypid); if (getenv("CHPL_LAUNCHER_USE_SBATCH") != NULL) { slurmFile = fopen(slurmFilename, "w"); fprintf(slurmFile, "#!/bin/sh\n\n"); fprintf(slurmFile, "#SBATCH -J Chpl-%.10s\n", basenamePtr); genNumLocalesOptions(slurmFile, determineQsubVersion(), numLocales, getNumCoresPerLocale()); if (projectString && strlen(projectString) > 0) fprintf(slurmFile, "#SBATCH -A %s\n", projectString); if (getenv("CHPL_LAUNCHER_USE_SBATCH") != NULL) { // fprintf(slurmFile, "#SBATCH -joe\n"); if (outputfn!=NULL) fprintf(slurmFile, "#SBATCH -o %s.%%j.out\n", outputfn); else fprintf(slurmFile, "#SBATCH -o %s.%%j.out\n", argv[0]); // fprintf(slurmFile, "cd $SBATCH_O_WORKDIR\n"); fprintf(slurmFile, "%s/gasnetrun_ibv -n %d %s ", WRAP_TO_STR(LAUNCH_PATH), numLocales, chpl_get_real_binary_name()); for (i=1; i<argc; i++) { fprintf(slurmFile, " '%s'", argv[i]); } fprintf(slurmFile, "\n"); } fclose(slurmFile); chmod( slurmFilename, 0755); } if (getenv("CHPL_LAUNCHER_USE_SBATCH") == NULL) { expectFile = fopen(expectFilename, "w"); if (verbosity < 2) { // fprintf(expectFile, "log_user 0\n"); } fprintf(expectFile, "set timeout -1\n"); // fprintf(expectFile, "chmod +x %s\n",slurmFilename); 
fprintf(expectFile, "set prompt \"(%%|#|\\\\$|>) $\"\n"); // fprintf(expectFile, "spawn sbatch "); fprintf(expectFile, "spawn -noecho salloc "); fprintf(expectFile, "-J %.10s ",basenamePtr); // pass fprintf(expectFile, "-N %d ",numLocales); fprintf(expectFile, "--ntasks-per-node=1 ",numLocales); fprintf(expectFile, "--exclusive "); // give exclusive access to the nodes fprintf(expectFile, "--time=%s ",walltime); if (constraint) { fprintf(expectFile, " -C %s", constraint); } // fprintf(expectFile, "-I %s ", slurmFilename); fprintf(expectFile, " %s/gasnetrun_ibv -n %d %s ", WRAP_TO_STR(LAUNCH_PATH), numLocales, chpl_get_real_binary_name()); for (i=1; i<argc; i++) { fprintf(expectFile, " %s", argv[i]); } // fprintf(expectFile, "\\n\"\n"); fprintf(expectFile, "\n\n"); // fprintf(expectFile, "expect -re $prompt\n"); // fprintf(expectFile, "send \"cd \\$SBATCH_O_WORKDIR\\n\"\n"); // fprintf(expectFile, "expect -re $prompt\n"); // fprintf(expectFile, "sleep 10\n"); // fprintf(expectFile, "interact -o -re $prompt {return}\n"); // fprintf(expectFile, "send_user \"\\n\"\n"); // fprintf(expectFile, "send \"exit\\n\"\n"); fprintf(expectFile, "interact -o -re $prompt {return}\n"); fclose(expectFile); sprintf(baseCommand, "expect %s", expectFilename); } else { // sprintf(baseCommand, "sbatch %s\n", slurmFilename); sprintf(baseCommand, "sbatch %s\n", slurmFilename); } size = strlen(baseCommand) + 1; command = chpl_mem_allocMany(size, sizeof(char), CHPL_RT_MD_COMMAND_BUFFER, -1, ""); sprintf(command, "%s", baseCommand); if (strlen(command)+1 > size) { chpl_internal_error("buffer overflow"); } return command; }
static char* chpl_launch_create_command(int argc, char* argv[], int32_t numLocales) { int i; int size; char baseCommand[256]; char* command; FILE* pbsFile, *expectFile; char* projectString = getenv(launcherAccountEnvvar); char* basenamePtr = strrchr(argv[0], '/'); pid_t mypid; if (basenamePtr == NULL) { basenamePtr = argv[0]; } else { basenamePtr++; } chpl_compute_real_binary_name(argv[0]); #ifndef DEBUG_LAUNCH mypid = getpid(); #else mypid = 0; #endif sprintf(sysFilename, "%s%d", baseSysFilename, (int)mypid); sprintf(expectFilename, "%s%d", baseExpectFilename, (int)mypid); sprintf(pbsFilename, "%s%d", basePBSFilename, (int)mypid); pbsFile = fopen(pbsFilename, "w"); fprintf(pbsFile, "#!/bin/sh\n\n"); fprintf(pbsFile, "#PBS -N Chpl-%.10s\n", basenamePtr); genNumLocalesOptions(pbsFile, determineQsubVersion(), numLocales, getNumCoresPerLocale()); if (projectString && strlen(projectString) > 0) fprintf(pbsFile, "#PBS -A %s\n", projectString); fclose(pbsFile); expectFile = fopen(expectFilename, "w"); if (verbosity < 2) { fprintf(expectFile, "log_user 0\n"); } fprintf(expectFile, "set timeout -1\n"); fprintf(expectFile, "set prompt \"(%%|#|\\\\$|>) $\"\n"); fprintf(expectFile, "spawn qsub -z "); fprintf(expectFile, "-V "); // pass through all environment variables fprintf(expectFile, "-I %s\n", pbsFilename); fprintf(expectFile, "expect -re $prompt\n"); fprintf(expectFile, "send \"cd \\$PBS_O_WORKDIR\\n\"\n"); fprintf(expectFile, "expect -re $prompt\n"); fprintf(expectFile, "send \"%s/%s/gasnetrun_ibv -n %d -N %d", CHPL_THIRD_PARTY, WRAP_TO_STR(LAUNCH_PATH), numLocales, numLocales); propagate_charset_environment(expectFile); fprintf(expectFile, " %s ", chpl_get_real_binary_name()); for (i=1; i<argc; i++) { fprintf(expectFile, " '%s'", argv[i]); } fprintf(expectFile, "\\n\"\n"); fprintf(expectFile, "interact -o -re $prompt {return}\n"); fprintf(expectFile, "send_user \"\\n\"\n"); fprintf(expectFile, "send \"exit\\n\"\n"); fclose(expectFile); sprintf(baseCommand, "expect 
%s", expectFilename); size = strlen(baseCommand) + 1; command = chpl_mem_allocMany(size, sizeof(char), CHPL_RT_MD_COMMAND_BUFFER, -1, 0); sprintf(command, "%s", baseCommand); if (strlen(command)+1 > size) { chpl_internal_error("buffer overflow"); } return command; }
// create the command that will actually launch the program and // create any files needed for the launch like the batch script static char* chpl_launch_create_command(int argc, char* argv[], int32_t numLocales) { int i; int size; char baseCommand[MAX_COM_LEN]; char* command; FILE* slurmFile; char* account = getenv("CHPL_LAUNCHER_ACCOUNT"); char* constraint = getenv("CHPL_LAUNCHER_CONSTRAINT"); char* outputfn = getenv("CHPL_LAUNCHER_SLURM_OUTPUT_FILENAME"); char* basenamePtr = strrchr(argv[0], '/'); pid_t mypid; // For programs with large amounts of output, a lot of time can be // spent syncing the stdout buffer to the output file. This can cause // tests to run extremely slow and can cause stdout and stderr to // become mixed in odd ways since stdout is buffered but stderr isn't. // To alleviate this problem (and to allow accurate external timings // of tests) this allows the output to be "buffered" to <tmpDir> and // copied once the job is done. // // Note that this should work even for multi-locale tests since all // the output is piped through a single node. // // The *NoFmt versions are the same as the regular version, except // that instead of using slurms output formatters, they use the // corresponding env var. e.g. you have to use '--output=%j.out to // have the output file be <jobid>.out, but when we copy the tmp file // to the real output file, the %j and other formatters aren't // available so we have to use the equivalent slurm env var // (SLURM_JOB_ID.) 
The env vars can't be used when specifying --output // because they haven't been initialized yet char* bufferStdout = getenv("CHPL_LAUNCHER_SLURM_BUFFER_STDOUT"); const char* tmpDir = getTmpDir(); char stdoutFile [MAX_COM_LEN]; char stdoutFileNoFmt [MAX_COM_LEN]; char tmpStdoutFile [MAX_COM_LEN]; char tmpStdoutFileNoFmt [MAX_COM_LEN]; // command line walltime takes precedence over env var if (!walltime) { walltime = getenv("CHPL_LAUNCHER_WALLTIME"); } // command line nodelist takes precedence over env var if (!nodelist) { nodelist = getenv("CHPL_LAUNCHER_NODELIST"); } // command line partition takes precedence over env var if (!partition) { partition = getenv("CHPL_LAUNCHER_PARTITION"); } // command line exclude takes precedence over env var if (!exclude) { exclude = getenv("CHPL_LAUNCHER_EXCLUDE"); } if (basenamePtr == NULL) { basenamePtr = argv[0]; } else { basenamePtr++; } chpl_compute_real_binary_name(argv[0]); if (debug) { mypid = 0; } else { mypid = getpid(); } // Elliot, 12/02/14: TODO we have a bunch of similar commands to build up the // interactive and batch versions. It would be nicer to build up the commands // and postprocess depending on interactive vs batch. As in build up "--quiet // --nodes ..." and afterwards split on ' ' and then add #SBATCH and a // newline for batch mode and leave it as is for interactive" // if were running a batch job if (getenv("CHPL_LAUNCHER_USE_SBATCH") != NULL || generate_sbatch_script) { // set the sbatch filename sprintf(slurmFilename, "%s%d", baseSBATCHFilename, (int)mypid); // open the batch file and create the header slurmFile = fopen(slurmFilename, "w"); fprintf(slurmFile, "#!/bin/sh\n\n"); // set the job name fprintf(slurmFile, "#SBATCH --job-name=Chpl-%.10s\n", basenamePtr); // suppress informational messages, will still display errors fprintf(slurmFile, "#SBATCH --quiet\n"); // request the number of locales, with 1 task per node, and number of cores // cpus-per-task. 
We probably don't need --nodes and --ntasks specified // since 1 task-per-node with n --tasks implies -n nodes fprintf(slurmFile, "#SBATCH --nodes=%d\n", numLocales); fprintf(slurmFile, "#SBATCH --ntasks=%d\n", numLocales); fprintf(slurmFile, "#SBATCH --ntasks-per-node=%d\n", procsPerNode); fprintf(slurmFile, "#SBATCH --cpus-per-task=%d\n", getCoresPerLocale()); //request exclusive access to nodes fprintf(slurmFile, "#SBATCH --exclusive\n"); // Set the walltime if it was specified if (walltime) { fprintf(slurmFile, "#SBATCH --time=%s\n", walltime); } // Set the nodelist if it was specified if (nodelist) { fprintf(slurmFile, "#SBATCH --nodelist=%s\n", nodelist); } // Set the partition if it was specified if (partition) { fprintf(slurmFile, "#SBATCH --partition=%s\n", partition); } // Set the exclude list if it was specified if (exclude) { fprintf(slurmFile, "#SBATCH --exclude=%s\n", exclude); } // If needed a constraint can be specified with the env var CHPL_LAUNCHER_CONSTRAINT if (constraint) { fprintf(slurmFile, "#SBATCH --constraint=%s\n", constraint); } // set the account name if one was provided if (account && strlen(account) > 0) { fprintf(slurmFile, "#SBATCH --account=%s\n", account); } // set the output file name to either the user specified // name or to the binaryName.<jobID>.out if none specified if (outputfn != NULL) { sprintf(stdoutFile, "%s", outputfn); sprintf(stdoutFileNoFmt, "%s", outputfn); } else { sprintf(stdoutFile, "%s.%s.out", argv[0], "%j"); sprintf(stdoutFileNoFmt, "%s.%s.out", argv[0], "$SLURM_JOB_ID"); } // We have slurm use the real output file to capture slurm errors/timeouts // We only redirect the program output to the tmp file fprintf(slurmFile, "#SBATCH --output=%s\n", stdoutFile); // If we're buffering the output, set the temp output file name. // It's always <tmpDir>/binaryName.<jobID>.out. 
if (bufferStdout != NULL) { sprintf(tmpStdoutFile, "%s/%s.%s.out", tmpDir, argv[0], "%j"); sprintf(tmpStdoutFileNoFmt, "%s/%s.%s.out", tmpDir, argv[0], "$SLURM_JOB_ID"); } // add the srun command and the (possibly wrapped) binary name. fprintf(slurmFile, "srun --kill-on-bad-exit %s %s ", chpl_get_real_binary_wrapper(), chpl_get_real_binary_name()); // add any arguments passed to the launcher to the binary for (i=1; i<argc; i++) { fprintf(slurmFile, "'%s' ", argv[i]); } // buffer program output to the tmp stdout file if (bufferStdout != NULL) { fprintf(slurmFile, "&> %s", tmpStdoutFileNoFmt); } fprintf(slurmFile, "\n"); // After the job is run, if we buffered stdout to <tmpDir>, we need // to copy the output to the actual output file. The <tmpDir> output // will only exist on one node, ignore failures on the other nodes if (bufferStdout != NULL) { fprintf(slurmFile, "cat %s >> %s\n", tmpStdoutFileNoFmt, stdoutFileNoFmt); fprintf(slurmFile, "rm %s &> /dev/null\n", tmpStdoutFileNoFmt); } // close the batch file and change permissions fclose(slurmFile); chmod(slurmFilename, 0755); if (generate_sbatch_script) { fprintf(stdout, "SBATCH script written to '%s'\n", slurmFilename); } // the baseCommand is what will call the batch file // that was just created sprintf(baseCommand, "sbatch %s\n", slurmFilename); } // else we're running an interactive job else { char iCom[1024]; int len; len = 0; // set the job name len += sprintf(iCom+len, "--job-name=CHPL-%.10s ",basenamePtr); // suppress informational messages, will still display errors len += sprintf(iCom+len, "--quiet "); // request the number of locales, with 1 task per node, and number of cores // cpus-per-task. 
We probably don't need --nodes and --ntasks specified // since 1 task-per-node with n --tasks implies -n nodes len += sprintf(iCom+len, "--nodes=%d ",numLocales); len += sprintf(iCom+len, "--ntasks=%d ", numLocales); len += sprintf(iCom+len, "--ntasks-per-node=%d ", procsPerNode); len += sprintf(iCom+len, "--cpus-per-task=%d ", getCoresPerLocale()); // request exclusive access len += sprintf(iCom+len, "--exclusive "); // kill the job if any program instance halts with non-zero exit status len += sprintf(iCom+len, "--kill-on-bad-exit "); // Set the walltime if it was specified if (walltime) { len += sprintf(iCom+len, "--time=%s ",walltime); } // Set the nodelist if it was specified if (nodelist) { len += sprintf(iCom+len, "--nodelist=%s ", nodelist); } // Set the partition if it was specified if (partition) { len += sprintf(iCom+len, "--partition=%s ", partition); } // Set the exclude list if it was specified if (exclude) { len += sprintf(iCom+len, "--exclude=%s ", exclude); } // set any constraints if (constraint) { len += sprintf(iCom+len, " --constraint=%s ", constraint); } // set the account name if one was provided if (account && strlen(account) > 0) { len += sprintf(iCom+len, "--account=%s ", account); } // add the (possibly wrapped) binary name len += sprintf(iCom+len, "%s %s ", chpl_get_real_binary_wrapper(), chpl_get_real_binary_name()); // add any arguments passed to the launcher to the binary for (i=1; i<argc; i++) { len += sprintf(iCom+len, "%s ", argv[i]); } // launch the job using srun sprintf(baseCommand, "srun %s ", iCom); } // copy baseCommand into command and return it size = strlen(baseCommand) + 1; command = chpl_mem_allocMany(size, sizeof(char), CHPL_RT_MD_COMMAND_BUFFER, -1, 0); sprintf(command, "%s", baseCommand); if (strlen(command)+1 > size) { chpl_internal_error("buffer overflow"); } return command; }
static char* chpl_launch_create_command(int argc, char* argv[], int32_t numLocales) { int i; int size; char baseCommand[2*FILENAME_MAX]; char* command; FILE* slurmFile, *expectFile; char* projectString = getenv(launcherAccountEnvvar); char* constraint = getenv("CHPL_LAUNCHER_CONSTRAINT"); char* outputfn = getenv("CHPL_LAUNCHER_SLURM_OUTPUT_FILENAME"); char* basenamePtr = strrchr(argv[0], '/'); char* nodeAccessEnv = NULL; pid_t mypid; if (basenamePtr == NULL) { basenamePtr = argv[0]; } else { basenamePtr++; } chpl_compute_real_binary_name(argv[0]); // command line walltime takes precedence over env var if (!walltime) { walltime = getenv("CHPL_LAUNCHER_WALLTIME"); } // command line partition takes precedence over env var if (!partition) { partition = getenv("CHPL_LAUNCHER_PARTITION"); } // command line exclude list takes precedence over env var if (!exclude) { exclude = getenv("CHPL_LAUNCHER_EXCLUDE"); } // request exclusive node access by default, but allow user to override nodeAccessEnv = getenv("CHPL_LAUNCHER_NODE_ACCESS"); if (nodeAccessEnv == NULL || strcmp(nodeAccessEnv, "exclusive") == 0) { nodeAccessStr = "exclusive"; } else if (strcmp(nodeAccessEnv, "shared") == 0 || strcmp(nodeAccessEnv, "share") == 0 || strcmp(nodeAccessEnv, "oversubscribed") == 0 || strcmp(nodeAccessEnv, "oversubscribe") == 0) { nodeAccessStr = "share"; } else if (strcmp(nodeAccessEnv, "unset") == 0) { nodeAccessStr = NULL; } else { chpl_warning("unsupported 'CHPL_LAUNCHER_NODE_ACCESS' option", 0, 0); nodeAccessStr = "exclusive"; } if (debug) { mypid = 0; } else { mypid = getpid(); } sprintf(expectFilename, "%s%d", baseExpectFilename, (int)mypid); sprintf(slurmFilename, "%s%d", baseSBATCHFilename, (int)mypid); if (getenv("CHPL_LAUNCHER_USE_SBATCH") != NULL) { slurmFile = fopen(slurmFilename, "w"); fprintf(slurmFile, "#!/bin/sh\n\n"); fprintf(slurmFile, "#SBATCH -J Chpl-%.10s\n", basenamePtr); genNumLocalesOptions(slurmFile, determineSlurmVersion(), numLocales, getNumCoresPerLocale()); if 
(projectString && strlen(projectString) > 0) fprintf(slurmFile, "#SBATCH -A %s\n", projectString); if (getenv("CHPL_LAUNCHER_USE_SBATCH") != NULL) { // fprintf(slurmFile, "#SBATCH -joe\n"); if (outputfn!=NULL) fprintf(slurmFile, "#SBATCH -o %s\n", outputfn); else fprintf(slurmFile, "#SBATCH -o %s.%%j.out\n", argv[0]); // fprintf(slurmFile, "cd $SBATCH_O_WORKDIR\n"); fprintf(slurmFile, "%s/%s/gasnetrun_ibv -n %d -N %d", CHPL_THIRD_PARTY, WRAP_TO_STR(LAUNCH_PATH), numLocales, numLocales); propagate_environment(slurmFile); fprintf(slurmFile, " %s ", chpl_get_real_binary_name()); for (i=1; i<argc; i++) { fprintf(slurmFile, " '%s'", argv[i]); } fprintf(slurmFile, "\n"); } fclose(slurmFile); chmod( slurmFilename, 0755); } if (getenv("CHPL_LAUNCHER_USE_SBATCH") == NULL) { expectFile = fopen(expectFilename, "w"); if (verbosity < 2) { // fprintf(expectFile, "log_user 0\n"); } fprintf(expectFile, "set timeout -1\n"); // fprintf(expectFile, "chmod +x %s\n",slurmFilename); fprintf(expectFile, "set prompt \"(%%|#|\\\\$|>) $\"\n"); // fprintf(expectFile, "spawn sbatch "); fprintf(expectFile, "spawn -noecho salloc --quiet "); fprintf(expectFile, "-J %.10s ",basenamePtr); // pass fprintf(expectFile, "-N %d ",numLocales); fprintf(expectFile, "--ntasks-per-node=1 "); if (nodeAccessStr != NULL) fprintf(expectFile, "--%s ", nodeAccessStr); if (walltime) fprintf(expectFile, "--time=%s ",walltime); if(partition) fprintf(expectFile, "--partition=%s ",partition); if(exclude) fprintf(expectFile, "--exclude=%s ",exclude); if (constraint) { fprintf(expectFile, " -C %s", constraint); } // fprintf(expectFile, "-I %s ", slurmFilename); fprintf(expectFile, " %s/%s/gasnetrun_ibv -n %d -N %d", CHPL_THIRD_PARTY, WRAP_TO_STR(LAUNCH_PATH), numLocales, numLocales); propagate_environment(expectFile); fprintf(expectFile, " %s ", chpl_get_real_binary_name()); for (i=1; i<argc; i++) { fprintf(expectFile, " %s", argv[i]); } // fprintf(expectFile, "\\n\"\n"); fprintf(expectFile, "\n\n"); // 
fprintf(expectFile, "expect -re $prompt\n"); // fprintf(expectFile, "send \"cd \\$SBATCH_O_WORKDIR\\n\"\n"); // fprintf(expectFile, "expect -re $prompt\n"); // fprintf(expectFile, "sleep 10\n"); // fprintf(expectFile, "interact -o -re $prompt {return}\n"); // fprintf(expectFile, "send_user \"\\n\"\n"); // fprintf(expectFile, "send \"exit\\n\"\n"); fprintf(expectFile, "interact -o -re $prompt {return}\n"); fclose(expectFile); sprintf(baseCommand, "expect %s", expectFilename); } else { // sprintf(baseCommand, "sbatch %s\n", slurmFilename); sprintf(baseCommand, "sbatch %s\n", slurmFilename); } size = strlen(baseCommand) + 1; command = chpl_mem_allocMany(size, sizeof(char), CHPL_RT_MD_COMMAND_BUFFER, -1, 0); sprintf(command, "%s", baseCommand); if (strlen(command)+1 > size) { chpl_internal_error("buffer overflow"); } return command; }
static char* chpl_launch_create_command(int argc, char* argv[], int32_t numLocales) { int i; int size; char baseCommand[256]; char* command; FILE* llFile; // char* projectString = getenv(launcherAccountEnvvar); char* basenamePtr = strrchr(argv[0], '/'); pid_t mypid; if (!walltime) { chpl_error("You must specify the wall clock time limit of your job using --walltime\n" "or CHPL_LAUNCHER_WALLTIME (HH:MM:SS)", 0, NULL); } if (basenamePtr == NULL) { basenamePtr = argv[0]; } else { basenamePtr++; } chpl_compute_real_binary_name(argv[0]); if (debug) { mypid = 0; } else { mypid = getpid(); } sprintf(llFilename, "%s%d", baseLLFilename, (int)mypid); llFile = fopen(llFilename, "w"); fprintf(llFile, "#!/bin/bash\n"); if (queue) { fprintf(llFile, "# @ class = %s\n", queue); } fprintf(llFile, "# @ job_name = %s\n", basenamePtr); fprintf(llFile, "# @ initialdir = .\n"); fprintf(llFile, "# @ output = %s_%%j.out\n", basenamePtr); fprintf(llFile, "# @ error = %s_%%j.out\n", basenamePtr); fprintf(llFile, "# @ total_tasks = %d\n", numLocales); fprintf(llFile, "# @ cpus_per_task = 4\n"); fprintf(llFile, "# @ tasks_per_node = 1\n"); fprintf(llFile, "# @ wall_clock_limit = %s\n", walltime); fprintf(llFile, "\n"); #if CHPL_COMM_SUBSTRATE == udp fprintf(llFile, "MLIST=$(/opt/perf/bin/sl_get_machine_list -j=\\$SLURM_JOB_ID )\n"); fprintf(llFile, "\n"); fprintf(llFile, "export -n SSH_SERVERS\n"); fprintf(llFile, "for i in $MLIST ; do\n"); fprintf(llFile, " export SSH_SERVERS=\"$SSH_SERVERS $i\" ;\n"); fprintf(llFile, "done\n"); if (debug) { fprintf(llFile, "echo $SSH_SERVERS\n"); } fprintf(llFile, "\n"); fprintf(llFile, "%samudprun ", WRAP_TO_STR(LAUNCH_PATH)); #elif CHPL_COMM_SUBSTRATE == mpi fprintf(llFile, "export MPIRUN_CMD='srun --kill-on-bad-exit %%C'\n"); fprintf(llFile, "export MPIRUN_CMD_OK=true\n"); fprintf(llFile, "\n"); fprintf(llFile, "%sgasnetrun_mpi ", WRAP_TO_STR(LAUNCH_PATH)); #elif CHPL_COMM_SUBSTRATE == none #else #error "Unknown CHPL_COMM_SUBSTRATE" #endif #if 
CHPL_COMM_SUBSTRATE != none fprintf(llFile, "-n %d ", numLocales); #endif fprintf(llFile, "%s", chpl_get_real_binary_name()); for (i=1; i<argc; i++) { fprintf(llFile, " '%s'", argv[i]); } fprintf(llFile, " || echo -n \"\"\n"); fclose(llFile); sprintf(baseCommand, "mnsubmit %s", llFilename); size = strlen(baseCommand) + 1; command = chpl_mem_allocMany(size, sizeof(char), CHPL_RT_MD_COMMAND_BUFFER, -1, 0); sprintf(command, "%s", baseCommand); if (strlen(command)+1 > size) { chpl_internal_error("buffer overflow"); } return command; }