/*
 * Set up the GPU proxy service if requested to do so through the
 * CRAY_CUDA_MPS or CRAY_CUDA_PROXY environment variables.
 *
 * IN job - step record passed to _get_mps_request() to obtain the request
 * RET SLURM_SUCCESS if no action is required or both alpsc calls return 1,
 *     SLURM_ERROR if either alpsc call returns anything other than 1.
 */
int setup_gpu(stepd_step_rec_t *job)
{
	int rc, gpu_enable;
	/*
	 * Start from NULL (as write_iaa_file() does) so the pointer is never
	 * indeterminate if an alpsc call returns without storing a message.
	 * NOTE(review): ALPSC_CN_DEBUG is defined outside this block and is
	 * given only the call name; it presumably inspects rc and err_msg by
	 * name -- confirm against its definition.
	 */
	char *err_msg = NULL;

	gpu_enable = _get_mps_request(job);
	if (gpu_enable > 1) {
		// No action required, just exit with success
		return SLURM_SUCCESS;
	}

	// Establish GPU's default state
	// NOTE: We have to redo this for every job because the job_init call
	// is made from the stepd, so the default state in the slurmd is wiped
	debug2("Getting default GPU mps state");
	rc = alpsc_establish_GPU_mps_def_state(&err_msg);
	ALPSC_CN_DEBUG("alpsc_establish_GPU_mps_def_state");
	if (rc != 1) {
		return SLURM_ERROR;
	}

	// Do not let a stale pointer from the first call leak into the
	// second call's error reporting.
	err_msg = NULL;

	// If the request is different than the default, perform the
	// required action.
	debug2("Setting GPU mps state to %d prior to launch", gpu_enable);
	rc = alpsc_pre_launch_GPU_mps(&err_msg, gpu_enable);
	ALPSC_CN_DEBUG("alpsc_pre_launch_GPU_mps");
	if (rc != 1) {
		return SLURM_ERROR;
	}

	return SLURM_SUCCESS;
}
/*
 * Reset the gpu to its default state after the job completes.
 *
 * IN job - step record passed to _get_mps_request() to obtain the request
 * RET SLURM_SUCCESS if no action is required or
 *     alpsc_post_launch_GPU_mps() returns 1, SLURM_ERROR otherwise.
 */
int reset_gpu(stepd_step_rec_t *job)
{
	int rc, gpu_enable;
	/*
	 * Start from NULL (as write_iaa_file() does) so the pointer is never
	 * indeterminate if the alpsc call returns without storing a message.
	 * NOTE(review): ALPSC_CN_DEBUG is defined outside this block and is
	 * given only the call name; it presumably inspects rc and err_msg by
	 * name -- confirm against its definition.
	 */
	char *err_msg = NULL;

	gpu_enable = _get_mps_request(job);
	if (gpu_enable > 1) {
		// No action required, return with success.
		return SLURM_SUCCESS;
	}

	debug2("Resetting GPU mps state from %d after launch", gpu_enable);
	rc = alpsc_post_launch_GPU_mps(&err_msg, gpu_enable);
	ALPSC_CN_DEBUG("alpsc_post_launch_GPU_mps");
	if (rc != 1) {
		return SLURM_ERROR;
	}

	return SLURM_SUCCESS;
}
/*
 * Write the IAA file and set the filename in the job's environment.
 *
 * The file name is built from the CRAY_IAA_FILE format and sw_job->apid.
 * Three steps run in order and the first failure aborts the rest:
 *   1. alpsc_write_iaa_info() writes the file,
 *   2. chown() hands the file to job->uid / job->gid,
 *   3. env_array_overwrite() stores the name under CRAY_IAA_INFO_FILE_ENV
 *      in job->env.
 *
 * RET SLURM_SUCCESS only if all three steps succeed, SLURM_ERROR otherwise.
 *     The locally built file name is released with xfree() on every path.
 */
int write_iaa_file(stepd_step_rec_t *job, slurm_cray_jobinfo_t *sw_job,
		   int *ptags, int num_ptags, alpsc_peInfo_t *alpsc_pe_info)
{
	char *iaa_path = xstrdup_printf(CRAY_IAA_FILE, sw_job->apid);
	char *err_msg = NULL;
	int status = SLURM_ERROR;
	int rc;

	/* Step 1: have alpsc emit the IAA file */
	rc = alpsc_write_iaa_info(&err_msg, iaa_path, sw_job->num_cookies,
				  (const char **)sw_job->cookies,
				  num_ptags, ptags, alpsc_pe_info);
	ALPSC_CN_DEBUG("alpsc_write_iaa_info");
	if (rc != 1)
		goto cleanup;

	/* Step 2: give ownership of the file to the job user */
	rc = chown(iaa_path, job->uid, job->gid);
	if (rc == -1) {
		CRAY_ERR("chown(%s, %d, %d) failed: %m",
			 iaa_path, (int)job->uid, (int)job->gid);
		goto cleanup;
	}

	/* Step 3: publish the file name through the job environment */
	rc = env_array_overwrite(&job->env, CRAY_IAA_INFO_FILE_ENV, iaa_path);
	if (!rc) {
		CRAY_ERR("Failed to set env variable %s",
			 CRAY_IAA_INFO_FILE_ENV);
		goto cleanup;
	}

	status = SLURM_SUCCESS;

cleanup:
	xfree(iaa_path);
	return status;
}