int main(int argc, char *argv[]) { char diagnosis[DRMAA_ERROR_STRING_BUFFER]; const char *all_jobids[NBULKS*JOB_CHUNK + JOB_CHUNK+1]; char jobid[100]; int drmaa_errno, i, pos = 0; const char *job_path; drmaa_job_template_t *jt; if (argc<2) { fprintf(stderr, "usage: example <path-to-job>\n"); return 1; } job_path = argv[1]; if (drmaa_init(NULL, diagnosis, sizeof(diagnosis)-1) != DRMAA_ERRNO_SUCCESS) { fprintf(stderr, "drmaa_init() failed: %s\n", diagnosis); return 1; } /* * submit some bulk jobs */ if (!(jt = create_job_template(job_path, 5, 1))) { fprintf(stderr, "create_job_template() failed\n"); return 1; } for (i=0; i<NBULKS; i++) { drmaa_job_ids_t *jobids; int j; while ((drmaa_errno=drmaa_run_bulk_jobs(&jobids, jt, 1, JOB_CHUNK, 1, diagnosis, sizeof(diagnosis)-1))==DRMAA_ERRNO_DRM_COMMUNICATION_FAILURE) { fprintf(stderr, "drmaa_run_bulk_jobs() failed - retry: %s %s\n", diagnosis, drmaa_strerror(drmaa_errno)); sleep(1); } if (drmaa_errno != DRMAA_ERRNO_SUCCESS) { fprintf(stderr, "drmaa_run_bulk_jobs() failed: %s %s\n", diagnosis, drmaa_strerror(drmaa_errno)); return 1; } printf("submitted bulk job with jobids:\n"); for (j=0; j<JOB_CHUNK; j++) { drmaa_get_next_job_id(jobids, jobid, sizeof(jobid)-1); all_jobids[pos++] = strdup(jobid); printf("\t \"%s\"\n", jobid); } drmaa_release_job_ids(jobids); } drmaa_delete_job_template(jt, NULL, 0); /* * submit some sequential jobs */ if (!(jt = create_job_template(job_path, 5, 0))) { fprintf(stderr, "create_sleeper_job_template() failed\n"); return 1; } for (i=0; i<JOB_CHUNK; i++) { while ((drmaa_errno=drmaa_run_job(jobid, sizeof(jobid)-1, jt, diagnosis, sizeof(diagnosis)-1)) == DRMAA_ERRNO_DRM_COMMUNICATION_FAILURE) { fprintf(stderr, "drmaa_run_job() failed - retry: %s\n", diagnosis); sleep(1); } if (drmaa_errno != DRMAA_ERRNO_SUCCESS) { fprintf(stderr, "drmaa_run_job() failed: %s\n", diagnosis); return 1; } printf("\t \"%s\"\n", jobid); all_jobids[pos++] = strdup(jobid); } /* set string array end mark */ all_jobids[pos] = 
NULL; drmaa_delete_job_template(jt, NULL, 0); /* * synchronize with all jobs */ drmaa_errno = drmaa_synchronize(all_jobids, DRMAA_TIMEOUT_WAIT_FOREVER, 0, diagnosis, sizeof(diagnosis)-1); if (drmaa_errno != DRMAA_ERRNO_SUCCESS) { fprintf(stderr, "drmaa_synchronize(DRMAA_JOB_IDS_SESSION_ALL, dispose) failed: %s\n", diagnosis); return 1; } printf("synchronized with all jobs\n"); /* * wait all those jobs */ for (pos=0; pos<NBULKS*JOB_CHUNK + JOB_CHUNK; pos++) { int stat; int aborted, exited, exit_status, signaled; drmaa_errno = drmaa_wait(all_jobids[pos], jobid, sizeof(jobid)-1, &stat, DRMAA_TIMEOUT_WAIT_FOREVER, NULL, diagnosis, sizeof(diagnosis)-1); if (drmaa_errno != DRMAA_ERRNO_SUCCESS) { fprintf(stderr, "drmaa_wait(%s) failed: %s\n", all_jobids[pos], diagnosis); return 1; } /* * report how job finished */ drmaa_wifaborted(&aborted, stat, NULL, 0); if (aborted) printf("job \"%s\" never ran\n", all_jobids[pos]); else { drmaa_wifexited(&exited, stat, NULL, 0); if (exited) { drmaa_wexitstatus(&exit_status, stat, NULL, 0); printf("job \"%s\" finished regularly with exit status %d\n", all_jobids[pos], exit_status); } else { drmaa_wifsignaled(&signaled, stat, NULL, 0); if (signaled) { char termsig[DRMAA_SIGNAL_BUFFER+1]; drmaa_wtermsig(termsig, DRMAA_SIGNAL_BUFFER, stat, NULL, 0); printf("job \"%s\" finished due to signal %s\n", all_jobids[pos], termsig); } else printf("job \"%s\" finished with unclear conditions\n", all_jobids[pos]); } } } if (drmaa_exit(diagnosis, sizeof(diagnosis)-1) != DRMAA_ERRNO_SUCCESS) { fprintf(stderr, "drmaa_exit() failed: %s\n", diagnosis); return 1; } return 0; }
/*
 * Verifies the hold states of two bulk jobs and monitors them until they
 * are done.
 *
 * jobids_a / jobids_b  job id iterators of job A and job B; BULK_SIZE/chunks_a
 *                      and BULK_SIZE/chunks_b ids are read from them
 * chunks_a / chunks_b  chunk size of each job, must be greater than zero
 *
 * Every task of job A must report DRMAA_PS_USER_ON_HOLD, every task of job B
 * DRMAA_PS_SYSTEM_ON_HOLD or DRMAA_PS_USER_SYSTEM_ON_HOLD. Job A is then
 * released and state_monitor() is called after each one second synchronize
 * timeout until synchronization no longer times out.
 *
 * Any failure other than a final synchronize error terminates the process
 * with exit(1). Diagnosis text is written to the file-scope errorbuf.
 */
static void validate_jobs(drmaa_job_ids_t *jobids_a, int chunks_a, drmaa_job_ids_t *jobids_b, int chunks_b)
{
   int j, status, drmaa_errno;
   int count_a, count_b;
   char jobid[512];
   /* drmaa_synchronize() walks the list up to a NULL entry, so terminate it */
   const char *all_jobs[] = { "DRMAA_JOB_IDS_SESSION_ALL", NULL };
   char **ids_a, **ids_b;

   /* the chunk sizes are used as divisors below */
   if (chunks_a <= 0 || chunks_b <= 0) {
      fprintf(stderr, "invalid chunk size: %d / %d\n", chunks_a, chunks_b);
      exit(1);
   }
   count_a = BULK_SIZE/chunks_a;
   count_b = BULK_SIZE/chunks_b;

   printf("JobA: chunksize %d chunks %d\n", chunks_a, count_a);
   printf("JobB: chunksize %d chunks %d\n", chunks_b, count_b);

   ids_a = malloc(sizeof(char *)*BULK_SIZE/chunks_a);
   ids_b = malloc(sizeof(char *)*BULK_SIZE/chunks_b);
   if (!ids_a || !ids_b) {
      fprintf(stderr, "malloc() failed\n");
      exit(1);
   }

   /* dup job A ids and verify user hold due to -h */
   for (j = 0; j < count_a; j++) {
      drmaa_errno = drmaa_get_next_job_id(jobids_a, jobid, sizeof(jobid)-1);
      if (drmaa_errno != DRMAA_ERRNO_SUCCESS) {
         /* this call takes no diagnosis buffer, so report the error code */
         fprintf(stderr, "drmaa_get_next_job_id failed: error %d\n", drmaa_errno);
         exit(1);
      }

      drmaa_errno = drmaa_job_ps(jobid, &status, errorbuf, sizeof(errorbuf)-1);
      if (drmaa_errno != DRMAA_ERRNO_SUCCESS) {
         fprintf(stderr, "drmaa_job_ps failed: %s\n", errorbuf);
         exit(1);
      }
      if (status != DRMAA_PS_USER_ON_HOLD) {
         fprintf(stderr, "drmaa_job_ps failed: didn't return DRMAA_PS_USER_ON_HOLD but %d\n", status);
         exit(1);
      }

      ids_a[j] = strdup(jobid);
      if (ids_a[j] == NULL) {
         fprintf(stderr, "strdup() failed\n");
         exit(1);
      }
   }

   /* dup job B ids and verify system hold due to -hold_jid_ad */
   for (j = 0; j < count_b; j++) {
      drmaa_errno = drmaa_get_next_job_id(jobids_b, jobid, sizeof(jobid)-1);
      if (drmaa_errno != DRMAA_ERRNO_SUCCESS) {
         fprintf(stderr, "drmaa_get_next_job_id failed: error %d\n", drmaa_errno);
         exit(1);
      }

      /* drmaa_job_ps(3) to return either DRMAA_PS_SYSTEM_ON_HOLD or DRMAA_PS_USER_SYSTEM_ON_HOLD for
       * array tasks that are in hold due to -hold_jid_ad wc_job_list */
      drmaa_errno = drmaa_job_ps(jobid, &status, errorbuf, sizeof(errorbuf)-1);
      if (drmaa_errno != DRMAA_ERRNO_SUCCESS) {
         fprintf(stderr, "drmaa_job_ps failed: %s\n", errorbuf);
         exit(1);
      }
      if (status != DRMAA_PS_SYSTEM_ON_HOLD && status != DRMAA_PS_USER_SYSTEM_ON_HOLD) {
         fprintf(stderr, "drmaa_job_ps(%s) failed: didn't return DRMAA_PS_SYSTEM_ON_HOLD or DRMAA_PS_USER_SYSTEM_ON_HOLD but %d\n", jobid, status);
         exit(1);
      }

      ids_b[j] = strdup(jobid);
      if (ids_b[j] == NULL) {
         fprintf(stderr, "strdup() failed\n");
         exit(1);
      }
   }

   state_monitor(ids_a, chunks_a, ids_b, chunks_b);

   /* release job A */
   for (j = 0; j < count_a; j++) {
      printf("drmaa_control(%s, DRMAA_CONTROL_RELEASE)\n", ids_a[j]);
      if (drmaa_control(ids_a[j], DRMAA_CONTROL_RELEASE, errorbuf, sizeof(errorbuf)-1) != DRMAA_ERRNO_SUCCESS) {
         fprintf(stderr, "drmaa_control failed: %s\n", errorbuf);
         exit(1);
      }
   }

   /* poll with a one second timeout so the states can be shown in between */
   while ((drmaa_errno = drmaa_synchronize(all_jobs, 1, 0, errorbuf, sizeof(errorbuf)-1)) == DRMAA_ERRNO_EXIT_TIMEOUT) {
      state_monitor(ids_a, chunks_a, ids_b, chunks_b);
   }
   if (drmaa_errno != DRMAA_ERRNO_SUCCESS) {
      /* reported but deliberately not fatal: the final state is still shown */
      fprintf(stderr, "drmaa_synchronize failed: %s\n", errorbuf);
   }

   state_monitor(ids_a, chunks_a, ids_b, chunks_b);

   for (j = 0; j < count_a; j++) {
      sge_free(&(ids_a[j]));
   }
   sge_free(&ids_a);
   for (j = 0; j < count_b; j++) {
      sge_free(&(ids_b[j]));
   }
   sge_free(&ids_b);
}
int launch (const char* script, const char* arg_vec[], int num_tasks) { char error[DRMAA_ERROR_STRING_BUFFER]; int errnum = 0; drmaa_job_template_t *jt = NULL; errnum = drmaa_init (NULL, error, DRMAA_ERROR_STRING_BUFFER); if (errnum != DRMAA_ERRNO_SUCCESS) { fprintf (stderr, "Could not initialize the DRMAA library: %s\n", error); return 1; } errnum = drmaa_allocate_job_template (&jt, error, DRMAA_ERROR_STRING_BUFFER); if (errnum != DRMAA_ERRNO_SUCCESS) { fprintf (stderr, "Could not create job template: %s\n", error); } else { errnum = drmaa_set_attribute (jt, DRMAA_REMOTE_COMMAND, script, error, DRMAA_ERROR_STRING_BUFFER); if (errnum != DRMAA_ERRNO_SUCCESS) { fprintf (stderr, "Could not set attribute \"%s\": %s\n", DRMAA_REMOTE_COMMAND, error); } else { errnum = drmaa_set_vector_attribute (jt, DRMAA_V_ARGV, arg_vec, error, DRMAA_ERROR_STRING_BUFFER); } if (errnum != DRMAA_ERRNO_SUCCESS) { fprintf (stderr, "Could not set attribute \"%s\": %s\n", DRMAA_REMOTE_COMMAND, error); }else { drmaa_job_ids_t *ids = NULL; errnum = drmaa_run_bulk_jobs (&ids, jt, 1, num_tasks, 1, error, DRMAA_ERROR_STRING_BUFFER); if (errnum != DRMAA_ERRNO_SUCCESS) { fprintf (stderr, "Could not submit job: %s\n", error); }else { char jobid[DRMAA_JOBNAME_BUFFER]; const char *jobids[2] = {DRMAA_JOB_IDS_SESSION_ALL, NULL}; while (drmaa_get_next_job_id (ids, jobid, DRMAA_JOBNAME_BUFFER) == DRMAA_ERRNO_SUCCESS) { printf ("A job task has been submitted with id %s\n", jobid); } errnum = drmaa_synchronize (jobids, DRMAA_TIMEOUT_WAIT_FOREVER, 1, error, DRMAA_ERROR_STRING_BUFFER); if (errnum != DRMAA_ERRNO_SUCCESS) { fprintf (stderr, "Could not wait for jobs: %s\n", error); }else { printf ("All job tasks have finished.\n"); } } drmaa_release_job_ids (ids); } /* else */ errnum = drmaa_delete_job_template (jt, error, DRMAA_ERROR_STRING_BUFFER); if (errnum != DRMAA_ERRNO_SUCCESS) { fprintf (stderr, "Could not delete job template: %s\n", error); } } /* else */ errnum = drmaa_exit (error, 
DRMAA_ERROR_STRING_BUFFER); if (errnum != DRMAA_ERRNO_SUCCESS) { fprintf (stderr, "Could not shut down the DRMAA library: %s\n", error); return 1; } return 0; }