static void ids_count_status(char **ids, int size, int chunks, int *hold, int *pending, int *running, int *done) { int j, drmaa_errno, status; for (j = 0; j < size; j++) { if (!ids[j]) { *done += chunks; continue; /* drmaa_job_ps() won't work for already reaped jobs */ } while ((drmaa_errno = drmaa_job_ps(ids[j], &status, errorbuf, sizeof(errorbuf)-1))==DRMAA_ERRNO_DRM_COMMUNICATION_FAILURE) sleep(1); if (drmaa_errno != DRMAA_ERRNO_SUCCESS) { fprintf(stderr, "drmaa_job_ps failed: %s\n", errorbuf); exit(1); } switch (status) { case DRMAA_PS_SYSTEM_ON_HOLD: case DRMAA_PS_USER_ON_HOLD: case DRMAA_PS_USER_SYSTEM_ON_HOLD: *hold += chunks; break; case DRMAA_PS_QUEUED_ACTIVE: *pending += chunks; break; case DRMAA_PS_RUNNING: *running += chunks; break; case DRMAA_PS_DONE: case DRMAA_PS_UNDETERMINED: *done += chunks; break; default: break; } } return; }
/*
 * Look up the DRMAA state of job 'id' and print it to stdout as
 * "Job state is: <state>".
 *
 * If drmaa_job_ps() does not succeed, its diagnosis text is written to
 * stderr and the process ends with exit(-1).
 */
void print_job_status(char *id)
{
    char diagnosis[DRMAA_ERROR_STRING_BUFFER];
    int  job_state;
    int  result;

    /* -------- Check job state ------------ */
    result = drmaa_job_ps(id, &job_state, diagnosis, DRMAA_ERROR_STRING_BUFFER-1);

    if (result == DRMAA_ERRNO_SUCCESS)
    {
        /*
         * drmaa_gw_strstatus is not a DRMAA 1.0 function;
         * it is only provided by Gridway.
         */
        printf("Job state is: %s\n", drmaa_gw_strstatus(job_state));
        return;
    }

    fprintf(stderr, "drmaa_job_ps() failed: %s\n", diagnosis);
    exit(-1);
}
/*
 * Check the initial hold state of two bulk jobs, release job A and monitor
 * both jobs until drmaa_synchronize() no longer reports a timeout.
 *
 *   jobids_a  job id iterator of job A; BULK_SIZE/chunks_a ids are read and
 *             each must be reported as DRMAA_PS_USER_ON_HOLD
 *   chunks_a  chunk size of job A, must be positive
 *   jobids_b  job id iterator of job B; BULK_SIZE/chunks_b ids are read and
 *             each must be reported as DRMAA_PS_SYSTEM_ON_HOLD or
 *             DRMAA_PS_USER_SYSTEM_ON_HOLD
 *   chunks_b  chunk size of job B, must be positive
 *
 * Every failure is reported on stderr and ends the process with exit(1).
 */
static void validate_jobs(drmaa_job_ids_t *jobids_a, int chunks_a, drmaa_job_ids_t *jobids_b, int chunks_b)
{
   int j, status, drmaa_errno;
   int count_a, count_b;
   char jobid[512];
   /* drmaa_synchronize() expects a NULL terminated vector of job ids */
   const char *all_jobs[] = { "DRMAA_JOB_IDS_SESSION_ALL", NULL };
   char **ids_a, **ids_b;

   /* the chunk sizes are used as divisors below */
   if (chunks_a <= 0 || chunks_b <= 0) {
      fprintf(stderr, "validate_jobs: chunk sizes must be positive (%d, %d)\n", chunks_a, chunks_b);
      exit(1);
   }
   count_a = BULK_SIZE/chunks_a;
   count_b = BULK_SIZE/chunks_b;

   printf("JobA: chunksize %d chunks %d\n", chunks_a, count_a);
   printf("JobB: chunksize %d chunks %d\n", chunks_b, count_b);

   ids_a = (char **)malloc(sizeof(char *)*BULK_SIZE/chunks_a);
   ids_b = (char **)malloc(sizeof(char *)*BULK_SIZE/chunks_b);
   if (!ids_a || !ids_b) {
      fprintf(stderr, "malloc() failed\n");
      exit(1);
   }

   /* dup job A ids and verify user hold due to -h */
   for (j = 0; j < count_a; j++) {
      drmaa_errno = drmaa_get_next_job_id(jobids_a, jobid, sizeof(jobid)-1);
      if (drmaa_errno != DRMAA_ERRNO_SUCCESS) {
         /* this call fills no diagnosis buffer, so only the code is meaningful */
         fprintf(stderr, "drmaa_get_next_job_id failed: error code %d\n", drmaa_errno);
         exit(1);
      }

      /* tasks of job A are expected to be in user hold only */
      drmaa_errno = drmaa_job_ps(jobid, &status, errorbuf, sizeof(errorbuf)-1);
      if (drmaa_errno != DRMAA_ERRNO_SUCCESS) {
         fprintf(stderr, "drmaa_job_ps failed: %s\n", errorbuf);
         exit(1);
      }
      if (status != DRMAA_PS_USER_ON_HOLD) {
         fprintf(stderr, "drmaa_job_ps failed: didn't return DRMAA_PS_USER_ON_HOLD but %d\n", status);
         exit(1);
      }

      ids_a[j] = strdup(jobid);
      if (ids_a[j] == NULL) {
         fprintf(stderr, "strdup() failed\n");
         exit(1);
      }
   }

   /* dup job B ids and verify system hold due to -hold_jid_ad */
   for (j = 0; j < count_b; j++) {
      drmaa_errno = drmaa_get_next_job_id(jobids_b, jobid, sizeof(jobid)-1);
      if (drmaa_errno != DRMAA_ERRNO_SUCCESS) {
         /* this call fills no diagnosis buffer, so only the code is meaningful */
         fprintf(stderr, "drmaa_get_next_job_id failed: error code %d\n", drmaa_errno);
         exit(1);
      }

      /* drmaa_job_ps(3) to return either DRMAA_PS_SYSTEM_ON_HOLD or DRMAA_PS_USER_SYSTEM_ON_HOLD for
       * array tasks that are in hold due to -hold_jid_ad wc_job_list */
      drmaa_errno = drmaa_job_ps(jobid, &status, errorbuf, sizeof(errorbuf)-1);
      if (drmaa_errno != DRMAA_ERRNO_SUCCESS) {
         fprintf(stderr, "drmaa_job_ps failed: %s\n", errorbuf);
         exit(1);
      }
      if (status != DRMAA_PS_SYSTEM_ON_HOLD && status != DRMAA_PS_USER_SYSTEM_ON_HOLD) {
         fprintf(stderr, "drmaa_job_ps(%s) failed: didn't return DRMAA_PS_SYSTEM_ON_HOLD or DRMAA_PS_USER_SYSTEM_ON_HOLD but %d\n", jobid, status);
         exit(1);
      }

      ids_b[j] = strdup(jobid);
      if (ids_b[j] == NULL) {
         fprintf(stderr, "strdup() failed\n");
         exit(1);
      }
   }

   state_monitor(ids_a, chunks_a, ids_b, chunks_b);

   /* release job A */
   for (j = 0; j < count_a; j++) {
      printf("drmaa_control(%s, DRMAA_CONTROL_RELEASE)\n", ids_a[j]);
      if (drmaa_control(ids_a[j], DRMAA_CONTROL_RELEASE, errorbuf, sizeof(errorbuf)-1) != DRMAA_ERRNO_SUCCESS) {
         fprintf(stderr, "drmaa_control failed: %s\n", errorbuf);
         exit(1);
      }
   }

   /* report the job states after every synchronize timeout */
   while (drmaa_synchronize(all_jobs, 1, 0, errorbuf, sizeof(errorbuf)-1)==DRMAA_ERRNO_EXIT_TIMEOUT) {
      state_monitor(ids_a, chunks_a, ids_b, chunks_b);
   }

   state_monitor(ids_a, chunks_a, ids_b, chunks_b);

   for (j = 0; j < count_a; j++) {
      sge_free(&(ids_a[j]));
   }
   sge_free(&ids_a);

   for (j = 0; j < count_b; j++) {
      sge_free(&(ids_b[j]));
   }
   sge_free(&ids_b);
}
int main (int argc, char **argv) { char error[DRMAA_ERROR_STRING_BUFFER]; int errnum = 0; drmaa_job_template_t *jt = NULL; errnum = drmaa_init (NULL, error, DRMAA_ERROR_STRING_BUFFER); if (errnum != DRMAA_ERRNO_SUCCESS) { fprintf (stderr, "Could not initialize the DRMAA library: %s\n", error); return 1; } errnum = drmaa_allocate_job_template (&jt, error, DRMAA_ERROR_STRING_BUFFER); if (errnum != DRMAA_ERRNO_SUCCESS) { fprintf (stderr, "Could not create job template: %s\n", error); } else { errnum = drmaa_set_attribute (jt, DRMAA_REMOTE_COMMAND, "sleeper.sh", error, DRMAA_ERROR_STRING_BUFFER); if (errnum != DRMAA_ERRNO_SUCCESS) { fprintf (stderr, "Could not set attribute \"%s\": %s\n", DRMAA_REMOTE_COMMAND, error); } else { const char *args[2] = {"60", NULL}; errnum = drmaa_set_vector_attribute (jt, DRMAA_V_ARGV, args, error, DRMAA_ERROR_STRING_BUFFER); } if (errnum != DRMAA_ERRNO_SUCCESS) { fprintf (stderr, "Could not set attribute \"%s\": %s\n", DRMAA_REMOTE_COMMAND, error); } else { char jobid[DRMAA_JOBNAME_BUFFER]; errnum = drmaa_run_job (jobid, DRMAA_JOBNAME_BUFFER, jt, error, DRMAA_ERROR_STRING_BUFFER); if (errnum != DRMAA_ERRNO_SUCCESS) { fprintf (stderr, "Could not submit job: %s\n", error); } else { int status = 0; printf ("Your job has been submitted with id %s\n", jobid); sleep (20); errnum = drmaa_job_ps (jobid, &status, error, DRMAA_ERROR_STRING_BUFFER); if (errnum != DRMAA_ERRNO_SUCCESS) { fprintf (stderr, "Could not get job' status: %s\n", error); } else { switch (status) { case DRMAA_PS_UNDETERMINED: printf ("Job status cannot be determined\n"); break; case DRMAA_PS_QUEUED_ACTIVE: printf ("Job is queued and active\n"); break; case DRMAA_PS_SYSTEM_ON_HOLD: printf ("Job is queued and in system hold\n"); break; case DRMAA_PS_USER_ON_HOLD: printf ("Job is queued and in user hold\n"); break; case DRMAA_PS_USER_SYSTEM_ON_HOLD: printf ("Job is queued and in user and system hold\n"); break; case DRMAA_PS_RUNNING: printf ("Job is running\n"); break; case 
DRMAA_PS_SYSTEM_SUSPENDED: printf ("Job is system suspended\n"); break; case DRMAA_PS_USER_SUSPENDED: printf ("Job is user suspended\n"); break; case DRMAA_PS_USER_SYSTEM_SUSPENDED: printf ("Job is user and system suspended\n"); break; case DRMAA_PS_DONE: printf ("Job finished normally\n"); break; case DRMAA_PS_FAILED: printf ("Job finished, but failed\n"); break; } /* switch */ } /* else */ } /* else */ } /* else */ errnum = drmaa_delete_job_template (jt, error, DRMAA_ERROR_STRING_BUFFER); if (errnum != DRMAA_ERRNO_SUCCESS) { fprintf (stderr, "Could not delete job template: %s\n", error); } } /* else */ errnum = drmaa_exit (error, DRMAA_ERROR_STRING_BUFFER); if (errnum != DRMAA_ERRNO_SUCCESS) { fprintf (stderr, "Could not shut down the DRMAA library: %s\n", error); return 1; } return 0; }
void *run(void *arg) { int ret = DRMAA_ERRNO_SUCCESS; char error[DRMAA_ERROR_STRING_BUFFER + 1]; drmaa_job_template_t *jt = NULL; int run = ((int *)arg)[0]; int thread = ((int *)arg)[1]; char jobid[DRMAA_JOBNAME_BUFFER + 1]; int queued = 1; int running = 0; int status = -1; free(arg); ret = drmaa_allocate_job_template(&jt, error, DRMAA_ERROR_STRING_BUFFER); if (handle_code(ret, error, run, thread) == 1) { return NULL; } ret = drmaa_set_attribute(jt, DRMAA_REMOTE_COMMAND, CMD, error, DRMAA_ERROR_STRING_BUFFER); if (handle_code(ret, error, run, thread) == 1) { return NULL; } ret = drmaa_set_attribute(jt, DRMAA_WD, WD, error, DRMAA_ERROR_STRING_BUFFER); if (handle_code(ret, error, run, thread) == 1) { return NULL; } ret = drmaa_set_attribute(jt, DRMAA_JOB_CATEGORY, CATEGORY, error, DRMAA_ERROR_STRING_BUFFER); if (handle_code(ret, error, run, thread) == 1) { return NULL; } printf("%d %d SETUP complete %ld\n", run, thread, time(NULL)); ret = drmaa_run_job(jobid, DRMAA_JOBNAME_BUFFER, jt, error, DRMAA_ERROR_STRING_BUFFER); if (handle_code(ret, error, run, thread) == 1) { return NULL; } printf("%d %d SUBMITTED jobid: %s %ld\n", run, thread, jobid, time(NULL)); ret = drmaa_delete_job_template(jt, error, DRMAA_ERROR_STRING_BUFFER); handle_code(ret, error, run, thread); while (queued) { ret = drmaa_wait(jobid, NULL, 0, NULL, 2, NULL, error, DRMAA_ERROR_STRING_BUFFER); if (ret != DRMAA_ERRNO_EXIT_TIMEOUT) { if (handle_code(ret, error, run, thread) == 1) { return NULL; } } else { printf ("%d %d TIMEOUT jobid: %s %ld\n", run, thread, jobid, time (NULL)); } ret = drmaa_job_ps(jobid, &status, error, DRMAA_ERROR_STRING_BUFFER); if (handle_code(ret, error, run, thread) == 1) { return NULL; } queued = (status == DRMAA_PS_QUEUED_ACTIVE) || (status == DRMAA_PS_SYSTEM_ON_HOLD) || (status == DRMAA_PS_SYSTEM_ON_HOLD) || (status == DRMAA_PS_USER_ON_HOLD) || (status == DRMAA_PS_USER_SYSTEM_ON_HOLD); } printf("%d %d RUNNING jobid: %s %ld\n", run, thread, jobid, time(NULL)); running = 1; 
while (running == 1) { ret = drmaa_wait(jobid, NULL, 0, NULL, 60, NULL, error, DRMAA_ERROR_STRING_BUFFER); if (ret != DRMAA_ERRNO_EXIT_TIMEOUT) { if (handle_code(ret, error, run, thread) == 1) { return NULL; } running = 0; printf("%d %d FINISHED jobid: %s %ld\n", run, thread, jobid, time(NULL)); } else { printf ("%d %d TIMEOUT jobid: %s %ld\n", run, thread, jobid, time (NULL)); ret = drmaa_job_ps(jobid, &status, error, DRMAA_ERROR_STRING_BUFFER); if (handle_code(ret, error, run, thread) == 1) { return NULL; } if (status != DRMAA_PS_RUNNING) { running = 0; printf("%d %d HUNG jobid: %s %ld\n", run, thread, jobid, time(NULL)); } } } return NULL; }