예제 #1
0
/*
 * Example DRMAA client.
 *
 * Submits NBULKS bulk jobs (tasks 1..JOB_CHUNK each) followed by JOB_CHUNK
 * single jobs, all created from the job given as argv[1], then synchronizes
 * with every submitted job, reaps each one with drmaa_wait() and prints how
 * it finished.
 *
 * Returns 0 on success, 1 on a usage error or on any reported failure.
 * Once drmaa_init() has succeeded, every exit path goes through the common
 * cleanup code so the job template is deleted and drmaa_exit() is called.
 *
 * The duplicated job id strings are not freed; they stay valid until the
 * process exits.
 */
int main(int argc, char *argv[])
{
   char diagnosis[DRMAA_ERROR_STRING_BUFFER] = "";
   const char *all_jobids[NBULKS*JOB_CHUNK + JOB_CHUNK+1];
   char jobid[100] = "";
   int drmaa_errno, i, pos = 0;
   int njobs;
   int rc = 1;   /* pessimistic default, cleared only after all jobs were reaped */
   const char *job_path;
   drmaa_job_template_t *jt = NULL;

   if (argc<2) {
      fprintf(stderr, "usage: example <path-to-job>\n");
      return 1;
   }
   job_path = argv[1];

   if (drmaa_init(NULL, diagnosis, sizeof(diagnosis)-1) != DRMAA_ERRNO_SUCCESS) {
      fprintf(stderr, "drmaa_init() failed: %s\n", diagnosis);
      return 1;   /* no session was opened, so there is nothing to clean up */
   }

   /*
    *   submit some bulk jobs
    *   (create_job_template() is defined elsewhere; the meaning of its
    *   second and third argument is not visible from this function)
    */
   if (!(jt = create_job_template(job_path, 5, 1))) {
      fprintf(stderr, "create_job_template() failed\n");
      goto cleanup;
   }
   for (i=0; i<NBULKS; i++) {
      drmaa_job_ids_t *jobids = NULL;
      int j;

      /* retry for as long as the failure is a communication problem */
      while ((drmaa_errno=drmaa_run_bulk_jobs(&jobids, jt, 1, JOB_CHUNK, 1, diagnosis,
                 sizeof(diagnosis)-1))==DRMAA_ERRNO_DRM_COMMUNICATION_FAILURE) {
         fprintf(stderr, "drmaa_run_bulk_jobs() failed - retry: %s %s\n", diagnosis, drmaa_strerror(drmaa_errno));
         sleep(1);
      }
      if (drmaa_errno != DRMAA_ERRNO_SUCCESS) {
         fprintf(stderr, "drmaa_run_bulk_jobs() failed: %s %s\n", diagnosis, drmaa_strerror(drmaa_errno));
         goto cleanup;
      }

      printf("submitted bulk job with jobids:\n");
      for (j=0; j<JOB_CHUNK; j++) {
         /* without this check a failure would make us record a stale id */
         if (drmaa_get_next_job_id(jobids, jobid, sizeof(jobid)-1) != DRMAA_ERRNO_SUCCESS) {
            fprintf(stderr, "drmaa_get_next_job_id() failed\n");
            drmaa_release_job_ids(jobids);
            goto cleanup;
         }
         /* a NULL entry would silently cut the id list short later on */
         if (!(all_jobids[pos] = strdup(jobid))) {
            fprintf(stderr, "strdup() failed\n");
            drmaa_release_job_ids(jobids);
            goto cleanup;
         }
         pos++;
         printf("\t \"%s\"\n", jobid);
      }
      drmaa_release_job_ids(jobids);
   }
   drmaa_delete_job_template(jt, NULL, 0);
   jt = NULL;

   /*
    *   submit some sequential jobs
    */
   if (!(jt = create_job_template(job_path, 5, 0))) {
      fprintf(stderr, "create_job_template() failed\n");
      goto cleanup;
   }
   for (i=0; i<JOB_CHUNK; i++) {
      while ((drmaa_errno=drmaa_run_job(jobid, sizeof(jobid)-1, jt, diagnosis,
               sizeof(diagnosis)-1)) == DRMAA_ERRNO_DRM_COMMUNICATION_FAILURE) {
         fprintf(stderr, "drmaa_run_job() failed - retry: %s\n", diagnosis);
         sleep(1);
      }
      if (drmaa_errno != DRMAA_ERRNO_SUCCESS) {
         fprintf(stderr, "drmaa_run_job() failed: %s\n", diagnosis);
         goto cleanup;
      }
      printf("\t \"%s\"\n", jobid);
      if (!(all_jobids[pos] = strdup(jobid))) {
         fprintf(stderr, "strdup() failed\n");
         goto cleanup;
      }
      pos++;
   }

   /* set string array end mark */
   all_jobids[pos] = NULL;
   njobs = pos;

   drmaa_delete_job_template(jt, NULL, 0);
   jt = NULL;

   /*
    *   synchronize with all jobs; dispose is 0 so that the job information
    *   is still available to the drmaa_wait() calls below
    */
   drmaa_errno = drmaa_synchronize(all_jobids, DRMAA_TIMEOUT_WAIT_FOREVER, 0, diagnosis, sizeof(diagnosis)-1);
   if (drmaa_errno != DRMAA_ERRNO_SUCCESS) {
      fprintf(stderr, "drmaa_synchronize() failed: %s\n", diagnosis);
      goto cleanup;
   }
   printf("synchronized with all jobs\n");

   /*
    *   wait all those jobs
    */
   for (pos=0; pos<njobs; pos++) {
      int stat = 0;
      int aborted = 0, exited = 0, exit_status = 0, signaled = 0;

      drmaa_errno = drmaa_wait(all_jobids[pos], jobid, sizeof(jobid)-1,
         &stat, DRMAA_TIMEOUT_WAIT_FOREVER, NULL, diagnosis, sizeof(diagnosis)-1);

      if (drmaa_errno != DRMAA_ERRNO_SUCCESS) {
         fprintf(stderr, "drmaa_wait(%s) failed: %s\n", all_jobids[pos], diagnosis);
         goto cleanup;
      }

      /*
       * report how job finished
       */
      drmaa_wifaborted(&aborted, stat, NULL, 0);
      if (aborted)
         printf("job \"%s\" never ran\n", all_jobids[pos]);
      else {
         drmaa_wifexited(&exited, stat, NULL, 0);
         if (exited) {
            drmaa_wexitstatus(&exit_status, stat, NULL, 0);
            printf("job \"%s\" finished regularly with exit status %d\n",
                  all_jobids[pos], exit_status);
         } else {
            drmaa_wifsignaled(&signaled, stat, NULL, 0);
            if (signaled) {
               char termsig[DRMAA_SIGNAL_BUFFER+1] = "";
               drmaa_wtermsig(termsig, DRMAA_SIGNAL_BUFFER, stat, NULL, 0);
               printf("job \"%s\" finished due to signal %s\n",
                  all_jobids[pos], termsig);
            } else
               printf("job \"%s\" finished with unclear conditions\n",
                  all_jobids[pos]);
         }
      }
   }

   rc = 0;

cleanup:
   /* jt is non-NULL only when we bailed out while a template was alive */
   if (jt != NULL) {
      drmaa_delete_job_template(jt, NULL, 0);
   }

   if (drmaa_exit(diagnosis, sizeof(diagnosis)-1) != DRMAA_ERRNO_SUCCESS) {
      fprintf(stderr, "drmaa_exit() failed: %s\n", diagnosis);
      rc = 1;
   }

   return rc;
}
예제 #2
0
/*
 * Verifies the hold state of two bulk jobs, releases job A and monitors both
 * jobs until drmaa_synchronize() stops reporting a timeout.
 *
 * jobids_a / jobids_b  id iterators of job A and job B; BULK_SIZE/chunks_a
 *                      resp. BULK_SIZE/chunks_b ids are read from them
 * chunks_a / chunks_b  chunk sizes of the two jobs, must be > 0
 *
 * Every job A id must be in DRMAA_PS_USER_ON_HOLD and every job B id in
 * DRMAA_PS_SYSTEM_ON_HOLD or DRMAA_PS_USER_SYSTEM_ON_HOLD. Any violation or
 * DRMAA/allocation failure prints a message to stderr and ends the process
 * with exit(1).
 */
static void
validate_jobs(drmaa_job_ids_t *jobids_a, int chunks_a, drmaa_job_ids_t *jobids_b, int chunks_b)
{
   int j, status, drmaa_errno;
   int count_a, count_b;
   char jobid[512];
   /* drmaa_synchronize() is given a NULL-terminated id list */
   const char *all_jobs[] = { "DRMAA_JOB_IDS_SESSION_ALL", NULL };
   char **ids_a, **ids_b;

   /* the chunk sizes are used as divisors below */
   if (chunks_a <= 0 || chunks_b <= 0) {
      fprintf(stderr, "invalid chunk size: %d / %d\n", chunks_a, chunks_b);
      exit(1);
   }
   count_a = BULK_SIZE/chunks_a;
   count_b = BULK_SIZE/chunks_b;

   printf("JobA: chunksize %d chunks %d\n", chunks_a, count_a);
   printf("JobB: chunksize %d chunks %d\n", chunks_b, count_b);

   /* always request at least one slot so that a NULL result means failure */
   ids_a = malloc(sizeof(*ids_a) * (count_a > 0 ? count_a : 1));
   ids_b = malloc(sizeof(*ids_b) * (count_b > 0 ? count_b : 1));
   if (!ids_a || !ids_b) {
      fprintf(stderr, "malloc() failed\n");
      exit(1);
   }

   /* dup job A ids and verify user hold due to -h */
   for (j = 0; j < count_a; j++) {

      drmaa_errno = drmaa_get_next_job_id(jobids_a, jobid, sizeof(jobid)-1);
      if (drmaa_errno != DRMAA_ERRNO_SUCCESS) {
         /* this call takes no error buffer, so report the error code */
         fprintf(stderr, "drmaa_get_next_job_id failed: error %d\n", drmaa_errno);
         exit(1);
      }
      /* drmaa_job_ps(3) is expected to return DRMAA_PS_USER_ON_HOLD for
       * array tasks that were submitted with a user hold */
      drmaa_errno = drmaa_job_ps(jobid, &status, errorbuf, sizeof(errorbuf)-1);
      if (drmaa_errno != DRMAA_ERRNO_SUCCESS) {
         fprintf(stderr, "drmaa_job_ps failed: %s\n", errorbuf);
         exit(1);
      }

      if (status != DRMAA_PS_USER_ON_HOLD) {
         fprintf(stderr, "drmaa_job_ps failed: didn't return DRMAA_PS_USER_ON_HOLD but %d\n", status);
         exit(1);
      }
      ids_a[j] = strdup(jobid);
      if (!ids_a[j]) {
         fprintf(stderr, "strdup() failed\n");
         exit(1);
      }
   }

   /* dup job B ids and verify system hold due to -hold_jid_ad */
   for (j = 0; j < count_b; j++) {

      drmaa_errno = drmaa_get_next_job_id(jobids_b, jobid, sizeof(jobid)-1);
      if (drmaa_errno != DRMAA_ERRNO_SUCCESS) {
         fprintf(stderr, "drmaa_get_next_job_id failed: error %d\n", drmaa_errno);
         exit(1);
      }
      /* drmaa_job_ps(3) to return either DRMAA_PS_SYSTEM_ON_HOLD or DRMAA_PS_USER_SYSTEM_ON_HOLD for
       * array tasks that are in hold due to -hold_jid_ad wc_job_list */
      drmaa_errno = drmaa_job_ps(jobid, &status, errorbuf, sizeof(errorbuf)-1);
      if (drmaa_errno != DRMAA_ERRNO_SUCCESS) {
         fprintf(stderr, "drmaa_job_ps failed: %s\n", errorbuf);
         exit(1);
      }

      if (status != DRMAA_PS_SYSTEM_ON_HOLD &&
          status != DRMAA_PS_USER_SYSTEM_ON_HOLD) {
         fprintf(stderr, "drmaa_job_ps(%s) failed: didn't return DRMAA_PS_SYSTEM_ON_HOLD or DRMAA_PS_USER_SYSTEM_ON_HOLD but %d\n",
               jobid, status);
         exit(1);
      }

      ids_b[j] = strdup(jobid);
      if (!ids_b[j]) {
         fprintf(stderr, "strdup() failed\n");
         exit(1);
      }
   }

   state_monitor(ids_a, chunks_a, ids_b, chunks_b);
   /* release job A */
   for (j = 0; j < count_a; j++) {
      printf("drmaa_control(%s, DRMAA_CONTROL_RELEASE)\n", ids_a[j]);
      if (drmaa_control(ids_a[j], DRMAA_CONTROL_RELEASE, errorbuf, sizeof(errorbuf)-1) != DRMAA_ERRNO_SUCCESS) {
         fprintf(stderr, "drmaa_control failed: %s\n", errorbuf);
         exit(1);
      }
   }

   /* poll with a 1 second timeout and monitor the states after each timeout */
   while ((drmaa_errno = drmaa_synchronize(all_jobs, 1, 0, errorbuf, sizeof(errorbuf)-1))
             == DRMAA_ERRNO_EXIT_TIMEOUT) {
      state_monitor(ids_a, chunks_a, ids_b, chunks_b);
   }
   if (drmaa_errno != DRMAA_ERRNO_SUCCESS) {
      /* surface the error but keep the final monitoring pass and the cleanup */
      fprintf(stderr, "drmaa_synchronize failed: %s\n", errorbuf);
   }
   state_monitor(ids_a, chunks_a, ids_b, chunks_b);

   for (j = 0; j < count_a; j++) {
      sge_free(&(ids_a[j]));
   }
   sge_free(&ids_a);

   for (j = 0; j < count_b; j++) {
      sge_free(&(ids_b[j]));
   }
   sge_free(&ids_b);
}
예제 #3
0
 int launch (const char* script, const char* arg_vec[], int num_tasks) {
    char error[DRMAA_ERROR_STRING_BUFFER];
    int errnum = 0;
    drmaa_job_template_t *jt = NULL;
 
   errnum = drmaa_init (NULL, error, DRMAA_ERROR_STRING_BUFFER);
 
    if (errnum != DRMAA_ERRNO_SUCCESS) {
       fprintf (stderr, "Could not initialize the DRMAA library: %s\n", error);
       return 1;
    }
 
    errnum = drmaa_allocate_job_template (&jt, error, DRMAA_ERROR_STRING_BUFFER);
 
    if (errnum != DRMAA_ERRNO_SUCCESS) {
       fprintf (stderr, "Could not create job template: %s\n", error);
    }
    else {
       errnum = drmaa_set_attribute (jt, DRMAA_REMOTE_COMMAND, script,
                                     error, DRMAA_ERROR_STRING_BUFFER);
 
       if (errnum != DRMAA_ERRNO_SUCCESS) {
          fprintf (stderr, "Could not set attribute \"%s\": %s\n",
                   DRMAA_REMOTE_COMMAND, error);
       }
       else {
        
          
          errnum = drmaa_set_vector_attribute (jt, DRMAA_V_ARGV, arg_vec, error,
                                               DRMAA_ERROR_STRING_BUFFER);
       }
       
       if (errnum != DRMAA_ERRNO_SUCCESS) {
          fprintf (stderr, "Could not set attribute \"%s\": %s\n",
                   DRMAA_REMOTE_COMMAND, error);
       }else {
         drmaa_job_ids_t *ids = NULL;
 
          errnum = drmaa_run_bulk_jobs (&ids, jt, 1, num_tasks, 1, error, DRMAA_ERROR_STRING_BUFFER);
 
          if (errnum != DRMAA_ERRNO_SUCCESS) {
             fprintf (stderr, "Could not submit job: %s\n", error);
          }else {
             char jobid[DRMAA_JOBNAME_BUFFER];
		const char *jobids[2] = {DRMAA_JOB_IDS_SESSION_ALL, NULL};
             while (drmaa_get_next_job_id (ids, jobid, DRMAA_JOBNAME_BUFFER) == DRMAA_ERRNO_SUCCESS) {
                printf ("A job task has been submitted with id %s\n", jobid);
             }
          
 	     errnum = drmaa_synchronize (jobids, DRMAA_TIMEOUT_WAIT_FOREVER,
                                         1, error, DRMAA_ERROR_STRING_BUFFER);
             
             if (errnum != DRMAA_ERRNO_SUCCESS) {
                fprintf (stderr, "Could not wait for jobs: %s\n", error);
             }else {
                printf ("All job tasks have finished.\n");
             }
         } 
           drmaa_release_job_ids (ids);
       } /* else */
 
       errnum = drmaa_delete_job_template (jt, error, DRMAA_ERROR_STRING_BUFFER);
 
       if (errnum != DRMAA_ERRNO_SUCCESS) {
          fprintf (stderr, "Could not delete job template: %s\n", error);
       }
    } /* else */
 
    errnum = drmaa_exit (error, DRMAA_ERROR_STRING_BUFFER);
 
    if (errnum != DRMAA_ERRNO_SUCCESS) {
       fprintf (stderr, "Could not shut down the DRMAA library: %s\n", error);
      return 1;
   }

    return 0;
 }