Пример #1
0
/*
 * Add the state of every job in ids[0..size-1] to the caller's counters.
 *
 * Each entry contributes `chunks` to exactly one of *hold, *pending,
 * *running or *done; states not listed below leave all counters untouched.
 * A NULL entry is counted as done. A drmaa_job_ps() error other than a
 * communication failure is reported on stderr and ends the process.
 */
static void ids_count_status(char **ids, int size, int chunks, int *hold, int *pending, int *running, int *done)
{
   int idx;

   for (idx = 0; idx < size; idx++) {
      int state;
      int rc;

      /* drmaa_job_ps() won't work for already reaped jobs */
      if (ids[idx] == NULL) {
         *done += chunks;
         continue;
      }

      /* keep asking once per second while the call reports a communication failure */
      for (;;) {
         rc = drmaa_job_ps(ids[idx], &state, errorbuf, sizeof(errorbuf)-1);
         if (rc != DRMAA_ERRNO_DRM_COMMUNICATION_FAILURE) {
            break;
         }
         sleep(1);
      }

      if (rc != DRMAA_ERRNO_SUCCESS) {
         fprintf(stderr, "drmaa_job_ps failed: %s\n", errorbuf);
         exit(1);
      }

      if (state == DRMAA_PS_SYSTEM_ON_HOLD ||
          state == DRMAA_PS_USER_ON_HOLD ||
          state == DRMAA_PS_USER_SYSTEM_ON_HOLD) {
         *hold += chunks;
      } else if (state == DRMAA_PS_QUEUED_ACTIVE) {
         *pending += chunks;
      } else if (state == DRMAA_PS_RUNNING) {
         *running += chunks;
      } else if (state == DRMAA_PS_DONE || state == DRMAA_PS_UNDETERMINED) {
         *done += chunks;
      }
   }
}
Пример #2
0
/*
 * Print the current state of job `id` to stdout.
 *
 * If drmaa_job_ps() does not succeed, its diagnosis text is written to
 * stderr and the process exits with status -1.
 */
void print_job_status(char *id)
{
    char diagnosis[DRMAA_ERROR_STRING_BUFFER];
    int  state;

    /* -------- Check job state ------------ */
    if (drmaa_job_ps(id, &state, diagnosis, DRMAA_ERROR_STRING_BUFFER-1) != DRMAA_ERRNO_SUCCESS)
    {
        fprintf(stderr,"drmaa_job_ps() failed: %s\n", diagnosis);
        exit(-1);
    }

    /*
     * drmaa_gw_strstatus is not a DRMAA 1.0 function;
     * it is only provided by Gridway.
     */
    fprintf(stdout,"Job state is: %s\n",drmaa_gw_strstatus(state));
}
Пример #3
0
/*
 * Check the initial hold state of two bulk jobs, release job A and monitor
 * both jobs until drmaa_synchronize() stops timing out.
 *
 * jobids_a, chunks_a: job id list and chunk size of job A. Every id read
 *    from the list must report DRMAA_PS_USER_ON_HOLD.
 * jobids_b, chunks_b: job id list and chunk size of job B. Every id read
 *    from the list must report DRMAA_PS_SYSTEM_ON_HOLD or
 *    DRMAA_PS_USER_SYSTEM_ON_HOLD.
 *
 * BULK_SIZE/chunks_x ids are read from each list. A non-positive chunk
 * size, an allocation failure, a DRMAA error or an unexpected job state
 * prints a diagnostic to stderr and ends the process with exit(1).
 */
static void
validate_jobs(drmaa_job_ids_t *jobids_a, int chunks_a, drmaa_job_ids_t *jobids_b, int chunks_b)
{
   int j, status, drmaa_errno;
   int count_a, count_b;
   char jobid[512];
   /* the job id list handed to drmaa_synchronize() has to be NULL-terminated */
   const char *all_jobs[] = { "DRMAA_JOB_IDS_SESSION_ALL", NULL };
   char **ids_a, **ids_b;

   /* the chunk sizes are used as divisors below */
   if (chunks_a <= 0 || chunks_b <= 0) {
      fprintf(stderr, "invalid chunk size: %d / %d\n", chunks_a, chunks_b);
      exit(1);
   }
   count_a = BULK_SIZE/chunks_a;
   count_b = BULK_SIZE/chunks_b;

   printf("JobA: chunksize %d chunks %d\n", chunks_a, count_a);
   printf("JobB: chunksize %d chunks %d\n", chunks_b, count_b);

   /* always request at least one slot so an empty list is not mistaken for
    * an allocation failure */
   ids_a = malloc(sizeof *ids_a * (size_t)(count_a > 0 ? count_a : 1));
   ids_b = malloc(sizeof *ids_b * (size_t)(count_b > 0 ? count_b : 1));
   if (!ids_a || !ids_b) {
      fprintf(stderr, "malloc() failed\n");
      exit(1);
   }

   /* dup job A ids and verify user hold due to -h */
   for (j = 0; j < count_a; j++) {

      drmaa_errno = drmaa_get_next_job_id(jobids_a, jobid, sizeof(jobid)-1);
      if (drmaa_errno != DRMAA_ERRNO_SUCCESS) {
         /* this call does not fill errorbuf, so report the error code */
         fprintf(stderr, "drmaa_get_next_job_id failed: error code %d\n", drmaa_errno);
         exit(1);
      }

      drmaa_errno = drmaa_job_ps(jobid, &status, errorbuf, sizeof(errorbuf)-1);
      if (drmaa_errno != DRMAA_ERRNO_SUCCESS) {
         fprintf(stderr, "drmaa_job_ps failed: %s\n", errorbuf);
         exit(1);
      }

      if (status != DRMAA_PS_USER_ON_HOLD) {
         fprintf(stderr, "drmaa_job_ps failed: didn't return DRMAA_PS_USER_ON_HOLD but %d\n", status);
         exit(1);
      }

      ids_a[j] = strdup(jobid);
      if (!ids_a[j]) {
         fprintf(stderr, "strdup() failed\n");
         exit(1);
      }
   }

   /* dup job B ids and verify system hold due to -hold_jid_ad */
   for (j = 0; j < count_b; j++) {

      drmaa_errno = drmaa_get_next_job_id(jobids_b, jobid, sizeof(jobid)-1);
      if (drmaa_errno != DRMAA_ERRNO_SUCCESS) {
         /* this call does not fill errorbuf, so report the error code */
         fprintf(stderr, "drmaa_get_next_job_id failed: error code %d\n", drmaa_errno);
         exit(1);
      }
      /* drmaa_job_ps(3) to return either DRMAA_PS_SYSTEM_ON_HOLD or DRMAA_PS_USER_SYSTEM_ON_HOLD for
       * array tasks that are in hold due to -hold_jid_ad wc_job_list */
      drmaa_errno = drmaa_job_ps(jobid, &status, errorbuf, sizeof(errorbuf)-1);
      if (drmaa_errno != DRMAA_ERRNO_SUCCESS) {
         fprintf(stderr, "drmaa_job_ps failed: %s\n", errorbuf);
         exit(1);
      }

      if (status != DRMAA_PS_SYSTEM_ON_HOLD &&
          status != DRMAA_PS_USER_SYSTEM_ON_HOLD) {
         fprintf(stderr, "drmaa_job_ps(%s) failed: didn't return DRMAA_PS_SYSTEM_ON_HOLD or DRMAA_PS_USER_SYSTEM_ON_HOLD but %d\n",
               jobid, status);
         exit(1);
      }

      ids_b[j] = strdup(jobid);
      if (!ids_b[j]) {
         fprintf(stderr, "strdup() failed\n");
         exit(1);
      }
   }

   state_monitor(ids_a, chunks_a, ids_b, chunks_b);

   /* release job A */
   for (j = 0; j < count_a; j++) {
      printf("drmaa_control(%s, DRMAA_CONTROL_RELEASE)\n", ids_a[j]);
      if (drmaa_control(ids_a[j], DRMAA_CONTROL_RELEASE, errorbuf, sizeof(errorbuf)-1) != DRMAA_ERRNO_SUCCESS) {
         fprintf(stderr, "drmaa_control failed: %s\n", errorbuf);
         exit(1);
      }
   }

   /* print a state snapshot after every synchronize timeout */
   while ((drmaa_errno = drmaa_synchronize(all_jobs, 1, 0, errorbuf, sizeof(errorbuf)-1)) == DRMAA_ERRNO_EXIT_TIMEOUT) {
      state_monitor(ids_a, chunks_a, ids_b, chunks_b);
   }
   if (drmaa_errno != DRMAA_ERRNO_SUCCESS) {
      /* the loop also ends on errors; report them but still print the final state */
      fprintf(stderr, "drmaa_synchronize failed: %s\n", errorbuf);
   }
   state_monitor(ids_a, chunks_a, ids_b, chunks_b);

   for (j = 0; j < count_a; j++) {
      sge_free(&(ids_a[j]));
   }
   sge_free(&ids_a);

   for (j = 0; j < count_b; j++) {
      sge_free(&(ids_b[j]));
   }
   sge_free(&ids_b);
}
Пример #4
0
/*
 * DRMAA example: submit "sleeper.sh 60", sleep for 20 seconds and then
 * print the job state reported by drmaa_job_ps().
 *
 * Returns 1 when the DRMAA library cannot be initialized or shut down and
 * 0 otherwise; every failure in between is only reported on stderr.
 */
int main (int argc, char **argv) {
   char error[DRMAA_ERROR_STRING_BUFFER];
   int errnum = 0;
   drmaa_job_template_t *jt = NULL;

   errnum = drmaa_init (NULL, error, DRMAA_ERROR_STRING_BUFFER);

   if (errnum != DRMAA_ERRNO_SUCCESS) {
      fprintf (stderr, "Could not initialize the DRMAA library: %s\n", error);
      return 1;
   }

   errnum = drmaa_allocate_job_template (&jt, error, DRMAA_ERROR_STRING_BUFFER);

   if (errnum != DRMAA_ERRNO_SUCCESS) {
      fprintf (stderr, "Could not create job template: %s\n", error);
   }
   else {
      errnum = drmaa_set_attribute (jt, DRMAA_REMOTE_COMMAND, "sleeper.sh",
                                    error, DRMAA_ERROR_STRING_BUFFER);

      if (errnum != DRMAA_ERRNO_SUCCESS) {
         fprintf (stderr, "Could not set attribute \"%s\": %s\n",
                  DRMAA_REMOTE_COMMAND, error);
      }
      else {
         const char *args[2] = {"60", NULL};

         errnum = drmaa_set_vector_attribute (jt, DRMAA_V_ARGV, args, error,
                                              DRMAA_ERROR_STRING_BUFFER);

         /* report this failure here, under the attribute that actually
          * failed, so each error is printed exactly once */
         if (errnum != DRMAA_ERRNO_SUCCESS) {
            fprintf (stderr, "Could not set attribute \"%s\": %s\n",
                     DRMAA_V_ARGV, error);
         }
      }

      /* submit only when both attributes were set */
      if (errnum == DRMAA_ERRNO_SUCCESS) {
         char jobid[DRMAA_JOBNAME_BUFFER];

         errnum = drmaa_run_job (jobid, DRMAA_JOBNAME_BUFFER, jt, error,
                                 DRMAA_ERROR_STRING_BUFFER);

         if (errnum != DRMAA_ERRNO_SUCCESS) {
            fprintf (stderr, "Could not submit job: %s\n", error);
         }
         else {
            int status = 0;

            printf ("Your job has been submitted with id %s\n", jobid);

            sleep (20);

            errnum = drmaa_job_ps (jobid, &status, error,
                                   DRMAA_ERROR_STRING_BUFFER);

            if (errnum != DRMAA_ERRNO_SUCCESS) {
               fprintf (stderr, "Could not get job' status: %s\n", error);
            }
            else {
               switch (status) {
                  case DRMAA_PS_UNDETERMINED:
                     printf ("Job status cannot be determined\n");
                     break;
                  case DRMAA_PS_QUEUED_ACTIVE:
                     printf ("Job is queued and active\n");
                     break;
                  case DRMAA_PS_SYSTEM_ON_HOLD:
                     printf ("Job is queued and in system hold\n");
                     break;
                  case DRMAA_PS_USER_ON_HOLD:
                     printf ("Job is queued and in user hold\n");
                     break;
                  case DRMAA_PS_USER_SYSTEM_ON_HOLD:
                     printf ("Job is queued and in user and system hold\n");
                     break;
                  case DRMAA_PS_RUNNING:
                     printf ("Job is running\n");
                     break;
                  case DRMAA_PS_SYSTEM_SUSPENDED:
                     printf ("Job is system suspended\n");
                     break;
                  case DRMAA_PS_USER_SUSPENDED:
                     printf ("Job is user suspended\n");
                     break;
                  case DRMAA_PS_USER_SYSTEM_SUSPENDED:
                     printf ("Job is user and system suspended\n");
                     break;
                  case DRMAA_PS_DONE:
                     printf ("Job finished normally\n");
                     break;
                  case DRMAA_PS_FAILED:
                     printf ("Job finished, but failed\n");
                     break;
                  default:
                     /* a value outside the states handled above */
                     printf ("Job status is unknown (%d)\n", status);
                     break;
               } /* switch */
            } /* else */
         } /* else */
      } /* if */

      /* the template is released on every path that allocated it */
      errnum = drmaa_delete_job_template (jt, error, DRMAA_ERROR_STRING_BUFFER);

      if (errnum != DRMAA_ERRNO_SUCCESS) {
         fprintf (stderr, "Could not delete job template: %s\n", error);
      }
   } /* else */

   errnum = drmaa_exit (error, DRMAA_ERROR_STRING_BUFFER);

   if (errnum != DRMAA_ERRNO_SUCCESS) {
      fprintf (stderr, "Could not shut down the DRMAA library: %s\n", error);
      return 1;
   }

   return 0;
}
Пример #5
0
/*
 * Thread body: submit one job and follow it through its queued and
 * running phases, printing a timestamped line for every transition.
 *
 * arg points to two ints, the run number and the thread number. The
 * function reads both and then frees arg itself.
 *
 * Every DRMAA return code is passed to handle_code(); whenever that
 * returns 1 the thread stops. The job template is deleted on those early
 * exits as well as after a successful submission.
 *
 * Always returns NULL.
 */
void *run(void *arg) {
    int ret = DRMAA_ERRNO_SUCCESS;
    char error[DRMAA_ERROR_STRING_BUFFER + 1];

    drmaa_job_template_t *jt = NULL;
    int run = ((int *)arg)[0];
    int thread = ((int *)arg)[1];
    char jobid[DRMAA_JOBNAME_BUFFER + 1];
    int queued = 1;
    int running = 0;
    int status = -1;

    free(arg);

    ret = drmaa_allocate_job_template(&jt, error, DRMAA_ERROR_STRING_BUFFER);
    if (handle_code(ret, error, run, thread) == 1) {
        return NULL;
    }

    ret = drmaa_set_attribute(jt, DRMAA_REMOTE_COMMAND, CMD, error,
                              DRMAA_ERROR_STRING_BUFFER);
    if (handle_code(ret, error, run, thread) == 1) {
        goto discard_template;
    }

    ret = drmaa_set_attribute(jt, DRMAA_WD, WD, error,
                              DRMAA_ERROR_STRING_BUFFER);
    if (handle_code(ret, error, run, thread) == 1) {
        goto discard_template;
    }

    ret = drmaa_set_attribute(jt, DRMAA_JOB_CATEGORY, CATEGORY, error,
                              DRMAA_ERROR_STRING_BUFFER);
    if (handle_code(ret, error, run, thread) == 1) {
        goto discard_template;
    }

    /* time_t is cast to long so the argument matches %ld */
    printf("%d %d SETUP complete %ld\n", run, thread, (long)time(NULL));

    ret = drmaa_run_job(jobid, DRMAA_JOBNAME_BUFFER, jt, error,
                        DRMAA_ERROR_STRING_BUFFER);
    if (handle_code(ret, error, run, thread) == 1) {
        goto discard_template;
    }

    printf("%d %d SUBMITTED jobid: %s %ld\n", run, thread, jobid,
           (long)time(NULL));

    ret = drmaa_delete_job_template(jt, error, DRMAA_ERROR_STRING_BUFFER);
    handle_code(ret, error, run, thread);

    /* Phase 1: wait in 2 second slices until the job leaves the queued and hold states. */
    while (queued) {
        ret = drmaa_wait(jobid, NULL, 0, NULL, 2, NULL, error,
                         DRMAA_ERROR_STRING_BUFFER);

        if (ret != DRMAA_ERRNO_EXIT_TIMEOUT) {
            if (handle_code(ret, error, run, thread) == 1) {
                return NULL;
            }
        }
        else {
            printf("%d %d TIMEOUT jobid: %s %ld\n", run, thread, jobid,
                   (long)time(NULL));
        }

        ret = drmaa_job_ps(jobid, &status, error, DRMAA_ERROR_STRING_BUFFER);
        if (handle_code(ret, error, run, thread) == 1) {
            return NULL;
        }

        queued = (status == DRMAA_PS_QUEUED_ACTIVE) ||
                 (status == DRMAA_PS_SYSTEM_ON_HOLD) ||
                 (status == DRMAA_PS_USER_ON_HOLD) ||
                 (status == DRMAA_PS_USER_SYSTEM_ON_HOLD);
    }

    printf("%d %d RUNNING jobid: %s %ld\n", run, thread, jobid,
           (long)time(NULL));

    running = 1;

    /* Phase 2: wait in 60 second slices until the job ends or stops reporting RUNNING. */
    while (running == 1) {
        ret = drmaa_wait(jobid, NULL, 0, NULL, 60, NULL, error,
                         DRMAA_ERROR_STRING_BUFFER);

        if (ret != DRMAA_ERRNO_EXIT_TIMEOUT) {
            if (handle_code(ret, error, run, thread) == 1) {
                return NULL;
            }

            running = 0;

            printf("%d %d FINISHED jobid: %s %ld\n", run, thread, jobid,
                   (long)time(NULL));
        }
        else {
            printf("%d %d TIMEOUT jobid: %s %ld\n", run, thread, jobid,
                   (long)time(NULL));

            ret = drmaa_job_ps(jobid, &status, error, DRMAA_ERROR_STRING_BUFFER);

            if (handle_code(ret, error, run, thread) == 1) {
                return NULL;
            }

            /* a timeout while the job is in any state other than RUNNING is reported as hung */
            if (status != DRMAA_PS_RUNNING) {
                running = 0;

                printf("%d %d HUNG jobid: %s %ld\n", run, thread, jobid,
                       (long)time(NULL));
            }
        }
    }

    return NULL;

discard_template:
    /* Release the template on failures before submission; the original
     * error has already been passed to handle_code(), so this result is
     * deliberately ignored. */
    (void)drmaa_delete_job_template(jt, error, DRMAA_ERROR_STRING_BUFFER);
    return NULL;
}