Пример #1
0
/*
 * Submit NBULKS bulk jobs (JOB_CHUNK tasks each) followed by JOB_CHUNK
 * sequential jobs, synchronize with all of them and finally reap every job
 * with drmaa_wait(), printing how each one finished.
 *
 * argv[1] is the path of the job to submit.
 *
 * Returns 0 on success, 1 on a usage error or on any DRMAA failure. Once the
 * session has been opened every exit path releases the job template, the
 * duplicated job ids and the session itself.
 */
int main(int argc, char *argv[])
{
   /* zero-filled so the buffer is always terminated: calls get sizeof-1 */
   char diagnosis[DRMAA_ERROR_STRING_BUFFER] = "";
   const char *all_jobids[NBULKS*JOB_CHUNK + JOB_CHUNK+1];
   char jobid[100];
   int drmaa_errno, i, pos = 0;   /* pos counts the valid all_jobids entries */
   int ret = 1;
   const char *job_path;
   drmaa_job_template_t *jt = NULL;

   if (argc<2) {
      fprintf(stderr, "usage: example <path-to-job>\n");
      return 1;
   }
   job_path = argv[1];

   if (drmaa_init(NULL, diagnosis, sizeof(diagnosis)-1) != DRMAA_ERRNO_SUCCESS) {
      fprintf(stderr, "drmaa_init() failed: %s\n", diagnosis);
      return 1;
   }

   /*
    *   submit some bulk jobs
    */
   if (!(jt = create_job_template(job_path, 5, 1))) {
      fprintf(stderr, "create_job_template() failed\n");
      goto cleanup;
   }
   for (i=0; i<NBULKS; i++) {
      drmaa_job_ids_t *jobids = NULL;
      int j;

      /* communication failures are considered transient: retry forever */
      while ((drmaa_errno=drmaa_run_bulk_jobs(&jobids, jt, 1, JOB_CHUNK, 1, diagnosis,
                 sizeof(diagnosis)-1))==DRMAA_ERRNO_DRM_COMMUNICATION_FAILURE) {
         fprintf(stderr, "drmaa_run_bulk_jobs() failed - retry: %s %s\n", diagnosis, drmaa_strerror(drmaa_errno));
         sleep(1);
      }
      if (drmaa_errno != DRMAA_ERRNO_SUCCESS) {
         fprintf(stderr, "drmaa_run_bulk_jobs() failed: %s %s\n", diagnosis, drmaa_strerror(drmaa_errno));
         goto cleanup;
      }

      printf("submitted bulk job with jobids:\n");
      for (j=0; j<JOB_CHUNK; j++) {
         char *copy;

         /* without this check a failure would make us record a stale id */
         if (drmaa_get_next_job_id(jobids, jobid, sizeof(jobid)-1) != DRMAA_ERRNO_SUCCESS) {
            fprintf(stderr, "drmaa_get_next_job_id() failed\n");
            drmaa_release_job_ids(jobids);
            goto cleanup;
         }
         if (!(copy = strdup(jobid))) {
            fprintf(stderr, "strdup() failed\n");
            drmaa_release_job_ids(jobids);
            goto cleanup;
         }
         all_jobids[pos++] = copy;
         printf("\t \"%s\"\n", jobid);
      }
      drmaa_release_job_ids(jobids);
   }
   drmaa_delete_job_template(jt, NULL, 0);
   jt = NULL;

   /*
    *   submit some sequential jobs
    */
   if (!(jt = create_job_template(job_path, 5, 0))) {
      fprintf(stderr, "create_job_template() failed\n");
      goto cleanup;
   }
   for (i=0; i<JOB_CHUNK; i++) {
      char *copy;

      while ((drmaa_errno=drmaa_run_job(jobid, sizeof(jobid)-1, jt, diagnosis,
               sizeof(diagnosis)-1)) == DRMAA_ERRNO_DRM_COMMUNICATION_FAILURE) {
         fprintf(stderr, "drmaa_run_job() failed - retry: %s\n", diagnosis);
         sleep(1);
      }
      if (drmaa_errno != DRMAA_ERRNO_SUCCESS) {
         fprintf(stderr, "drmaa_run_job() failed: %s\n", diagnosis);
         goto cleanup;
      }
      printf("\t \"%s\"\n", jobid);
      if (!(copy = strdup(jobid))) {
         fprintf(stderr, "strdup() failed\n");
         goto cleanup;
      }
      all_jobids[pos++] = copy;
   }

   /* set string array end mark */
   all_jobids[pos] = NULL;

   drmaa_delete_job_template(jt, NULL, 0);
   jt = NULL;

   /*
    *   synchronize with all jobs; dispose is 0 so that the jobs can still
    *   be reaped one by one below
    */
   drmaa_errno = drmaa_synchronize(all_jobids, DRMAA_TIMEOUT_WAIT_FOREVER, 0, diagnosis, sizeof(diagnosis)-1);
   if (drmaa_errno != DRMAA_ERRNO_SUCCESS) {
      fprintf(stderr, "drmaa_synchronize(all_jobids, no dispose) failed: %s\n", diagnosis);
      goto cleanup;
   }
   printf("synchronized with all jobs\n");

   /*
    *   wait all those jobs
    */
   for (i=0; i<pos; i++) {
      int stat;
      int aborted = 0, exited = 0, exit_status = 0, signaled = 0;

      drmaa_errno = drmaa_wait(all_jobids[i], jobid, sizeof(jobid)-1,
         &stat, DRMAA_TIMEOUT_WAIT_FOREVER, NULL, diagnosis, sizeof(diagnosis)-1);

      if (drmaa_errno != DRMAA_ERRNO_SUCCESS) {
         fprintf(stderr, "drmaa_wait(%s) failed: %s\n", all_jobids[i], diagnosis);
         goto cleanup;
      }

      /*
       * report how job finished
       */
      drmaa_wifaborted(&aborted, stat, NULL, 0);
      if (aborted)
         printf("job \"%s\" never ran\n", all_jobids[i]);
      else {
         drmaa_wifexited(&exited, stat, NULL, 0);
         if (exited) {
            drmaa_wexitstatus(&exit_status, stat, NULL, 0);
            printf("job \"%s\" finished regularly with exit status %d\n",
                  all_jobids[i], exit_status);
         } else {
            drmaa_wifsignaled(&signaled, stat, NULL, 0);
            if (signaled) {
               char termsig[DRMAA_SIGNAL_BUFFER+1];
               drmaa_wtermsig(termsig, DRMAA_SIGNAL_BUFFER, stat, NULL, 0);
               printf("job \"%s\" finished due to signal %s\n",
                  all_jobids[i], termsig);
            } else
               printf("job \"%s\" finished with unclear conditions\n",
                  all_jobids[i]);
         }
      }
   }

   ret = 0;

cleanup:
   if (jt)
      drmaa_delete_job_template(jt, NULL, 0);

   /* the array holds strdup() copies; the const is only for the DRMAA API */
   for (i=0; i<pos; i++)
      free((void *)all_jobids[i]);

   if (drmaa_exit(diagnosis, sizeof(diagnosis)-1) != DRMAA_ERRNO_SUCCESS) {
      fprintf(stderr, "drmaa_exit() failed: %s\n", diagnosis);
      ret = 1;
   }

   return ret;
}
Пример #2
0
/*
 * Reap every job of the session that has already finished (drmaa_wait() is
 * called with a timeout of 0) and then print one line made of one status
 * character per job: 'd' (done), 'r' (running), 'p' (pending), 'h' (hold).
 *
 * ids_a/ids_b are the two job id tables being tracked; each table has
 * BULK_SIZE/chunks_x slots and every slot stands for chunks_x jobs.
 *
 * If the global do_exit is set, each reaped job must have exited normally
 * with status 0, otherwise the process terminates with exit(1). Any
 * unexpected drmaa_wait() error also terminates the process.
 */
static void state_monitor(char **ids_a, int chunks_a, char **ids_b, int chunks_b)
{
   int slot, wait_stat, rc;
   int n_hold = 0, n_pending = 0, n_running = 0, n_done = 0;

   /* wait jobs as to verify they run orderly */
   for (;;) {
      char reaped[512];
      int has_exited, exit_code;

      rc = drmaa_wait(DRMAA_JOB_IDS_SESSION_ANY, reaped, sizeof(reaped)-1, &wait_stat, 0, NULL, errorbuf, sizeof(errorbuf)-1);

      /* INVALID_JOB: nothing more to wait for; EXIT_TIMEOUT: none finished yet */
      if (rc == DRMAA_ERRNO_INVALID_JOB || rc == DRMAA_ERRNO_EXIT_TIMEOUT)
         break;

      if (rc != DRMAA_ERRNO_SUCCESS) {
         fprintf(stderr, "drmaa_wait() failed: %s\n", errorbuf);
         exit(1);
      }

      /* mark the job as done and reaped; table b is tried only if a misses */
      if (!ids_remove_id(ids_a, BULK_SIZE/chunks_a, reaped))
         ids_remove_id(ids_b, BULK_SIZE/chunks_b, reaped);

      if (!do_exit)
         continue;

      if (drmaa_wifexited(&has_exited, wait_stat, NULL, 0) != DRMAA_ERRNO_SUCCESS) {
         fprintf(stderr, "drmaa_wifexited(\"%s\") failed\n", reaped);
         exit(1);
      }
      if (!has_exited) {
         fprintf(stderr, "job \"%s\" didn't exit orderly\n", reaped);
         exit(1);
      }
      if (drmaa_wexitstatus(&exit_code, wait_stat, NULL, 0) != DRMAA_ERRNO_SUCCESS) {
         fprintf(stderr, "drmaa_wexitstatus(\"%s\") failed\n", reaped);
         exit(1);
      }
      if (exit_code != 0) {
         fprintf(stderr, "job \"%s\" exit with %d\n", reaped, exit_code);
         exit(1);
      }
   }

   if (do_ps) {
      ids_count_status(ids_a, BULK_SIZE/chunks_a, chunks_a, &n_hold, &n_pending, &n_running, &n_done);
      ids_count_status(ids_b, BULK_SIZE/chunks_b, chunks_b, &n_hold, &n_pending, &n_running, &n_done);
   } else {
      /* without status queries an emptied slot means its jobs are done */
      for (slot = 0; slot < BULK_SIZE/chunks_a; slot++) {
         if (ids_a[slot] == NULL)
            n_done += chunks_a;
      }
      for (slot = 0; slot < BULK_SIZE/chunks_b; slot++) {
         if (ids_b[slot] == NULL)
            n_done += chunks_b;
      }
      n_pending = 2* BULK_SIZE - n_done;
   }

   /* the counters are not needed afterwards, so they are counted down */
   while (n_done-- > 0)
      putchar('d');
   while (n_running-- > 0)
      putchar('r');
   while (n_pending-- > 0)
      putchar('p');
   while (n_hold-- > 0)
      putchar('h');
   putchar('\n');
}
Пример #3
0
void *run(void *arg) {
    int ret = DRMAA_ERRNO_SUCCESS;
    char error[DRMAA_ERROR_STRING_BUFFER + 1];
    
    drmaa_job_template_t *jt = NULL;
    int run = ((int *)arg)[0];
    int thread = ((int *)arg)[1];
    char jobid[DRMAA_JOBNAME_BUFFER + 1];
    int queued = 1;
    int running = 0;
    int status = -1;
    
    free(arg);
    
    ret = drmaa_allocate_job_template(&jt, error, DRMAA_ERROR_STRING_BUFFER);
    if (handle_code(ret, error, run, thread) == 1) {
        return NULL;
    }
    
    ret = drmaa_set_attribute(jt, DRMAA_REMOTE_COMMAND, CMD, error,
    DRMAA_ERROR_STRING_BUFFER);
    if (handle_code(ret, error, run, thread) == 1) {
        return NULL;
    }
    
    ret = drmaa_set_attribute(jt, DRMAA_WD, WD, error,
    DRMAA_ERROR_STRING_BUFFER);
    if (handle_code(ret, error, run, thread) == 1) {
        return NULL;
    }
    
    ret = drmaa_set_attribute(jt, DRMAA_JOB_CATEGORY, CATEGORY, error,
    DRMAA_ERROR_STRING_BUFFER);
    if (handle_code(ret, error, run, thread) == 1) {
        return NULL;
    }
    
    printf("%d %d SETUP complete %ld\n", run, thread, time(NULL));
    
    ret = drmaa_run_job(jobid, DRMAA_JOBNAME_BUFFER, jt, error,
    DRMAA_ERROR_STRING_BUFFER);
    if (handle_code(ret, error, run, thread) == 1) {
        return NULL;
    }
    
    printf("%d %d SUBMITTED jobid: %s %ld\n", run, thread, jobid, time(NULL));
    
    ret = drmaa_delete_job_template(jt, error, DRMAA_ERROR_STRING_BUFFER);
    handle_code(ret, error, run, thread);
    
    while (queued) {
        ret = drmaa_wait(jobid, NULL, 0, NULL, 2, NULL, error,
        DRMAA_ERROR_STRING_BUFFER);
        
        if (ret != DRMAA_ERRNO_EXIT_TIMEOUT) {
            if (handle_code(ret, error, run, thread) == 1) {
                return NULL;
            }
        }
        else {
            printf ("%d %d TIMEOUT jobid: %s %ld\n", run, thread, jobid, time (NULL));
        }
        
        ret = drmaa_job_ps(jobid, &status, error, DRMAA_ERROR_STRING_BUFFER);
        if (handle_code(ret, error, run, thread) == 1) {
            return NULL;
        }
        
        queued = (status == DRMAA_PS_QUEUED_ACTIVE) ||
        (status == DRMAA_PS_SYSTEM_ON_HOLD) ||
        (status == DRMAA_PS_SYSTEM_ON_HOLD) ||
        (status == DRMAA_PS_USER_ON_HOLD) ||
        (status == DRMAA_PS_USER_SYSTEM_ON_HOLD);
    }
    
    printf("%d %d RUNNING jobid: %s %ld\n", run, thread, jobid, time(NULL));
    
    running = 1;
    
    while (running == 1) {
        ret = drmaa_wait(jobid, NULL, 0, NULL, 60, NULL, error,
                         DRMAA_ERROR_STRING_BUFFER);

        if (ret != DRMAA_ERRNO_EXIT_TIMEOUT) {
            if (handle_code(ret, error, run, thread) == 1) {
                return NULL;
            }
            
            running = 0;
            
            printf("%d %d FINISHED jobid: %s %ld\n", run, thread, jobid,
            time(NULL));
        }
        else {
            printf ("%d %d TIMEOUT jobid: %s %ld\n", run, thread, jobid, time (NULL));

            ret = drmaa_job_ps(jobid, &status, error, DRMAA_ERROR_STRING_BUFFER);
            
            if (handle_code(ret, error, run, thread) == 1) {
                return NULL;
            }
            
            if (status != DRMAA_PS_RUNNING) {
                running = 0;
                
                printf("%d %d HUNG jobid: %s %ld\n", run, thread, jobid, time(NULL));
            }
        }
    }
    
    return NULL;
}
/*
 * Job submission driver.
 *
 * Options (parsed until the first non-option argument, which is taken as the
 * job path; everything after it is handed to the job as arguments):
 *    -help | -h              print usage and exit with 0
 *    -jobs <n>               sets njobs
 *    -native <spec>          sets native_spec
 *    -threads <n>            sets nthreads (must be at least 1)
 *    -quiet yes|y|no|n       sets quiet
 *    -wait yes|y|no|n        sets dowait
 *    -scenario <a>.<b>       <a> in queue|type|number|pe, <b> in
 *                            hostgroup|resource|none|softresource|softhostgroup
 *
 * Without -scenario the jobs are submitted via submit_jobs(), either directly
 * or from nthreads threads; with -scenario they are submitted per project via
 * submit_by_project(). If dowait is set, njobs*nthreads jobs are reaped
 * afterwards and their outcome is reported.
 *
 * Returns 0 on success and 1 on usage errors, DRMAA failures or when a
 * waited-for job did not finish with exit status 0.
 */
int main(int argc, char *argv[])
{
   /* zero-filled so the buffer is always terminated: calls get sizeof-1 */
   char diagnosis[DRMAA_ERROR_STRING_BUFFER] = "";
   char *s, jobid[100];
   int drmaa_errno, i;
   int ret = 0;
   struct timeval start_s, finish_s, wait_s;

   if (argc<2) {
      usage();
      return 1;
   }

   i = 1;
   do {
      if (!strcmp("-help", argv[i]) ||
          !strcmp("-h", argv[i])) {
         usage();
         return 0;

      } else if (!strcmp("-jobs", argv[i])) {
         i++;
         if (argc < i+1) {
            usage();
            return 1;
         }
         njobs = atoi(argv[i]);
         i++;

      } else if (!strcmp("-native", argv[i])) {
         i++;
         if (argc < i+1) {
            usage();
            return 1;
         }
         native_spec = argv[i];
         i++;

      } else if (!strcmp("-threads", argv[i])) {
         /* step to the value first, as the other options do; checking
            before the increment let atoi() see argv[argc] == NULL */
         i++;
         if (argc < i+1) {
            usage();
            return 1;
         }
         nthreads = atoi(argv[i]);
         if (nthreads < 1) {
            usage();
            return 1;
         }
         i++;

      } else if (!strcmp("-quiet", argv[i])) {
         i++;
         if (argc < i+1) {
            usage();
            return 1;
         }
         if (!strcmp("yes", argv[i]) || !strcmp("y", argv[i]))
            quiet = 1;
         else if (!strcmp("no", argv[i]) || !strcmp("n", argv[i]))
            quiet = 0;
         else {
            usage();
            return 1;
         }
         i++;

      } else if (!strcmp("-wait", argv[i])) {
         i++;
         if (argc < i+1) {
            usage();
            return 1;
         }
         if (!strcmp("yes", argv[i]) || !strcmp("y", argv[i]))
            dowait = 1;
         else if (!strcmp("no", argv[i]) || !strcmp("n", argv[i]))
            dowait = 0;
         else {
            usage();
            return 1;
         }
         i++;

      } else if (!strcmp("-scenario", argv[i])) {
         i++;
         if (argc < i+1) {
            usage();
            return 1;
         }
         /* the argument has the form <scenario>.<site_b>; split at the dot */
         s = strchr(argv[i], '.');
         if (!s) {
            usage();
            return 1;
         }
         *s = '\0';
         if (strcmp("queue", argv[i]) && strcmp("type", argv[i]) &&
             strcmp("number", argv[i]) && strcmp("pe", argv[i])) {
            usage();
            return 1;
         }
         scenario = strdup(argv[i]);

         s++;
         if (strcmp("hostgroup", s) && strcmp("resource", s) &&
             strcmp("none", s) && strcmp("softresource", s) &&
             strcmp("softhostgroup", s)) {
            usage();
            return 1;
         }
         site_b = strdup(s);
         i++;

      } else {
         /* first non-option argument: the job itself */
         job_path = argv[i];
         i++;
         if (job_path[0]=='-') {
            usage();
            return 1;
         }

         /* argv[argc] is NULL, so this is false when no job args follow */
         if (argv[i]) {
            job_args = &argv[i];
         }
      }
   } while (i < argc && !job_path);

   if (!job_path) {
      usage();
      return 1;
   }

#if 0
   printf("job_path: \"%s\"\n", job_path);
   printf("njobs:    %d\n", njobs);
   printf("nthreads: %d\n", nthreads);
   printf("native:   %s\n", native_spec);
   printf("dowait:   %s\n", dowait?"yes":"no");
   printf("quiet:    %s\n", quiet?"yes":"no");
   printf("scenario: %s\n", scenario?scenario:"<no such>");
   printf("site_b:   %s\n", site_b?site_b:"<no such>");
   printf("1st arg:  %s\n", job_args?job_args[0]:"<noargs>");
#endif

   if (drmaa_init(NULL, diagnosis, sizeof(diagnosis)-1) != DRMAA_ERRNO_SUCCESS) {
      fprintf(stderr, "drmaa_init() failed: %s\n", diagnosis);
      return 1;
   }

   get_gmt(&start_s);

   if (!scenario) {
      /*
       * NOTE(review): hard-coded site specific ids; the message names
       * "ah114088" while the uid used is 115088 - confirm which is meant.
       * The results of setegid()/seteuid() are not checked.
       */
      if (!getuid()) {
         fprintf(stderr, "switching to ah114088:gridware\n");
         setegid(339);
         seteuid(115088);
      }

      if (!(jt = create_job_template(job_path, NULL, 0))) {
         fprintf(stderr, "create_sleeper_job_template() failed\n");
         return 1;
      }

      if (nthreads==1) {
         if (submit_jobs(&argv[i]))
             return 1;
      } else {
         pthread_t *ids = malloc(sizeof(*ids) * nthreads);
         int rc;

         if (!ids) {
            fprintf(stderr, "malloc() failed\n");
            return 1;
         }

         for (i = 0; i < nthreads; i++) {
            /* pthread_create() returns the error number; errno is not set */
            rc = pthread_create(&ids[i], NULL, submit_jobs, NULL);
            if (rc) {
               fprintf(stderr, "pthread_create() failed: %s\n", strerror(rc));
               free(ids);
               return 1;
            }
         }

         for (i = 0; i < nthreads; i++) {
            pthread_join(ids[i], NULL);
         }

         free(ids);
      }

      drmaa_delete_job_template(jt, NULL, 0);

      if (!getuid()) {
         fprintf(stderr, "switching to root:root\n");
         seteuid(0);
         setegid(0);
      }

   } else {
      if (submit_by_project("project1") || submit_by_project("project2") ||
          submit_by_project("project3") || submit_by_project("project4"))
            return 1;
   }

   get_gmt(&finish_s);
   printf("submission took %8.3f seconds\n", DELTA_SECONDS(start_s, finish_s));

   if (dowait) {
      int success = 1;

      for (i=0; i<njobs * nthreads; i++) {
         int stat;
         int aborted = 0, exited = 0, exit_status = 0, signaled = 0;

         drmaa_errno = drmaa_wait(DRMAA_JOB_IDS_SESSION_ANY, jobid, sizeof(jobid)-1,
            &stat, DRMAA_TIMEOUT_WAIT_FOREVER, NULL, diagnosis, sizeof(diagnosis)-1);

         if (drmaa_errno != DRMAA_ERRNO_SUCCESS) {
            fprintf(stderr, "drmaa_wait() failed: %s\n", diagnosis);
            return 1;
         }

         /*
          * report how job finished
          */
         drmaa_wifaborted(&aborted, stat, NULL, 0);
         if (aborted) {
            printf("job \"%s\" never ran\n", jobid);
            success = 0;
         } else {
            drmaa_wifexited(&exited, stat, NULL, 0);
            if (exited) {
               drmaa_wexitstatus(&exit_status, stat, NULL, 0);
               if (exit_status != 0) {
                  success = 0;
                  printf("job \"%s\" with exit status %d\n", jobid, exit_status);
               } else {
                  if (!quiet)
                     printf("job \"%s\" finished regularly\n", jobid);
               }
            } else {
               success = 0;
               drmaa_wifsignaled(&signaled, stat, NULL, 0);
               if (signaled) {
                  char termsig[DRMAA_SIGNAL_BUFFER+1];
                  drmaa_wtermsig(termsig, DRMAA_SIGNAL_BUFFER, stat, NULL, 0);
                  printf("job \"%s\" finished due to signal %s\n", jobid, termsig);
               } else
                  printf("job \"%s\" finished with unclear conditions\n", jobid);
            }
         }
      }

      if (!success)
         ret = 1;

      get_gmt(&wait_s);
      printf("wait took %8.3f seconds\n", DELTA_SECONDS(finish_s, wait_s));
      printf("jobs took %8.3f seconds\n", DELTA_SECONDS(start_s, wait_s));
   }

   if (drmaa_exit(diagnosis, sizeof(diagnosis)-1) != DRMAA_ERRNO_SUCCESS) {
      fprintf(stderr, "drmaa_exit() failed: %s\n", diagnosis);
      return 1;
   }

  return ret;
}
Пример #5
0
int main(int argc, char *argv[])
{
    char                   error[DRMAA_ERROR_STRING_BUFFER];
    int                    result;
    drmaa_job_template_t * jt;
    char                   job_id[DRMAA_JOBNAME_BUFFER];
    char                   job_id_out[DRMAA_JOBNAME_BUFFER];    
    drmaa_attr_values_t *  rusage;
    int                    stat;    
    char                   attr_value[DRMAA_ATTR_BUFFER];
                                              
    result = drmaa_init (NULL, error, DRMAA_ERROR_STRING_BUFFER-1);

    if ( result != DRMAA_ERRNO_SUCCESS)
    {
      fprintf(stderr,"drmaa_init() failed: %s\n", error);
      return -1;
    }
    else
      printf("drmaa_init() success \n");
      
          
    setup_job_template(&jt);

    result = drmaa_run_job(job_id, 
                           DRMAA_JOBNAME_BUFFER-1, 
                           jt, 
                           error, 
       			   DRMAA_ERROR_STRING_BUFFER-1);
            			               
    if ( result != DRMAA_ERRNO_SUCCESS) 
    {
        fprintf(stderr,"drmaa_run_job() failed: %s\n", error);
        return -1;    
    }
    
    fprintf(stderr,"Job successfully submitted ID: %s\n",job_id);
        
	result = drmaa_wait(job_id,
	                    job_id_out, 
	                    DRMAA_JOBNAME_BUFFER-1, 
                        &stat, 
                        DRMAA_TIMEOUT_WAIT_FOREVER, 
                        &rusage, 
                        error, 
                        DRMAA_ERROR_STRING_BUFFER-1);

    if ( result != DRMAA_ERRNO_SUCCESS) 
    {
        fprintf(stderr,"drmaa_wait() failed: %s\n", error);
        return -1;    
    }
    
    drmaa_wexitstatus(&stat,stat,error,DRMAA_ERROR_STRING_BUFFER);
    
    fprintf(stderr,"Job finished with exit code %i, usage: %s\n",stat,job_id);

	while ( drmaa_get_next_attr_value(rusage,attr_value,DRMAA_ATTR_BUFFER-1) !=
			DRMAA_ERRNO_NO_MORE_ELEMENTS)
		fprintf(stderr,"\t%s\n",attr_value);		
    
    drmaa_release_attr_values (rusage);    
	    
	/* ---- Finalize ---- */
    
    result = drmaa_delete_job_template(jt, 
                                   error, 
                                   DRMAA_ERROR_STRING_BUFFER-1);

    if ( result != DRMAA_ERRNO_SUCCESS)
    {
      fprintf(stderr,"drmaa_delete_job_template() failed: %s\n", error);
      return -1;
    }
                                       
    result = drmaa_exit (error, DRMAA_ERROR_STRING_BUFFER-1);

    if ( result != DRMAA_ERRNO_SUCCESS)
    {
      fprintf(stderr,"drmaa_exit() failed: %s\n", error);
      return -1;
    }
      
    return 0;
}
Пример #6
0
int
main ()
{
  char error[DRMAA_ERROR_STRING_BUFFER];
  int errnum = 0;
  char jobid[DRMAA_JOBNAME_BUFFER];
  char jobid_2[DRMAA_JOBNAME_BUFFER];
  
  /* Init Session */
  /* drmaa_init | oar_connect */
  errnum = drmaa_init ("localhost", error, DRMAA_ERROR_STRING_BUFFER);

  if (errnum != DRMAA_ERRNO_SUCCESS) 
  {
    fprintf (stderr, "Couldn't init DRMAA library: %s\n", error);
    return 1;
  }

  /* Do Stuff */
  /*instance handling*/
  //instance_handling();

  /* Allocate Job Template */
  drmaa_job_template_t *jt = NULL;

  errnum = drmaa_allocate_job_template (&jt, error,DRMAA_ERROR_STRING_BUFFER);
  if (errnum != DRMAA_ERRNO_SUCCESS) 
  {
    fprintf (stderr, "Couldn't allocate job template: %s\n", error);
    return 1;
  }

  char cmd1[]="/bin/sleep";
  char cmd2[]="exit";

  /* Job Templates */
  errnum = drmaa_set_attribute (jt, DRMAA_REMOTE_COMMAND, cmd2, error, DRMAA_ERROR_STRING_BUFFER);
  if (errnum != DRMAA_ERRNO_SUCCESS) 
  {
    fprintf (stderr, "Couldn't set remote command: %s\n", error);
    return 1;
  }

  /* set HOLD state at submission */
  errnum = drmaa_set_attribute(jt, DRMAA_JS_STATE, DRMAA_SUBMISSION_STATE_HOLD, NULL, 0);

  const char *args[2] = {"5", NULL};

  errnum = drmaa_set_vector_attribute (jt, DRMAA_V_ARGV, args, error, DRMAA_ERROR_STRING_BUFFER);
  if (errnum != DRMAA_ERRNO_SUCCESS) 
  {
    fprintf (stderr, "Couldn't set remote command args: %s\n", error);
    return 1;
  }

  /* Run Job */
  errnum = drmaa_run_job (jobid, DRMAA_JOBNAME_BUFFER, jt, error, DRMAA_ERROR_STRING_BUFFER);

  if (errnum != DRMAA_ERRNO_SUCCESS) 
  {
    fprintf (stderr, "Couldn't run job: %s\n", error);
    return 1;
  } else 
  {
    printf ("Your job has been submitted with id %s\n", jobid);
  }

  /* Run Job */
  /*
    errnum = drmaa_run_job (jobid_2, DRMAA_JOBNAME_BUFFER, jt, error, DRMAA_ERROR_STRING_BUFFER);
  */

  /* Get Job State */
  /**
   * The possible values of
   * a program's staus are:
   *   - DRMAA_PS_UNDETERMINED
   *   - DRMAA_PS_QUEUED_ACTIVE
   *   - DRMAA_PS_SYSTEM_ON_HOLD
   *   - DRMAA_PS_USER_ON_HOLD
   *   - DRMAA_PS_USER_SYSTEM_ON_HOLD
   *   - DRMAA_PS_RUNNING
   *   - DRMAA_PS_SYSTEM_SUSPENDED
   *   - DRMAA_PS_USER_SUSPENDED
   *   - DRMAA_PS_DONE
   *   - DRMAA_PS_FAILED
   * Terminated jobs have a status of DRMAA_PS_FAILED.
   */
  /*
  int remote_ps;
  errnum =  drmaa_job_ps(jobid, &remote_ps, error, DRMAA_ERROR_STRING_BUFFER);
  printf("drmaa_job_ps: job_id: %s job_ps: %d\n",jobid,remote_ps);



  errnum =  drmaa_job_ps(jobid, &remote_ps, error, DRMAA_ERROR_STRING_BUFFER);
  printf("2-drmaa_job_ps: job_id: %s job_ps: %d\n",jobid,remote_ps);
*/

  /* Conrol Job */
  /*
  *   - DRMAA_CONTROL_SUSPEND   0
  *   - DRMAA_CONTROL_RESUME    1
  *   - DRMAA_CONTROL_HOLD      2
  *   - DRMAA_CONTROL_RELEASE   3
  *   - DRMAA_CONTROL_TERMINATE 4
  */
  /* Delete Job */
  /*
  int i;
  for(i=0;i<0;i++)
  {
    errnum = drmaa_control(jobid, i, error, DRMAA_ERROR_STRING_BUFFER);
    if (errnum != DRMAA_ERRNO_SUCCESS)
    {
        fprintf (stderr, "Couldn't drmaa_control job: %s\n", error);
    } else
    {
     const char *str = drmaa_control_to_str(i);
     printf("drmaa_control: job_id: %s action: %s\n", jobid, str);
    }
  }
  */

  sleep(7);
  printf("drmaa_wait \n");

  int stat;
  drmaa_wait(DRMAA_JOB_IDS_SESSION_ANY, jobid, sizeof(jobid)-1, &stat, 20, NULL, NULL, 0);
  printf("drmaa_wait JobId: %s\n", jobid);


  /*
  const char **job_ids = NULL;
  job_ids = calloc( 2, sizeof(char *) );
  job_ids[0]=jobid;
  printf("drmaa_synchronize\n");
  errnum = drmaa_synchronize(job_ids, 200, 0,
          error, DRMAA_ERROR_STRING_BUFFER);
*/
  /* TODO free fsb_free_vector(job_ids); */

  if (errnum != DRMAA_ERRNO_SUCCESS)
  {
      fprintf (stderr, "Couldn't drmaa_synchronize: %s\n", error);
  }

   /* Delete Job Template*/

  errnum = drmaa_delete_job_template (jt, error,  DRMAA_ERROR_STRING_BUFFER);
  if (errnum != DRMAA_ERRNO_SUCCESS) 
  {
    fprintf (stderr, "Couldn't delete job template: %s\n", error);
    return 1;
  }


/* Exit Session */

  /* drmaa_exit | oar_disconnect */
  errnum = drmaa_exit (error, DRMAA_ERROR_STRING_BUFFER);

  if (errnum != DRMAA_ERRNO_SUCCESS) 
  {
    fprintf (stderr, "Couldn't exit DRMAA library: %s\n", error);
    return 1;
  }

  return 0;
  
/*
  return drmaa_init("hello");
  return drmaa_exit() ;
  return 0;
*/
}
Пример #7
0
/*
 * Cluster build driver: for every "<arch> <native spec>" line of the config
 * file a job running argv[1] with the arguments argv[2..] is submitted in the
 * current working directory; all jobs are then monitored until they finish.
 *
 * The config file is "compile.conf" in the cwd unless the environment
 * variable RAIMK_COMPILE_CONF names another file. Lines starting with '#'
 * are skipped. The output of each job goes to <cwd>/build_<arch>.log; the
 * log is removed when the job succeeds, otherwise its tail is shown.
 *
 * Return value:
 *    0  all jobs succeeded
 *    1  at least one job failed, or drmaa_exit() failed
 *    2  usage error or a failure of the environment / DRMAA calls
 */
int main(int argc, char *argv[])
{
   const char *filename;
   int ret = 0;
   int session_open = 0;   /* drmaa_exit() is only valid after drmaa_init() */
   /* zero-filled so the buffer is always terminated: calls get sizeof-1 */
   char diagnosis[DRMAA_ERROR_STRING_BUFFER] = "";
   char jobwd[1024*4];
   char jobid[1024];
   char line[2*1024];
   int drmaa_errno;
   drmaa_job_template_t *jt = NULL;
   int stat;
   int aborted, exited, exit_status, signaled;
   int j, njobs = 0;
   FILE *fp = NULL;
   struct sigaction sa;

   /* argv[1] is the command and &argv[2] its argument vector */
   if (argc < 2) {
      fprintf(stderr, "usage: %s <command> [args...]\n",
              (argc > 0 && argv[0]) ? argv[0] : "<program>");
      return 2;
   }

   /* clear job info structure */
   clear_all_job_info();

   /* setup a signal handler for shutdown */
   memset(&sa, 0, sizeof(sa));
   sa.sa_handler = my_compile_signal_handler;  /* one handler for all signals */
   sigemptyset(&sa.sa_mask);
   sigaction(SIGINT,  &sa, NULL);
   sigaction(SIGTERM, &sa, NULL);
   sigaction(SIGHUP,  &sa, NULL);
   sigaction(SIGPIPE, &sa, NULL);

   /* we can override use of a compile.conf in cwd by environment */
   filename = getenv("RAIMK_COMPILE_CONF");
   if (filename == NULL) {
      filename = "compile.conf";
   }

   /* we'll start the job in the cwd */
   if (!getcwd(jobwd, sizeof(jobwd)-1)) {
      fprintf(stderr, "getcwd() failed: %s\n", strerror(errno));
      ret = 2;
      goto Finish;
   }

   /* try to open config file */
   if (!(fp = fopen(filename, "r"))) {
      fprintf(stderr, "fopen(\"%s\") failed: %s\n", filename, strerror(errno));
      ret = 2;
      goto Finish;
   }

   /* initialize a drmaa session */
   if (drmaa_init(NULL, diagnosis, sizeof(diagnosis)-1) != DRMAA_ERRNO_SUCCESS) {
      fprintf(stderr, "drmaa_init() failed: %s\n", diagnosis);
      ret = 2;
      goto Finish;
   }
   session_open = 1;

   printf("--- start cluster session --------------------------------\n");

   /*
    * parse the config file and start a job for every architecture; the
    * field widths keep over-long input from overflowing line/arch/dummy/ns
    */
   while (fscanf(fp, "%2047[^\n]\n", line) == 1) {
      char nat_spec[1024];
      char arch[1024];
      char name[1024];
      char ns[1024];
      char output_file[1024];
      char dummy[1024];
      int len;

      /* skip comment lines */
      if (line[0] == '#')
         continue;

      if (sscanf(line, "%1023[^\t ]%1023[\t ]%1023[^\n]\n", arch, dummy, ns) != 3) {
         fprintf(stderr, "parsing error in compile.conf\n");
         continue;
      }

      /* the name is only a label, so truncating it is harmless */
      snprintf(name, sizeof(name), "build %s", arch);

      /* a truncated native spec or path would be wrong: skip such entries */
      len = snprintf(nat_spec, sizeof(nat_spec), "-b no -S /bin/sh %s", ns);
      if (len < 0 || (size_t)len >= sizeof(nat_spec)) {
         fprintf(stderr, "native specification for \"%s\" is too long - skipped\n", arch);
         continue;
      }

      len = snprintf(output_file, sizeof(output_file), ":%s/build_%s.log", jobwd, arch);
      if (len < 0 || (size_t)len >= sizeof(output_file)) {
         fprintf(stderr, "output path for \"%s\" is too long - skipped\n", arch);
         continue;
      }

      /* build job template */
      if (drmaa_allocate_job_template(&jt, diagnosis, sizeof(diagnosis)-1)!=DRMAA_ERRNO_SUCCESS) {
         fprintf(stderr, "drmaa_allocate_job_template() failed: %s\n", diagnosis);
         ret = 2;
         goto Finish;
      }

      drmaa_set_attribute(jt, DRMAA_WD, jobwd, NULL, 0);
      drmaa_set_attribute(jt, DRMAA_REMOTE_COMMAND, argv[1], NULL, 0);
      drmaa_set_attribute(jt, DRMAA_JOIN_FILES, "y", NULL, 0);
      drmaa_set_attribute(jt, DRMAA_JOB_NAME, name, NULL, 0);
      drmaa_set_attribute(jt, DRMAA_NATIVE_SPECIFICATION, nat_spec, NULL, 0);
      drmaa_set_attribute(jt, DRMAA_OUTPUT_PATH, output_file, NULL, 0);

      drmaa_set_vector_attribute(jt, DRMAA_V_ARGV, (const char **)&argv[2], NULL, 0);

      /* submit job */
      if ((drmaa_errno=drmaa_run_job(jobid, sizeof(jobid)-1, jt, diagnosis,
               sizeof(diagnosis)-1)) != DRMAA_ERRNO_SUCCESS) {
         drmaa_delete_job_template(jt, NULL, 0);
         jt = NULL;

         if (drmaa_errno == DRMAA_ERRNO_DENIED_BY_DRM) {
            printf("--- job \"%s\" using \"%s\" wasn't accepted: %s\n", name, ns, diagnosis);
            continue;
         }

         fprintf(stderr, "drmaa_run_job() failed: %s\n", diagnosis);
         ret = 2;
         goto Finish;
      }

      /*
       * remember job information
       * NOTE(review): the capacity of job[] is not visible from here, so
       * njobs cannot be bounds-checked - confirm against its definition.
       */
      job[njobs].jid = strdup(jobid);
      job[njobs].name = strdup(name);
      job[njobs].ns = strdup(ns);
      job[njobs].output_file = strdup(output_file);
      njobs++;

      drmaa_delete_job_template(jt, NULL, 0);
      jt = NULL;

      printf("    submitted job \"%s\" as job %s\n", name, jobid);
   }
   fclose(fp);
   fp = NULL;

   /* monitor jobs, until all have finished */
   while (number_of_jobs() > 0) {
      /* We wait with timeout to be able to react on events like CTRL-C */
      drmaa_errno = drmaa_wait(DRMAA_JOB_IDS_SESSION_ANY, jobid, sizeof(jobid)-1,
                               &stat, 1, NULL, diagnosis, sizeof(diagnosis)-1);

      /* error */
      if (drmaa_errno != DRMAA_ERRNO_SUCCESS && drmaa_errno != DRMAA_ERRNO_EXIT_TIMEOUT) {
         fprintf(stderr, "drmaa_wait() failed: %s\n", diagnosis);
         ret = 2;
         goto Finish;
      }

      /* user pressed CTRL-C: delete all jobs */
      if (terminate_session) {
         printf("--- shutdown requested --------------------------------\n");
         delete_all_jobs();
      }

      /* if user pressed CTRL-C multiple times, exit */
      if (terminate_program) {
         printf("--- forced shutdown -----------------------------------\n");
         goto Finish;
      }

      /*
       * a job terminated - evaluate return codes and deregister job from
       * our internal bookkeeping
       */
      if (drmaa_errno == DRMAA_ERRNO_SUCCESS) {
         j = search_job(jobid);
         if (j < 0) {
            fprintf(stderr, "drmaa_wait() returns unknown job ... ?\n");
            /* no bookkeeping entry: job[j] must not be touched */
            continue;
         }

         /* report how job finished */
         aborted = 0;
         drmaa_wifaborted(&aborted, stat, NULL, 0);
         if (aborted) {
            printf("--- run \"%s\" stopped or never started\n", job[j].name);
         } else {
            int failed = 1;
            /* skip the leading ':' of the DRMAA output path */
            char *path = job[j].output_file + 1;

            exited = 0;
            drmaa_wifexited(&exited, stat, NULL, 0);
            if (exited) {
               exit_status = 0;
               drmaa_wexitstatus(&exit_status, stat, NULL, 0);
               if (exit_status == 0) {
                  printf("+++ run \"%s\" was successful\n", job[j].name);
                  failed = 0;
               } else {
                  printf("### run \"%s\" broken ##################################\n", job[j].name);
               }
            } else {
               signaled = 0;
               drmaa_wifsignaled(&signaled, stat, NULL, 0);
               if (signaled) {
                  char termsig[DRMAA_SIGNAL_BUFFER+1];
                  drmaa_wtermsig(termsig, DRMAA_SIGNAL_BUFFER, stat, NULL, 0);
                  printf("job \"%s\" finished due to signal %s\n", job[j].name, termsig);
               } else {
                  printf("job \"%s\" finished with unclear conditions\n", job[j].name);
               }
            }

            /*
             * If a job succeeded, we delete its output file.
             * If it failed, we show the end of the output file.
             */
            if (failed) {
               /* room for "tail -15 " plus the longest possible path */
               char tail_cmd[1024 + 16];

               /*
                * The failure must survive in the exit code; storing the
                * result of system() in ret turned it into tail's status.
                */
               ret = 1;

               snprintf(tail_cmd, sizeof(tail_cmd), "tail -15 %s", path);
               if (system(tail_cmd) != 0) {
                  fprintf(stderr, "couldn't show end of \"%s\" job output file %s\n",
                          job[j].name, path);
               }
            } else {
               if (unlink(path) != 0) {
                  fprintf(stderr, "couldn't unlink \"%s\" job output file %s: %s\n",
                          job[j].name, path, strerror(errno));
               }
            }
         }

         /* clean the job struct */
         clear_job_info(j);
      }
   }

   printf("--- end cluster session --------------------------------\n");

Finish:
   /* still open only when the parsing loop was left through an error */
   if (fp)
      fclose(fp);

   if (session_open && drmaa_exit(diagnosis, sizeof(diagnosis)-1) != DRMAA_ERRNO_SUCCESS) {
      fprintf(stderr, "drmaa_exit() failed: %s\n", diagnosis);
      return 1;
   }
   return ret;
}