Beispiel #1
0
int 
main(int argc, char **argv) 
{
   lList *opts_cmdline = NULL;
   lList *opts_defaults = NULL;
   lList *opts_scriptfile = NULL;
   lList *opts_all = NULL;
   lListElem *job = NULL;
   lList *alp = NULL;
   lListElem *ep;
   int exit_status = 0;
   int just_verify;
   int tmp_ret;
   int wait_for_job = 0, is_immediate = 0;
   dstring session_key_out = DSTRING_INIT;
   dstring diag = DSTRING_INIT;
   dstring jobid = DSTRING_INIT;
   u_long32 start, end, step;
   u_long32 num_tasks;
   int count, stat;
   char *jobid_string = NULL;
   bool has_terse;
   drmaa_attr_values_t *jobids = NULL;

   u_long32 prog_number = 0;
   u_long32 myuid = 0;
   const char *sge_root = NULL;
   const char *cell_root = NULL;
   const char *username = NULL;
   const char *qualified_hostname = NULL;
   const char *unqualified_hostname = NULL;
   const char *mastername = NULL;

   DENTER_MAIN(TOP_LAYER, "qsub");

   prof_mt_init();

   /* Set up the program information name */
   sge_setup_sig_handlers(QSUB);

   DPRINTF(("Initializing JAPI\n"));

   if (japi_init(NULL, NULL, NULL, QSUB, false, NULL, &diag)
                                                      != DRMAA_ERRNO_SUCCESS) {
      fprintf(stderr, "\n");
      fprintf(stderr, MSG_QSUB_COULDNOTINITIALIZEENV_S,
              sge_dstring_get_string(&diag));
      fprintf(stderr, "\n");
      DEXIT;
      SGE_EXIT((void**)&ctx, 1);
   }

   prog_number = ctx->get_who(ctx);
   myuid = ctx->get_uid(ctx);
   sge_root = ctx->get_sge_root(ctx);
   cell_root = ctx->get_cell_root(ctx);
   username = ctx->get_username(ctx);
   qualified_hostname = ctx->get_qualified_hostname(ctx);
   unqualified_hostname = ctx->get_unqualified_hostname(ctx);
   mastername = ctx->get_master(ctx, false);

   /*
    * read switches from the various defaults files
    */
   opt_list_append_opts_from_default_files(prog_number, cell_root, username, &opts_defaults, &alp, environ);
   tmp_ret = answer_list_print_err_warn(&alp, NULL, NULL, MSG_WARNING);
   if (tmp_ret > 0) {
      DEXIT;
      SGE_EXIT((void**)&ctx, tmp_ret);
   }

   /*
    * append the commandline switches to the list
    */
   opt_list_append_opts_from_qsub_cmdline(prog_number, &opts_cmdline, &alp,
                                          argv + 1, environ);
   tmp_ret = answer_list_print_err_warn(&alp, NULL, "qsub: ", MSG_QSUB_WARNING_S);
   if (tmp_ret > 0) {
      DEXIT;
      SGE_EXIT((void**)&ctx, tmp_ret);
   }

   /*
    * show usage if -help was in commandline
    */
   if (opt_list_has_X(opts_cmdline, "-help")) {
      sge_usage(QSUB, stdout);
      DEXIT;
      SGE_EXIT((void**)&ctx, 0);
   }

   /*
    * We will only read commandline options from scripfile if the script
    * itself should not be handled as binary
    */
   if (opt_list_is_X_true(opts_cmdline, "-b") ||
       (!opt_list_has_X(opts_cmdline, "-b") &&
        opt_list_is_X_true(opts_defaults, "-b"))) {
      DPRINTF(("Skipping options from script due to -b option\n"));
   } else {
      opt_list_append_opts_from_script(prog_number,
                                       &opts_scriptfile, &alp, 
                                       opts_cmdline, environ);
      tmp_ret = answer_list_print_err_warn(&alp, NULL, MSG_QSUB_COULDNOTREADSCRIPT_S,
                                           MSG_WARNING);
      if (tmp_ret > 0) {
         DEXIT;
         SGE_EXIT((void**)&ctx, tmp_ret);
      }
   }

   /*
    * Merge all commandline options and interprete them
    */
   opt_list_merge_command_lines(&opts_all, &opts_defaults, 
                                &opts_scriptfile, &opts_cmdline);

   /*
    * Check if -terse is requested
    */
   has_terse = opt_list_has_X(opts_all, "-terse");

   /* If "-sync y" is set, wait for the job to end. */   
   /* Remove all -sync switches since cull_parse_job_parameter()
    * doesn't know what to do with them. */
   while ((ep = lGetElemStr(opts_all, SPA_switch, "-sync"))) {
      if (lGetInt(ep, SPA_argval_lIntT) == TRUE) {
         wait_for_job = 1;
      }
      
      lRemoveElem(opts_all, &ep);
   }

   if (wait_for_job) {
      DPRINTF(("Wait for job end\n"));
   }

   alp = cull_parse_job_parameter(myuid, username, cell_root, unqualified_hostname, 
                                  qualified_hostname, opts_all, &job);

   tmp_ret = answer_list_print_err_warn(&alp, NULL, "qsub: ", MSG_WARNING);
   if (tmp_ret > 0) {
      DEXIT;
      SGE_EXIT((void**)&ctx, tmp_ret);
   }

   if (set_sec_cred(sge_root, mastername, job, &alp) != 0) {
      answer_list_output(&alp);
      DEXIT;
      SGE_EXIT((void**)&ctx, 1);
   }

   /* Check if job is immediate */
   is_immediate = (int)JOB_TYPE_IS_IMMEDIATE(lGetUlong(job, JB_type));
   DPRINTF(("Job is%s immediate\n", is_immediate ? "" : " not"));

   DPRINTF(("Everything ok\n"));

   if (lGetUlong(job, JB_verify)) {
      cull_show_job(job, 0, false);
      DEXIT;
      SGE_EXIT((void**)&ctx, 0);
   }

   if (is_immediate || wait_for_job) {
      pthread_t sigt;
      
      qsub_setup_sig_handlers(); 

      if (pthread_create(&sigt, NULL, sig_thread, (void *)NULL) != 0) {
         fprintf(stderr, "\n");
         fprintf(stderr, MSG_QSUB_COULDNOTINITIALIZEENV_S,
                 " error preparing signal handling thread");
         fprintf(stderr, "\n");
         
         exit_status = 1;
         goto Error;
      }
      
      if (japi_enable_job_wait(username, unqualified_hostname, NULL, &session_key_out, error_handler, &diag) ==
                                       DRMAA_ERRNO_DRM_COMMUNICATION_FAILURE) {
         const char *msg = sge_dstring_get_string(&diag);
         fprintf(stderr, "\n");
         fprintf(stderr, MSG_QSUB_COULDNOTINITIALIZEENV_S,
                 msg?msg:" error starting event client thread");
         fprintf(stderr, "\n");
         
         exit_status = 1;
         goto Error;
      }
   }
   
   job_get_submit_task_ids(job, &start, &end, &step);
   num_tasks = (end - start) / step + 1;

   if (num_tasks > 1) {
      int error = japi_run_bulk_jobs(&jobids, &job, start, end, step, &diag);
      if (error != DRMAA_ERRNO_SUCCESS) {
         /* No active session here means that japi_enable_job_wait() was
          * interrupted by the signal handler, in which case we just break out
          * quietly. */
         if (error != DRMAA_ERRNO_NO_ACTIVE_SESSION) {
            fprintf(stderr, MSG_QSUB_COULDNOTRUNJOB_S,
                    sge_dstring_get_string(&diag));
            fprintf(stderr, "\n");
         }
         
         /* BUGFIX: Issuezilla #1013
          * To quickly fix this issue, I'm mapping the JAPI/DRMAA error code
          * back into a GDI error code.  This is the easy solution.  The
          * correct solution would be to address issue #859, presumably by
          * having JAPI reuse the GDI error codes instead of the JAPI error
          * codes. */
         if (error == DRMAA_ERRNO_TRY_LATER) {
            exit_status = STATUS_NOTOK_DOAGAIN;
         }
         else {
            exit_status = 1;
         }
         
         goto Error;
      }

      DPRINTF(("job id is: %ld\n", jobids->it.ji.jobid));
      
      jobid_string = get_bulk_jobid_string((long)jobids->it.ji.jobid, start, end, step);
   }
   else if (num_tasks == 1) {
      int error = japi_run_job(&jobid, &job, &diag);
      
      if (error != DRMAA_ERRNO_SUCCESS) {
         if (error != DRMAA_ERRNO_NO_ACTIVE_SESSION) {
            fprintf(stderr, MSG_QSUB_COULDNOTRUNJOB_S,
                    sge_dstring_get_string(&diag));
            fprintf(stderr, "\n");
         }
         
         /* BUGFIX: Issuezilla #1013
          * To quickly fix this issue, I'm mapping the JAPI/DRMAA error code
          * back into a GDI error code.  This is the easy solution.  The
          * correct solution would be to address issue #859, presumably by
          * having JAPI reuse the GDI error codes instead of the DRMAA error
          * codes. */
         if (error == DRMAA_ERRNO_TRY_LATER) {
            exit_status = STATUS_NOTOK_DOAGAIN;
         }
         else {
            exit_status = 1;
         }
         
         goto Error;
      }

      jobid_string = strdup(sge_dstring_get_string(&jobid));
      DPRINTF(("job id is: %s\n", jobid_string));

      sge_dstring_free(&jobid);
   }
   else {
      fprintf(stderr, MSG_QSUB_COULDNOTRUNJOB_S, "invalid task structure");
      fprintf(stderr, "\n");
      
      exit_status = 1;
      goto Error;
   }
  
   /* only success message is printed to stdout */

   just_verify = (lGetUlong(job, JB_verify_suitable_queues)==JUST_VERIFY || 
                  lGetUlong(job, JB_verify_suitable_queues)==POKE_VERIFY);
   DPRINTF(("Just verifying job\n"));

   if (!just_verify) {
      const char *output = sge_dstring_get_string(&diag); 

      /* print the tersed output */
      if (has_terse) {
         printf("%s", jobid_string);
      } else if (output != NULL) {
        printf("%s", output);
      } else {
        printf(MSG_QSUB_YOURJOBHASBEENSUBMITTED_SS, jobid_string, lGetString(job, JB_job_name));
      }
      printf("\n");
   } else {
      printf(MSG_JOB_VERIFYFOUNDQ);
      printf("\n");
   }   

   if ((wait_for_job || is_immediate) && !just_verify) {
      int event;

      if (is_immediate) {
         fprintf(stderr, "%s\n", MSG_QSUB_WAITINGFORIMMEDIATEJOBTOBESCHEDULED);

         /* We only need to wait for the first task to be scheduled to be able
          * to say that the job is running. */
         tmp_ret = japi_wait(DRMAA_JOB_IDS_SESSION_ANY, &jobid, &stat,
                             DRMAA_TIMEOUT_WAIT_FOREVER, JAPI_JOB_START, &event,
                             NULL, &diag);

         if ((tmp_ret == DRMAA_ERRNO_SUCCESS) && (event == JAPI_JOB_START)) {
            fprintf(stderr, "\n");
            fprintf(stderr, MSG_QSUB_YOURIMMEDIATEJOBXHASBEENSUCCESSFULLYSCHEDULED_S,
                  jobid_string);
            fprintf(stderr, "\n");
         }
         /* A job finish event here means that the job was rejected. */
         else if ((tmp_ret == DRMAA_ERRNO_SUCCESS) &&
                  (event == JAPI_JOB_FINISH)) {
            fprintf(stderr, "\n%s\n", MSG_QSUB_YOURQSUBREQUESTCOULDNOTBESCHEDULEDDTRYLATER);
            
            exit_status = 1;
            goto Error;
         }
         else {
         /* Since we told japi_wait to wait forever, we know that if it gets
          * a timeout, it's because it's been interrupted to exit, in which
          * case we don't complain.  Same for no active session. */
            if ((tmp_ret != DRMAA_ERRNO_EXIT_TIMEOUT) &&
                (tmp_ret != DRMAA_ERRNO_NO_ACTIVE_SESSION)) {
               fprintf(stderr, "\n");
               fprintf(stderr, MSG_QSUB_COULDNOTWAITFORJOB_S,
                       sge_dstring_get_string(&diag));
               fprintf(stderr, "\n");
            }

            exit_status = 1;
            goto Error;
         }
      }
         
      if (wait_for_job) {
         /* Rather than using japi_synchronize on ALL for bulk jobs, we use
          * japi_wait on ANY num_tasks times because with synchronize, we would
          * have to wait for all the tasks to finish before we know if any
          * finished. */
         for (count = 0; count < num_tasks; count++) {
            /* Since there's only one running job in the session, we can just
             * wait for ANY. */
            if ((tmp_ret = japi_wait(DRMAA_JOB_IDS_SESSION_ANY, &jobid, &stat,
                          DRMAA_TIMEOUT_WAIT_FOREVER, JAPI_JOB_FINISH, &event,
                          NULL, &diag)) != DRMAA_ERRNO_SUCCESS) {
               if ((tmp_ret != DRMAA_ERRNO_EXIT_TIMEOUT) &&
                   (tmp_ret != DRMAA_ERRNO_NO_ACTIVE_SESSION)) {
                  fprintf(stderr, "\n");
                  fprintf(stderr, MSG_QSUB_COULDNOTWAITFORJOB_S, sge_dstring_get_string(&diag));
                  fprintf(stderr, "\n");
               }
               
               exit_status = 1;
               goto Error;
            }
            
            /* report how job finished */
            /* If the job is an array job, use the first non-zero exit code as
             * the exit code for qsub. */
            if (exit_status == 0) {
               exit_status = report_exit_status(stat,
                                              sge_dstring_get_string(&jobid));
            }
            /* If we've already found a non-zero exit code, just print the exit
             * info for the task. */
            else {
               report_exit_status(stat, sge_dstring_get_string(&jobid));
            }               
         }
      }
   }

Error:
   FREE(jobid_string);
   lFreeList(&alp);
   lFreeList(&opts_all);
   
   if ((tmp_ret = japi_exit(JAPI_EXIT_NO_FLAG, &diag)) != DRMAA_ERRNO_SUCCESS) {
      if (tmp_ret != DRMAA_ERRNO_NO_ACTIVE_SESSION) {
         fprintf(stderr, "\n");
         fprintf(stderr, MSG_QSUB_COULDNOTFINALIZEENV_S, sge_dstring_get_string(&diag));
         fprintf(stderr, "\n");
      }
      else {
         struct timespec ts;
         /* We know that if we get a DRMAA_ERRNO_NO_ACTIVE_SESSION here, it's
          * because the signal handler thread called japi_exit().  We know this
          * because if the call to japi_init() fails, we just exit directly.
          * If the call to japi_init() succeeds, then we have an active session,
          * so coming here because of an error would not result in the
          * DRMAA_ERRNO_NO_ACTIVE_SESSION error. */
         DPRINTF(("Sleeping for 15 seconds to wait for the exit to finish.\n"));
         
         sge_relative_timespec(15, &ts);
         sge_mutex_lock("qsub_exit_mutex", SGE_FUNC, __LINE__, &exit_mutex);
         
         while (!exited) {
            if (pthread_cond_timedwait(&exit_cv, &exit_mutex, &ts) == ETIMEDOUT) {
               DPRINTF(("Exit has not finished after 15 seconds.  Exiting.\n"));
               break;
            }
         }
         
         sge_mutex_unlock("qsub_exit_mutex", SGE_FUNC, __LINE__, &exit_mutex);
      }
   }

   sge_prof_cleanup();

   /* This is an exit() instead of an SGE_EXIT() because when the qmaster is
    * supended, SGE_EXIT() hangs. */
   exit(exit_status);
   DEXIT;
   return exit_status;
}
Beispiel #2
0
static bool 
sge_parse_qrstat(sge_gdi_ctx_class_t *ctx, lList **answer_list,
                 qrstat_env_t *qrstat_env, lList **cmdline)
{
   bool ret = true;
   
   DENTER(TOP_LAYER, "sge_parse_qrstat");

   qrstat_env->is_summary = true;
   while (lGetNumberOfElem(*cmdline)) {
      u_long32 value;
   
      /* -help */
      if (opt_list_has_X(*cmdline, "-help")) {
         sge_usage(QRSTAT, stdout);
         DEXIT;
         SGE_EXIT((void**)&ctx, 0);
      }

      /* -u */
      while (parse_multi_stringlist(cmdline, "-u", answer_list, 
                                    &(qrstat_env->user_list), ST_Type, ST_name)) {
         continue;
      }

      /* -explain */
      while (parse_flag(cmdline, "-explain", answer_list, &value)) {
         qrstat_filter_add_core_attributes(qrstat_env);
         qrstat_filter_add_explain_attributes(qrstat_env);
         qrstat_env->is_explain = (value > 0) ? true : false;
         continue;
      }

      /* -xml */
      while (parse_flag(cmdline, "-xml", answer_list, &value)) {
         qrstat_filter_add_core_attributes(qrstat_env);
         qrstat_filter_add_xml_attributes(qrstat_env);
         qrstat_env->is_xml = (value > 0) ? true : false;
         continue;
      }

      /* -ar */
      while (parse_u_longlist(cmdline, "-ar", answer_list, &(qrstat_env->ar_id_list))) {         
         qrstat_filter_add_core_attributes(qrstat_env);
         qrstat_filter_add_ar_attributes(qrstat_env);
         qrstat_filter_add_ar_where(qrstat_env);
         qrstat_env->is_summary = false;
         continue;      
      }

      if (lGetNumberOfElem(*cmdline)) {
         sge_usage(QRSTAT, stdout);
         answer_list_add(answer_list, MSG_PARSE_TOOMANYOPTIONS, 
                         STATUS_ESEMANTIC, ANSWER_QUALITY_ERROR);
         ret = false;
         break;
      }
   } 

   if (qrstat_env->is_summary) {
      char user[128] = "";
      if (sge_uid2user(geteuid(), user, sizeof(user), MAX_NIS_RETRIES)) {
         answer_list_add_sprintf(answer_list, STATUS_ESEMANTIC,
                                 ANSWER_QUALITY_CRITICAL,
                                 MSG_SYSTEM_RESOLVEUSER_U, (u_long32)geteuid());
         ret = false;
      } else {
         str_list_transform_user_list(&(qrstat_env->user_list), answer_list, user);
         qrstat_filter_add_core_attributes(qrstat_env);
         qrstat_filter_add_u_where(qrstat_env);
      }
   }

   DRETURN(ret);
}
Beispiel #3
0
int main(int argc, char **argv) {
   int ret = STATUS_OK;
   lList *alp = NULL;
   lList *request_list = NULL;
   lList *cmdline = NULL;
   lListElem *aep;
   int all_jobs = 0;
   int all_users = 0;
   u_long32 gdi_cmd = SGE_GDI_MOD; 
   int tmp_ret;
   int me_who;
   sge_gdi_ctx_class_t *ctx = NULL;

   DENTER_MAIN(TOP_LAYER, "qalter");

   prof_mt_init();

   /*
   ** get command name: qalter or qresub
   */
   if (!strcmp(sge_basename(argv[0], '/'), "qresub")) {
      DPRINTF(("QRESUB\n"));
      me_who = QRESUB;
   } else if (!strcmp(sge_basename(argv[0], '/'), "qhold")) {
      DPRINTF(("QHOLD\n"));
      me_who = QHOLD;
   } else if (!strcmp(sge_basename(argv[0], '/'), "qrls")) {
      DPRINTF(("QRLS\n"));
      me_who = QRLS;
   } else {
      DPRINTF(("QALTER\n"));
      me_who = QALTER;
   } 

   log_state_set_log_gui(1);
   sge_setup_sig_handlers(me_who);

   if (sge_gdi2_setup(&ctx, me_who, MAIN_THREAD, &alp) != AE_OK) {
      answer_list_output(&alp);
      SGE_EXIT((void**)&ctx, 1);
   }

   /*
   ** begin to work
   */
   opt_list_append_opts_from_qalter_cmdline(me_who, &cmdline, &alp, argv + 1, environ);
   tmp_ret = answer_list_print_err_warn(&alp, MSG_QALTER, MSG_QALTER, MSG_QALTERWARNING);
   
   if (tmp_ret > 0) {
      SGE_EXIT((void**)&ctx, tmp_ret);
   }
   
   /* handling the case that no command line parameter was specified */
   if ((me_who == QHOLD || me_who == QRLS) && lGetNumberOfElem(cmdline) == 1) {
      /* -h option is set implicitly for QHOLD and QRLS */
      sge_usage(me_who, stderr);
      fprintf(stderr, "%s\n", MSG_PARSE_NOOPTIONARGUMENT);
      SGE_EXIT((void**)&ctx, 1);
   } else if ((me_who == QRESUB || me_who == QALTER) && lGetNumberOfElem(cmdline) == 0) {
      /* qresub and qalter have nothing set */ 
      sge_usage(me_who, stderr);
      fprintf(stderr, "%s\n", MSG_PARSE_NOOPTIONARGUMENT);
      SGE_EXIT((void**)&ctx, 1);
   } else if (opt_list_has_X(cmdline, "-help")) {
      /* -help was specified */
      sge_usage(me_who, stdout);
      SGE_EXIT((void**)&ctx, 0);
   }
   
   alp = qalter_parse_job_parameter(me_who, cmdline, &request_list, &all_jobs, 
                                    &all_users);

   DPRINTF(("all_jobs = %d, all_user = %d\n", all_jobs, all_users));

   if (request_list && verify) {
      /* 
         got a request list containing one element 
         for each job to be modified 
         save jobid all fields contain the same fields
         so we may use show_job() with the first job
         in our list 
         The jobid's in our request list get printed before
         show_job()
      */
      cull_show_job(lFirst(request_list), FLG_QALTER, false);
      sge_prof_cleanup();
      SGE_EXIT((void**)&ctx, 0);
   }

   tmp_ret = answer_list_print_err_warn(&alp, NULL, NULL, MSG_WARNING);
   if (tmp_ret > 0) {
      SGE_EXIT((void**)&ctx, tmp_ret);
   }

   if ((me_who == QALTER) ||
       (me_who == QHOLD) ||
       (me_who == QRLS) 
      ) {
      DPRINTF(("QALTER\n"));
      gdi_cmd = SGE_GDI_MOD;
   } else if (me_who == QRESUB){
      DPRINTF(("QRESUB\n"));
      gdi_cmd = SGE_GDI_COPY;
   } else {
      printf("unknown binary name.\n");
      SGE_EXIT((void**)&ctx, 1);
   }

   if (all_jobs)
      gdi_cmd |= SGE_GDI_ALL_JOBS;
   if (all_users)
      gdi_cmd |= SGE_GDI_ALL_USERS;

   alp = ctx->gdi(ctx, SGE_JB_LIST, gdi_cmd, &request_list, NULL, NULL); 
   for_each (aep, alp) {
      printf("%s\n", lGetString(aep, AN_text));
      if (ret == STATUS_OK) {
         ret = lGetUlong(aep, AN_status);
      }
   }