Exemple #1
0
/*
  Refreshes the status of a node; all work is done while holding
  node->data_mutex, and nodes without driver specific job_data are
  left untouched.

    1. If the node is flagged JOB_QUEUE_RUNNING, has not been confirmed
       running, and the time since sim_start has reached
       node->max_confirm_wait, the job is considered dead and the node
       is set to JOB_QUEUE_EXIT.

    2. If the (possibly updated) node status is one of
       JOB_QUEUE_CAN_UPDATE_STATUS the driver is queried for the
       current status, which is then stored on the node.

  Returns true if job_queue_status_transition() reported a change in
  either of the two steps, false otherwise.
*/
bool job_queue_node_update_status( job_queue_node_type * node , job_queue_status_type * status , queue_driver_type * driver ) {
  bool status_change = false;
  pthread_mutex_lock(&node->data_mutex);
  {
    if (node->job_data) {
      job_status_type current_status = job_queue_node_get_status(node);

      bool confirmed = job_queue_node_status_update_confirmed_running__(node);

      if ((current_status & JOB_QUEUE_RUNNING) && !confirmed) {
        // it's running, but not confirmed running.
        double runtime = job_queue_node_time_since_sim_start(node);
        if (runtime >= node->max_confirm_wait) {
          // max_confirm_wait has passed since sim_start without success; the job is dead
          job_status_type new_status = JOB_QUEUE_EXIT;
          if (job_queue_status_transition(status, current_status, new_status))
            status_change = true;
          job_queue_node_set_status(node, new_status);
        }
      }

      // Re-read the status: the confirm check above may just have changed it.
      current_status = job_queue_node_get_status(node);
      if (current_status & JOB_QUEUE_CAN_UPDATE_STATUS) {
        job_status_type new_status = queue_driver_get_status( driver , node->job_data);
        // Accumulate instead of overwrite, so that a change registered
        // by the confirm check is never lost.
        if (job_queue_status_transition(status , current_status , new_status))
          status_change = true;
        job_queue_node_set_status(node,new_status);
      }
    }
  }
  pthread_mutex_unlock( &node->data_mutex );
  return status_change;
}
Exemple #2
0
/*
  Kills the job held by 'node' on behalf of the user.

  The node is only touched if its current status is one of
  JOB_QUEUE_CAN_KILL. In that case the node status is set to
  JOB_QUEUE_USER_KILLED, the transition is registered in 'status', and
  true is returned; otherwise false is returned and nothing is changed.
  The whole operation runs while holding node->data_mutex.
*/
bool job_queue_node_kill( job_queue_node_type * node , job_queue_status_type * status , queue_driver_type * driver) {
  bool result = false;
  pthread_mutex_lock( &node->data_mutex );
  {
    job_status_type current_status = job_queue_node_get_status( node );
    if (current_status & JOB_QUEUE_CAN_KILL) {
      /*
         Jobs with status JOB_QUEUE_WAITING are killable - in the
         sense that status should be set to JOB_QUEUE_USER_KILLED; but
         they do not have any driver specific job_data, and the
         driver->kill_job() function can NOT be called.

         The job_data pointer is checked as well, so that the driver is
         never handed a NULL pointer - neither for the kill nor for the
         free call.
      */
      if ((current_status != JOB_QUEUE_WAITING) && node->job_data) {
        queue_driver_kill_job( driver , node->job_data );
        queue_driver_free_job( driver , node->job_data );
        node->job_data = NULL;
      }
      job_queue_status_transition(status, current_status, JOB_QUEUE_USER_KILLED);
      job_queue_node_set_status( node , JOB_QUEUE_USER_KILLED);
      result = true;
    }
  }
  pthread_mutex_unlock( &node->data_mutex );
  return result;
}
Exemple #3
0
/*
  Walks through all the jobs in the queue and requests a status
  transition to JOB_QUEUE_DO_KILL for every node whose current status
  is one of JOB_QUEUE_CAN_KILL. Nodes with any other status are left
  alone.
*/
static void job_queue_user_exit__( job_queue_type * queue ) {
  int job_nr = 0;

  while (job_nr < job_list_get_size( queue->job_list )) {
    job_queue_node_type * job_node = job_list_iget_job( queue->job_list , job_nr );

    if ((job_queue_node_get_status( job_node ) & JOB_QUEUE_CAN_KILL) != 0) {
      job_queue_node_status_transition( job_node , queue->status , JOB_QUEUE_DO_KILL );
    }

    job_nr++;
  }
}
Exemple #4
0
/*
  Puts the node back in the JOB_QUEUE_WAITING state: the transition
  from the current status is registered in 'status', the node status
  is updated, and the submit attempt counter of the node is reset.
  Everything happens while holding node->data_mutex.
*/
void job_queue_node_restart( job_queue_node_type * node , job_queue_status_type * status) {
  pthread_mutex_lock( &node->data_mutex );

  const job_status_type previous_status = job_queue_node_get_status( node );
  job_queue_status_transition( status , previous_status , JOB_QUEUE_WAITING );
  job_queue_node_set_status( node , JOB_QUEUE_WAITING );
  job_queue_node_reset_submit_attempt( node );

  pthread_mutex_unlock( &node->data_mutex );
}
Exemple #5
0
/*
  Attempts to move the node to 'new_status' while holding
  node->data_mutex.

  The transition from the node's current status is first handed to
  job_queue_status_transition(); the node itself is only updated when
  that call returns true. The return value of that call is passed on
  to the caller.
*/
bool job_queue_node_status_transition( job_queue_node_type * node , job_queue_status_type * status , job_status_type new_status) {
  pthread_mutex_lock( &node->data_mutex );

  const job_status_type previous = job_queue_node_get_status( node );
  const bool changed = job_queue_status_transition( status , previous , new_status );
  if (changed) {
    job_queue_node_set_status( node , new_status );
  }

  pthread_mutex_unlock( &node->data_mutex );
  return changed;
}
Exemple #6
0
/*
  Asks the driver for the current status of the job and stores it on
  the node. Runs while holding node->data_mutex.

  The driver is only queried when the node has driver specific
  job_data and the current node status is one of
  JOB_QUEUE_CAN_UPDATE_STATUS. In that case the node status is always
  overwritten with the answer from the driver, and the return value is
  whatever job_queue_status_transition() reported for the transition.
  In all other cases false is returned.
*/
bool job_queue_node_update_status( job_queue_node_type * node , job_queue_status_type * status , queue_driver_type * driver) {
  bool changed = false;

  pthread_mutex_lock( &node->data_mutex );
  if (node->job_data) {
    const job_status_type before = job_queue_node_get_status( node );
    const bool may_update = (before & JOB_QUEUE_CAN_UPDATE_STATUS) != 0;

    if (may_update) {
      const job_status_type after = queue_driver_get_status( driver , node->job_data );
      changed = job_queue_status_transition( status , before , after );
      job_queue_node_set_status( node , after );
    }
  }
  pthread_mutex_unlock( &node->data_mutex );

  return changed;
}
Exemple #7
0
/*
  Requests a kill (JOB_QUEUE_DO_KILL) for every job with status
  JOB_QUEUE_RUNNING which has run out of time. Two independent limits
  are consulted, each only when it is set to a value > 0:

    - max_job_duration: the job is expired when the time since its
      sim_start is strictly larger than the limit.

    - job_stop_time: the job is expired when the current time has
      reached the stop time.

  When neither limit is active the function returns immediately.
*/
static void job_queue_check_expired(job_queue_type * queue) {
  if (!((job_queue_get_max_job_duration(queue) > 0) || (job_queue_get_job_stop_time(queue) > 0)))
    return;

  for (int job_nr = 0; job_nr < job_list_get_size( queue->job_list ); job_nr++) {
    job_queue_node_type * job_node = job_list_iget_job( queue->job_list , job_nr );

    /* Only jobs which are actually running can expire. */
    if (job_queue_node_get_status( job_node ) != JOB_QUEUE_RUNNING)
      continue;

    time_t current_time = time(NULL);

    if (job_queue_get_max_job_duration(queue) > 0) {
      double run_seconds = difftime( current_time , job_queue_node_get_sim_start( job_node ));
      if (run_seconds > job_queue_get_max_job_duration(queue))
        job_queue_change_node_status( queue , job_node , JOB_QUEUE_DO_KILL );
    }

    if ((job_queue_get_job_stop_time(queue) > 0) && (current_time >= job_queue_get_job_stop_time(queue)))
      job_queue_change_node_status( queue , job_node , JOB_QUEUE_DO_KILL );
  }
}
Exemple #8
0
/*
  Kills the job held by 'node' on behalf of the user, while holding
  node->data_mutex.

  Nothing happens unless the current node status is one of
  JOB_QUEUE_CAN_KILL; in that case false is returned. Otherwise the
  driver is asked to kill and free the job (when there is any driver
  specific job_data), the node is set to JOB_QUEUE_USER_KILLED, the
  transition is registered in 'status' and true is returned.
*/
bool job_queue_node_kill( job_queue_node_type * node , job_queue_status_type * status , queue_driver_type * driver) {
  bool killed = false;

  pthread_mutex_lock( &node->data_mutex );

  const job_status_type status_before = job_queue_node_get_status( node );
  if ((status_before & JOB_QUEUE_CAN_KILL) != 0) {
    /*
      A job which is killed before it has been started has not been
      assigned any driver specific job data; the driver must only be
      involved when node->job_data is set.
    */
    if (node->job_data) {
      queue_driver_kill_job( driver , node->job_data );
      queue_driver_free_job( driver , node->job_data );
      node->job_data = NULL;
    }

    job_queue_status_transition( status , status_before , JOB_QUEUE_USER_KILLED );
    job_queue_node_set_status( node , JOB_QUEUE_USER_KILLED );
    killed = true;
  }

  pthread_mutex_unlock( &node->data_mutex );
  return killed;
}
Exemple #9
0
/*
  The main loop of the queue manager: repeatedly updates the status of
  all jobs, submits waiting jobs to the driver and dispatches the
  handlers for jobs which are DONE / EXIT / DO_KILL, until the queue
  is complete or a user exit has been requested.

    queue         : The queue to run; queue->run_mutex is taken with
                    trylock for the duration of the call, and
                    util_abort() is called if the lock can not be
                    acquired.

    num_total_run : When > 0 the loop stops when the number of
                    completed jobs (SUCCESS + FAILED + IS_KILLED)
                    equals this number. When == 0 the loop stops when
                    all jobs in the job_list have completed AND
                    queue->submit_complete is set.

    verbose       : When true a summary and a spinner are printed while
                    running, and a final summary / newline on exit.

  If queue->user_exit is already set on entry the main loop is skipped
  altogether. In all cases queue->open and queue->running are set to
  false before the function returns.
*/
void job_queue_run_jobs(job_queue_type * queue , int num_total_run, bool verbose) {
  int trylock = pthread_mutex_trylock( &queue->run_mutex );
  if (trylock != 0)
    /* NOTE(review): presumably util_abort() does not return - otherwise the
       unlock at the bottom would run on a mutex this thread does not hold. */
    util_abort("%s: another thread is already running the queue_manager\n",__func__);
  else if (!queue->user_exit) {
    /* OK - we have got an exclusive lock to the run_jobs code. */

    //Check if queue is open. Fails hard if not open
    job_queue_check_open(queue);

    /*
      The number of threads in the thread pool running callbacks. Memory consumption can
      potentially be quite high while running the DONE callback - should therefore not use
      too many threads.
    */
    const int NUM_WORKER_THREADS = 4;
    queue->work_pool = thread_pool_alloc( NUM_WORKER_THREADS , true );
    {
      bool new_jobs         = false;  /* Were there jobs to submit in the previous iteration? */
      bool cont             = true;   /* Loop condition; set to false when we are finished.   */
      int  phase = 0;                 /* State of the spinner printed in verbose mode.        */

      queue->running = true;
      do {
        bool local_user_exit = false;
        /* The job_list is read locked for the whole iteration; released before sleeping. */
        job_list_get_rdlock( queue->job_list );
        /*****************************************************************/
        if (queue->user_exit)  {/* An external thread has called the job_queue_user_exit() function, and we should kill
                                   all jobs, do some clearing up and go home. Observe that we will go through the
                                   queue handling codeblock below ONE LAST TIME before exiting. */
          job_queue_user_exit__( queue );
          local_user_exit = true;
        }

        job_queue_check_expired(queue);

        /*****************************************************************/
        {
          bool update_status = job_queue_update_status( queue );
          if (verbose) {
            if (update_status || new_jobs)
              job_queue_print_summary(queue , update_status );
            job_queue_update_spinner( &phase );
          }


          {
            /* Jobs counted as complete: SUCCESS, FAILED and IS_KILLED. */
            int num_complete = job_queue_status_get_count(queue->status, JOB_QUEUE_SUCCESS) +
                               job_queue_status_get_count(queue->status, JOB_QUEUE_FAILED) +
                               job_queue_status_get_count(queue->status, JOB_QUEUE_IS_KILLED);

            /* NOTE(review): the test is for equality; with num_total_run > 0 and
               num_complete != num_total_run the loop only ends on user exit. */
            if ((num_total_run > 0) && (num_total_run == num_complete))
              /* The number of jobs completed is equal to the number
                 of jobs we have said we want to run; so we are finished.
              */
              cont = false;
            else {
              if (num_total_run == 0) {
                /* We have not informed about how many jobs we will
                   run. To check if we are complete we perform the two
                   tests:

                     1. All the jobs which have been added with
                        job_queue_add_job() have completed.

                     2. The user has used job_queue_complete_submit()
                        to signal that no more jobs will be forthcoming.
                */
                if ((num_complete == job_list_get_size( queue->job_list )) && queue->submit_complete)
                  cont = false;
              }
            }
          }

          if (cont) {
            /* Submitting new jobs */
            int max_submit     = 5; /* This is the maximum number of jobs submitted in one while() { ... } below.
                                       Only to ensure that the waiting time before a status update is not too long. */
            int total_active   = job_queue_status_get_count(queue->status, JOB_QUEUE_PENDING) + job_queue_status_get_count(queue->status, JOB_QUEUE_RUNNING);
            int num_submit_new;

            {
              int max_running = job_queue_get_max_running( queue );
              if (max_running > 0)
                /* Can be <= 0 when the queue is already full; no jobs are submitted then. */
                num_submit_new = util_int_min( max_submit ,  max_running - total_active );
              else
                /*
                   If max_running == 0 that should be interpreted as no limit; i.e. the queue layer will
                   attempt to send an unlimited number of jobs to the driver - the driver can reject the jobs.
                */
                num_submit_new = util_int_min( max_submit , job_queue_status_get_count(queue->status, JOB_QUEUE_WAITING));
            }

            new_jobs = false;
            if (job_queue_status_get_count(queue->status, JOB_QUEUE_WAITING) > 0)   /* We have waiting jobs at all           */
              if (num_submit_new > 0)                                               /* The queue can allow more running jobs */
                new_jobs = true;

            if (new_jobs) {
              /* NOTE(review): submit_count is incremented below, but never read. */
              int submit_count = 0;
              int queue_index  = 0;

              /* Scan the job_list from the start and submit WAITING jobs until the
                 quota is used up, the list is exhausted, or the driver / queue
                 refuses further submits. */
              while ((queue_index < job_list_get_size( queue->job_list )) && (num_submit_new > 0)) {
                job_queue_node_type * node = job_list_iget_job( queue->job_list , queue_index );
                if (job_queue_node_get_status(node) == JOB_QUEUE_WAITING) {
                  {
                    submit_status_type submit_status = job_queue_submit_job(queue , queue_index);

                    if (submit_status == SUBMIT_OK) {
                      num_submit_new--;
                      submit_count++;
                    } else if ((submit_status == SUBMIT_DRIVER_FAIL) || (submit_status == SUBMIT_QUEUE_CLOSED))
                      break;
                  }
                }
                queue_index++;
              }
            }


            {
              /*
                Checking for complete / exited / overtime jobs; each such
                job is passed on to the handler for its current status.
               */
              int queue_index;
              for (queue_index = 0; queue_index < job_list_get_size( queue->job_list ); queue_index++) {
                job_queue_node_type * node = job_list_iget_job( queue->job_list , queue_index );

                switch (job_queue_node_get_status(node)) {
                  case(JOB_QUEUE_DONE):
                    job_queue_handle_DONE(queue, node);
                    break;
                  case(JOB_QUEUE_EXIT):
                    job_queue_handle_EXIT(queue, node);
                    break;
                  case(JOB_QUEUE_DO_KILL_NODE_FAILURE):
                    job_queue_handle_DO_KILL_NODE_FAILURE(queue, node);
                    break;
                  case(JOB_QUEUE_DO_KILL):
                    job_queue_handle_DO_KILL(queue, node);
                    break;
                  default:
                    break;
                }


              }
            }
          } else
            /* print an updated status to stdout before exiting. */
            if (verbose)
              job_queue_print_summary(queue , true);
        }
        job_list_unlock( queue->job_list );
        if (local_user_exit)
          cont = false;    /* This is how we signal that we want to get out . */
        else {
          /* Give other threads a chance before starting the next iteration. */
          util_yield();
          job_list_reader_wait( queue->job_list , queue->usleep_time , 8 * queue->usleep_time);
        }
      } while ( cont );
    }
    if (verbose)
      printf("\n");
    /* Wait for the callbacks still running in the work pool before it is freed. */
    thread_pool_join( queue->work_pool );
    thread_pool_free( queue->work_pool );
  }

  /*
    Set the queue's "open" flag to false to signal that the queue is
    not ready to be used in a new job_queue_run_jobs or
    job_queue_add_job method call as it has not been reset yet. Not
    resetting the queue here implies that the queue object is still
    available for queries after this method has finished
  */
  queue->open = false;
  queue->running = false;
  pthread_mutex_unlock( &queue->run_mutex );
}