// if status = running, and current_time > sim_start + max_confirm_wait // (usually 2 min), check if job is confirmed running (status_file exists). // If not confirmed, set job to JOB_QUEUE_FAILED. bool job_queue_node_update_status( job_queue_node_type * node , job_queue_status_type * status , queue_driver_type * driver ) { bool status_change = false; pthread_mutex_lock(&node->data_mutex); { if (node->job_data) { job_status_type current_status = job_queue_node_get_status(node); bool confirmed = job_queue_node_status_update_confirmed_running__(node); if ((current_status & JOB_QUEUE_RUNNING) && !confirmed) { // it's running, but not confirmed running. double runtime = job_queue_node_time_since_sim_start(node); if (runtime >= node->max_confirm_wait) { // max_confirm_wait has passed since sim_start without success; the job is dead job_status_type new_status = JOB_QUEUE_EXIT; status_change = job_queue_status_transition(status, current_status, new_status); job_queue_node_set_status(node, new_status); } } current_status = job_queue_node_get_status(node); if (current_status & JOB_QUEUE_CAN_UPDATE_STATUS) { job_status_type new_status = queue_driver_get_status( driver , node->job_data); status_change = job_queue_status_transition(status , current_status , new_status); job_queue_node_set_status(node,new_status); } } } pthread_mutex_unlock( &node->data_mutex ); return status_change; }
/*
  Kills the job held by @node, holding node->data_mutex throughout.

  If the current status has a bit in JOB_QUEUE_CAN_KILL the node is
  moved to JOB_QUEUE_USER_KILLED, the transition is registered in
  @status and true is returned. Otherwise nothing happens and false is
  returned.

  Jobs with status JOB_QUEUE_WAITING are killable - in the sense that
  the status should be set to JOB_QUEUE_USER_KILLED - but they do not
  have any driver specific job_data, and queue_driver_kill_job() can
  NOT be called for them.
*/
bool job_queue_node_kill( job_queue_node_type * node , job_queue_status_type * status , queue_driver_type * driver) {
  bool result = false;
  pthread_mutex_lock( &node->data_mutex );
  {
    job_status_type current_status = job_queue_node_get_status( node );
    if (current_status & JOB_QUEUE_CAN_KILL) {

      /* The job_data pointer is checked *before* it is handed to the
         driver; the driver is never called with a NULL job_data. */
      if ((current_status != JOB_QUEUE_WAITING) && node->job_data) {
        queue_driver_kill_job( driver , node->job_data );
        queue_driver_free_job( driver , node->job_data );
        node->job_data = NULL;
      }

      job_queue_status_transition(status, current_status, JOB_QUEUE_USER_KILLED);
      job_queue_node_set_status( node , JOB_QUEUE_USER_KILLED);
      result = true;
    }
  }
  pthread_mutex_unlock( &node->data_mutex );
  return result;
}
/*
  Walks through all the nodes in queue->job_list and requests the
  transition to JOB_QUEUE_DO_KILL for every node whose current status
  has a bit in JOB_QUEUE_CAN_KILL. The size of the job list is queried
  again before each node is inspected.
*/
static void job_queue_user_exit__( job_queue_type * queue ) {
  int job_nr = 0;

  while (job_nr < job_list_get_size( queue->job_list )) {
    job_queue_node_type * job = job_list_iget_job( queue->job_list , job_nr );

    if (job_queue_node_get_status( job ) & JOB_QUEUE_CAN_KILL)
      job_queue_node_status_transition( job , queue->status , JOB_QUEUE_DO_KILL );

    job_nr++;
  }
}
/*
  Puts @node back in the JOB_QUEUE_WAITING state. While holding
  node->data_mutex the transition from the current status to
  JOB_QUEUE_WAITING is registered in @status, the node status is set
  and the submit attempt bookkeeping of the node is reset.
*/
void job_queue_node_restart( job_queue_node_type * node , job_queue_status_type * status) {
  pthread_mutex_lock( &node->data_mutex );

  const job_status_type status_before = job_queue_node_get_status( node );
  job_queue_status_transition( status , status_before , JOB_QUEUE_WAITING );

  job_queue_node_set_status( node , JOB_QUEUE_WAITING );
  job_queue_node_reset_submit_attempt( node );

  pthread_mutex_unlock( &node->data_mutex );
}
/*
  Tries to move @node to @new_status while holding node->data_mutex.

  The transition from the current node status is first registered in
  @status; the status stored on the node is only overwritten when
  job_queue_status_transition() reports a change. The return value is
  the value returned from job_queue_status_transition().
*/
bool job_queue_node_status_transition( job_queue_node_type * node , job_queue_status_type * status , job_status_type new_status) {
  bool changed;

  pthread_mutex_lock( &node->data_mutex );

  const job_status_type previous = job_queue_node_get_status( node );
  changed = job_queue_status_transition( status , previous , new_status );
  if (changed) {
    job_queue_node_set_status( node , new_status );
  }

  pthread_mutex_unlock( &node->data_mutex );
  return changed;
}
/*
  Asks the driver for the current status of the job in @node, holding
  node->data_mutex throughout.

  The driver is only consulted when the node has driver specific
  job_data and the current status has a bit in
  JOB_QUEUE_CAN_UPDATE_STATUS. In that case the transition is
  registered in @status and the node status is set to the value
  reported by the driver - also when no change was registered.

  Returns the value from job_queue_status_transition(), or false if
  the driver was not consulted.
*/
bool job_queue_node_update_status( job_queue_node_type * node , job_queue_status_type * status , queue_driver_type * driver) {
  bool changed = false;

  pthread_mutex_lock( &node->data_mutex );
  if (node->job_data) {
    const job_status_type node_status = job_queue_node_get_status( node );

    if (node_status & JOB_QUEUE_CAN_UPDATE_STATUS) {
      const job_status_type driver_status = queue_driver_get_status( driver , node->job_data );

      changed = job_queue_status_transition( status , node_status , driver_status );
      job_queue_node_set_status( node , driver_status );
    }
  }
  pthread_mutex_unlock( &node->data_mutex );

  return changed;
}
/*
  Requests JOB_QUEUE_DO_KILL for running jobs which have expired.

  A job with status JOB_QUEUE_RUNNING is expired when either:

    1. A positive max_job_duration is set, and the time elapsed since
       the sim_start of the node exceeds it.

    2. A positive job_stop_time is set, and the current time has
       reached it.

  The two tests are independent; a node satisfying both will get two
  status change requests. If neither limit is positive the function
  returns without looking at the job list.
*/
static void job_queue_check_expired(job_queue_type * queue) {
  if ((job_queue_get_max_job_duration(queue) <= 0) && (job_queue_get_job_stop_time(queue) <= 0))
    return;

  int job_nr = 0;
  while (job_nr < job_list_get_size( queue->job_list )) {
    job_queue_node_type * job = job_list_iget_job( queue->job_list , job_nr );
    job_nr++;

    /* Only jobs which are actually running can expire. */
    if (job_queue_node_get_status( job ) != JOB_QUEUE_RUNNING)
      continue;

    time_t current_time = time(NULL);

    if ((job_queue_get_max_job_duration(queue) > 0) &&
        (difftime(current_time , job_queue_node_get_sim_start( job )) > job_queue_get_max_job_duration(queue)))
      job_queue_change_node_status(queue , job , JOB_QUEUE_DO_KILL);

    if ((job_queue_get_job_stop_time(queue) > 0) &&
        (current_time >= job_queue_get_job_stop_time(queue)))
      job_queue_change_node_status(queue , job , JOB_QUEUE_DO_KILL);
  }
}
/*
  Kills the job held by @node, holding node->data_mutex throughout.

  Returns false, without doing anything, unless the current status has
  a bit in JOB_QUEUE_CAN_KILL. Otherwise the node ends up as
  JOB_QUEUE_USER_KILLED, the transition is registered in @status and
  true is returned.

  A job which is killed before it has been started has not been
  assigned any driver specific job data; the driver is therefore only
  involved when node->job_data is set.
*/
bool job_queue_node_kill( job_queue_node_type * node , job_queue_status_type * status , queue_driver_type * driver) {
  bool killed = false;

  pthread_mutex_lock( &node->data_mutex );

  const job_status_type status_before = job_queue_node_get_status( node );
  if (status_before & JOB_QUEUE_CAN_KILL) {
    void * driver_data = node->job_data;
    if (driver_data) {
      queue_driver_kill_job( driver , driver_data );
      queue_driver_free_job( driver , driver_data );
      node->job_data = NULL;
    }

    job_queue_status_transition( status , status_before , JOB_QUEUE_USER_KILLED );
    job_queue_node_set_status( node , JOB_QUEUE_USER_KILLED );
    killed = true;
  }

  pthread_mutex_unlock( &node->data_mutex );
  return killed;
}
/*
  The main loop of the queue manager.

  @queue         : The queue to run.
  @num_total_run : The number of jobs we intend to run; the loop stops
                   when this many jobs are counted as complete. The
                   value 0 means "not known", in which case the loop
                   stops when all the jobs in the job list are
                   complete and queue->submit_complete is set.
  @verbose       : If true a summary and a spinner are printed while
                   running.

  Only one thread can run the queue at a time; this is enforced with a
  trylock on queue->run_mutex, and util_abort() is called if the mutex
  is already taken. If queue->user_exit is already set on entry the
  loop is skipped altogether.

  Each pass through the loop is performed while holding the read lock
  on the job list:

    1. If queue->user_exit is set: request a kill of all killable jobs.
    2. Request a kill of expired jobs.
    3. Update the status of the jobs.
    4. Check if we are complete.
    5. If not complete: submit waiting jobs (at most max_submit in one
       pass), and then dispatch the nodes with status DONE, EXIT,
       DO_KILL_NODE_FAILURE and DO_KILL to their handlers.

  On return queue->open and queue->running are both false.

  NOTE(review): the closing pthread_mutex_unlock() is also reached from
  the branch where the trylock failed; that is only harmless if
  util_abort() never returns - confirm.
*/
void job_queue_run_jobs(job_queue_type * queue , int num_total_run, bool verbose) {
  int trylock = pthread_mutex_trylock( &queue->run_mutex );
  if (trylock != 0)
    util_abort("%s: another thread is already running the queue_manager\n",__func__);
  else if (!queue->user_exit) {
    /* OK - we have got an exclusive lock to the run_jobs code. */

    // Check if queue is open. Fails hard if not open
    job_queue_check_open(queue);

    /*
      The number of threads in the thread pool running callbacks. Memory
      consumption can potentially be quite high while running the DONE
      callback - should therefore not use too many threads.
    */
    const int NUM_WORKER_THREADS = 4;
    queue->work_pool = thread_pool_alloc( NUM_WORKER_THREADS , true );
    {
      bool new_jobs = false;   /* Carried over to the next pass, where it can trigger a summary printout. */
      bool cont     = true;
      int  phase    = 0;       /* State of the spinner. */

      queue->running = true;
      do {
        bool local_user_exit = false;
        job_list_get_rdlock( queue->job_list );
        /*****************************************************************/
        if (queue->user_exit) {
          /*
            An external thread has called the job_queue_user_exit()
            function, and we should kill all jobs, do some clearing up
            and go home. Observe that we will go through the queue
            handling codeblock below ONE LAST TIME before exiting.
          */
          job_queue_user_exit__( queue );
          local_user_exit = true;
        }

        job_queue_check_expired(queue);

        /*****************************************************************/
        {
          bool update_status = job_queue_update_status( queue );
          if (verbose) {
            if (update_status || new_jobs)
              job_queue_print_summary(queue , update_status );
            job_queue_update_spinner( &phase );
          }

          {
            /* Jobs with status SUCCESS, FAILED and IS_KILLED are counted as complete. */
            int num_complete = job_queue_status_get_count(queue->status, JOB_QUEUE_SUCCESS) +
                               job_queue_status_get_count(queue->status, JOB_QUEUE_FAILED) +
                               job_queue_status_get_count(queue->status, JOB_QUEUE_IS_KILLED);

            if ((num_total_run > 0) && (num_total_run == num_complete))
              /*
                The number of jobs completed is equal to the number of
                jobs we have said we want to run; so we are finished.
              */
              cont = false;
            else {
              if (num_total_run == 0) {
                /*
                  We have not informed about how many jobs we will
                  run. To check if we are complete we perform the two
                  tests:

                    1. All the jobs which have been added with
                       job_queue_add_job() have completed.

                    2. The user has used job_queue_complete_submit()
                       to signal that no more jobs will be forthcoming.
                */
                if ((num_complete == job_list_get_size( queue->job_list )) && queue->submit_complete)
                  cont = false;
              }
            }
          }

          if (cont) {
            /* Submitting new jobs */
            int max_submit     = 5; /* This is the maximum number of jobs submitted in one while() { ... } below.
                                       Only to ensure that the waiting time before a status update is not too long. */
            int total_active   = job_queue_status_get_count(queue->status, JOB_QUEUE_PENDING) + job_queue_status_get_count(queue->status, JOB_QUEUE_RUNNING);
            int num_submit_new;

            {
              int max_running = job_queue_get_max_running( queue );
              if (max_running > 0)
                /* Can become zero or negative when total_active >= max_running; nothing is submitted then. */
                num_submit_new = util_int_min( max_submit , max_running - total_active );
              else
                /*
                  If max_running == 0 that should be interpreted as no
                  limit; i.e. the queue layer will attempt to send an
                  unlimited number of jobs to the driver - the driver
                  can reject the jobs.
                */
                num_submit_new = util_int_min( max_submit , job_queue_status_get_count(queue->status, JOB_QUEUE_WAITING));
            }

            new_jobs = false;
            if (job_queue_status_get_count(queue->status, JOB_QUEUE_WAITING) > 0)   /* We have waiting jobs at all */
              if (num_submit_new > 0)                                               /* The queue can allow more running jobs */
                new_jobs = true;

            if (new_jobs) {
              int submit_count = 0;   /* Incremented on successful submit, but never read in this function. */
              int queue_index  = 0;

              while ((queue_index < job_list_get_size( queue->job_list )) && (num_submit_new > 0)) {
                job_queue_node_type * node = job_list_iget_job( queue->job_list , queue_index );
                if (job_queue_node_get_status(node) == JOB_QUEUE_WAITING) {
                  {
                    submit_status_type submit_status = job_queue_submit_job(queue , queue_index);

                    if (submit_status == SUBMIT_OK) {
                      num_submit_new--;
                      submit_count++;
                    } else if ((submit_status == SUBMIT_DRIVER_FAIL) || (submit_status == SUBMIT_QUEUE_CLOSED))
                      /* Stop submitting in this pass; any other submit status just moves on to the next node. */
                      break;
                  }
                }
                queue_index++;
              }
            }

            {
              /* Checking for complete / exited / overtime jobs */
              int queue_index;
              for (queue_index = 0; queue_index < job_list_get_size( queue->job_list ); queue_index++) {
                job_queue_node_type * node = job_list_iget_job( queue->job_list , queue_index );

                switch (job_queue_node_get_status(node)) {
                  case(JOB_QUEUE_DONE):
                    job_queue_handle_DONE(queue, node);
                    break;
                  case(JOB_QUEUE_EXIT):
                    job_queue_handle_EXIT(queue, node);
                    break;
                  case(JOB_QUEUE_DO_KILL_NODE_FAILURE):
                    job_queue_handle_DO_KILL_NODE_FAILURE(queue, node);
                    break;
                  case(JOB_QUEUE_DO_KILL):
                    job_queue_handle_DO_KILL(queue, node);
                    break;
                  default:
                    break;
                }
              }
            }
          } else
            /* print an updated status to stdout before exiting. */
            if (verbose)
              job_queue_print_summary(queue , true);
        }
        job_list_unlock( queue->job_list );

        if (local_user_exit)
          cont = false;    /* This is how we signal that we want to get out. */
        else {
          /* Give other threads a chance before the next pass. */
          util_yield();
          job_list_reader_wait( queue->job_list , queue->usleep_time , 8 * queue->usleep_time);
        }
      } while ( cont );
    }
    if (verbose)
      printf("\n");

    /* Wait for the callbacks running in the thread pool before it is discarded. */
    thread_pool_join( queue->work_pool );
    thread_pool_free( queue->work_pool );
  }

  /*
    Set the queue's "open" flag to false to signal that the queue is
    not ready to be used in a new job_queue_run_jobs or
    job_queue_add_job method call as it has not been reset yet. Not
    resetting the queue here implies that the queue object is still
    available for queries after this method has finished.
  */
  queue->open = false;
  queue->running = false;
  pthread_mutex_unlock( &queue->run_mutex );
}