void gw_rm_wait_success(void *_job_id) { int job_id; gw_msg_t msg; int length; gw_job_t * job; gw_connection_list_t * connection; int rc; job_id = *((int *) _job_id); length = sizeof(gw_msg_t); free(_job_id); job = gw_job_pool_get(job_id, GW_TRUE); if (job == NULL) msg.rc = GW_RC_FAILED_BAD_JOB_ID; else { if ( job->job_state == GW_JOB_STATE_FAILED) msg.rc = GW_RC_FAILED_JOB_FAIL; else msg.rc = GW_RC_SUCCESS; msg.exit_code = job->exit_code; msg.array_id = job->array_id; } msg.msg_type = GW_MSG_WAIT; msg.job_id = job_id; connection = gw_connection_list_get(&(gw_rm.connection_list), GW_MSG_WAIT, job_id); if ( connection == NULL ) gw_log_print("RM",'W',"Connection for job %i has been closed (WAIT_SUCCESS).\n", job_id); else while ( connection != NULL )/*Notify all clients waiting for this job*/ { if ( job != NULL ) job->client_waiting--; rc = send(connection->socket_fs,(void *)&msg,length,0); if ( rc == -1 ) gw_log_print("RM",'E',"Error sending message %s\n",strerror(errno)); /* If in a wait-any remove pending waits of this client */ if ( connection->wait_type == GW_MSG_WAIT_ANY ) gw_rm_wait_remove_anys(connection->socket_fs); free (connection); connection = gw_connection_list_get(&(gw_rm.connection_list), GW_MSG_WAIT, job_id); } pthread_mutex_unlock(&(job->mutex)); }
void gw_dm_failed ( void *_job_id ) { gw_job_t * job; int job_id; /* ----------------------------------------------------------- */ /* 0.- Get job pointer */ /* ----------------------------------------------------------- */ if ( _job_id != NULL ) { job_id = *( (int *) _job_id ); job = gw_job_pool_get(job_id, GW_TRUE); if ( job == NULL ) { gw_log_print("DM",'E',"Job %i does not exist (JOB_STATE_FAILED).\n",job_id); free(_job_id); return; } } else return; /* ----------------------------------------------------------- */ /* 1.- Set state */ /* ----------------------------------------------------------- */ gw_log_print("DM",'I',"Job %i failed.\n",job->id); gw_job_set_state(job, GW_JOB_STATE_FAILED, GW_FALSE); gw_job_print(job,"DM",'I',"Job failed, history:\n"); gw_job_print_history(job); job->exit_time = time(NULL); if (job->history != NULL) job->history->reason = GW_REASON_EXECUTION_ERROR; if ( job->client_waiting > 0 ) gw_am_trigger(gw_dm.rm_am,"GW_RM_WAIT_SUCCESS", _job_id); else free(_job_id); /* -------- Update Host & User running jobs -------- */ gw_user_pool_dec_running_jobs(job->user_id); pthread_mutex_lock(&(job->history->host->mutex)); job->history->host->running_jobs--; pthread_mutex_unlock(&(job->history->host->mutex)); pthread_mutex_unlock(&(job->mutex)); }
/*
 * Answers a job-status request on client_socket for job_id.
 *
 * The job is looked up with gw_job_pool_get(job_id, GW_TRUE). When it does
 * not exist, a gw_msg_job_t with rc = GW_RC_FAILED_BAD_JOB_ID and id =
 * job_id is sent and the function returns. Otherwise the message is filled
 * from the job's fields (owner, ids, task counts, user id, fixed priority,
 * deadline).
 *
 * NOTE(review): strncpy() does not NUL-terminate when the source is at
 * least GW_MSG_STRING_SHORT bytes long -- confirm msg.owner is terminated
 * elsewhere.
 * NOTE(review): the result of send() in the bad-job path is stored in rc
 * but not examined.
 * NOTE: the remainder of this definition is not visible in this view.
 */
void gw_rm_job_status(int client_socket, int job_id) { gw_job_t * job; gw_msg_job_t msg; int rc; int length; gw_history_t * tmp; job = gw_job_pool_get(job_id, GW_TRUE); length = sizeof(gw_msg_job_t); msg.msg_type = GW_MSG_JOB_STATUS; if ( job == NULL ) { msg.rc = GW_RC_FAILED_BAD_JOB_ID; msg.id = job_id; rc = send(client_socket,(void *) &msg,length,0); return; } if ( job->owner != NULL ) strncpy(msg.owner, job->owner,GW_MSG_STRING_SHORT); msg.rc = GW_RC_SUCCESS; msg.id = job->id; msg.array_id = job->array_id; msg.task_id = job->task_id; msg.total_tasks = job->total_tasks; msg.uid = job->user_id; msg.fixed_priority = job->fixed_priority; msg.deadline = job->template.deadline;
/*
 * Drops every pending GW_MSG_WAIT entry registered for client_socket.
 *
 * Each entry returned by gw_connection_list_get_by_client() is freed; when
 * the job it refers to still exists, that job's client_waiting counter is
 * decremented and its mutex released.
 */
void gw_rm_wait_remove_anys(int client_socket)
{
    gw_connection_list_t *pending;
    gw_job_t             *waited_job;

    for (pending = gw_connection_list_get_by_client(&(gw_rm.connection_list),
                                                    GW_MSG_WAIT,
                                                    client_socket);
         pending != NULL;
         pending = gw_connection_list_get_by_client(&(gw_rm.connection_list),
                                                    GW_MSG_WAIT,
                                                    client_socket))
    {
        waited_job = gw_job_pool_get(pending->job_id, GW_TRUE);

        if (waited_job != NULL)
        {
            waited_job->client_waiting--;
            pthread_mutex_unlock(&(waited_job->mutex));
        }

        /* The entry is released before the next one is fetched. */
        free(pending);
    }
}
void gw_em_cancel(void *_job_id) { int job_id; gw_job_t *job; gw_em_mad_t *mad; gw_em_state_t current_em_state; if ( _job_id != NULL ) { job_id = *( (int *) _job_id ); free(_job_id); job = gw_job_pool_get(job_id, GW_TRUE); if ( job == NULL ) { gw_log_print("EM",'E',"Job %s no longer exists (CANCEL).\n", job_id); return; } } else return; /* -------------------------------------------------------------------- */ current_em_state = job->em_state; if ( issubmitted(current_em_state) ) { gw_job_print (job,"EM",'I',"Cancelling job.\n"); mad = job->history->em_mad; /* Warning! When in Migration Cancel, the previous MAD should be used */ if (job->job_state == GW_JOB_STATE_MIGR_CANCEL) { if (job->history->next == NULL) { gw_log_print("EM",'E',"Previous history record of job %i no longer exists\n", job_id); pthread_mutex_unlock(&(job->mutex)); return; } else mad = job->history->next->em_mad; } gw_em_mad_cancel(mad, job_id); gw_log_print ("EM",'I',"Cancelling job %i.\n", job_id); } else gw_log_print ("EM",'W',"Ignoring cancel request for job %i, will re-try.\n", job_id); /* -------------------------------------------------------------------- */ pthread_mutex_unlock(&(job->mutex)); }
/*
 * Answers a job-match request on client_socket for job_id.
 *
 * When the job does not exist, a gw_msg_match_t with msg_type =
 * GW_MSG_END_JOB and rc = GW_RC_FAILED_BAD_JOB_ID is sent and the function
 * returns. Otherwise every host slot 0..gw_conf.number_of_hosts-1 is
 * visited; for each existing host the message is initialised from the
 * host's fields and each non-NULL queue name is checked against the job's
 * template requirements with gw_host_check_reqs(). Matching queues get
 * match = 1 and a rank from gw_host_compute_rank().
 *
 * NOTE: the remainder of this definition is not visible in this view.
 */
void gw_rm_job_match(int client_socket, int job_id) { int i,j,rc; int number_of_queues; gw_host_t * host; gw_job_t * job; gw_msg_match_t msg; gw_boolean_t match; int length; int gwfreenc; msg.msg_type = GW_MSG_JOB_MATCH; length = sizeof(gw_msg_match_t); job = gw_job_pool_get(job_id, GW_TRUE); if ( job == NULL ) { msg.msg_type = GW_MSG_END_JOB; msg.rc = GW_RC_FAILED_BAD_JOB_ID; msg.job_id = job_id; rc = send(client_socket,(void *) &msg,length,0); return; } for (i=0;i<gw_conf.number_of_hosts;i++) { host = gw_host_pool_get_host (i, GW_TRUE); if ( host != NULL ) { number_of_queues = 0; msg.rc = GW_RC_SUCCESS; msg.matched = GW_FALSE; msg.host_id = i; msg.job_id = job_id; msg.fixed_priority = host->fixed_priority; msg.running_jobs = host->running_jobs; gw_rm_copy_str_host(host->hostname, msg.hostname); for (j=0;j<GW_HOST_MAX_QUEUES;j++) { if (host->queue_name[j]!=NULL) { gw_rm_copy_str_short(host->queue_name[j], msg.queue_name[number_of_queues]); match = gw_host_check_reqs(host, j, job->template.requirements); if (match == GW_TRUE) { msg.matched = GW_TRUE; msg.match[number_of_queues] = 1; msg.rank [number_of_queues] = gw_host_compute_rank(host,j,job->template.rank);
void gw_dm_hold (void *_job_id) { gw_job_t * job; int job_id; /* ----------------------------------------------------------- */ /* 0.- Get job pointer */ /* ----------------------------------------------------------- */ if ( _job_id != NULL ) { job_id = *( (int *) _job_id ); job = gw_job_pool_get(job_id, GW_TRUE); if ( job == NULL ) { gw_log_print("DM",'E',"Job %i does not exist (HOLD).\n",job_id); gw_am_trigger(gw_dm.rm_am,"GW_RM_HOLD_FAILED", _job_id); return; } } else return; /* ----------------------------------------------------------- */ /* 1.- Hold the job */ /* ----------------------------------------------------------- */ switch (job->job_state) { case GW_JOB_STATE_INIT: case GW_JOB_STATE_PENDING: gw_job_set_state(job, GW_JOB_STATE_HOLD, GW_FALSE); gw_log_print("DM",'I',"Job %i held.\n", job_id); gw_am_trigger(gw_dm.rm_am,"GW_RM_HOLD_SUCCESS", _job_id); break; default: gw_log_print("DM",'W',"Job %i can not be held in current state.\n", job_id); gw_am_trigger(gw_dm.rm_am,"GW_RM_HOLD_FAILED", _job_id); break; } pthread_mutex_unlock(&(job->mutex)); }
/*
 * Dispatch Manager action: hard-kill a job.
 *
 * _job_id points to the job id; NULL is ignored. When the job does not
 * exist an error is logged and "GW_RM_KILL_FAILED" is triggered with
 * _job_id. Otherwise the function switches on job->job_state.
 *
 * In the visible part, the GW_JOB_STATE_MIGR_PROLOG / MIGR_EPILOG cases
 * update the previous history record (history->next) and have no break
 * before GW_JOB_STATE_PROLOG, so they continue into that case.
 * NOTE(review): presumably an intentional fall-through -- confirm.
 *
 * NOTE: the remainder of this definition is not visible in this view.
 */
void gw_dm_kill_hard (void *_job_id) { gw_job_t * job; int job_id; int rt; int array_id; int task_id; gw_array_t * array; char conf_filename[GW_MSG_STRING_LONG]; /* ----------------------------------------------------------- */ /* 0.- Get job pointer */ /* ----------------------------------------------------------- */ if ( _job_id != NULL ) { job_id = *( (int *) _job_id ); job = gw_job_pool_get(job_id, GW_TRUE); if ( job == NULL ) { gw_log_print("DM",'E',"Job %i does not exist (KILL_HARD).\n",job_id); gw_am_trigger(gw_dm.rm_am,"GW_RM_KILL_FAILED",_job_id); return; } } else return; /* ----------------------------------------------------------- */ /* 1.- Hard Kill the job */ /* ----------------------------------------------------------- */ switch (job->job_state) { case GW_JOB_STATE_MIGR_PROLOG: case GW_JOB_STATE_MIGR_EPILOG: gw_host_dec_rjobs(job->history->next->host); job->history->next->stats[EXIT_TIME] = time(NULL); case GW_JOB_STATE_PROLOG: gw_host_dec_uslots(job->history->host, job->template.np);
void gw_rm_resume_failed (void *_job_id) { int job_id; gw_msg_t msg; int length; gw_job_t * job; gw_connection_list_t * connection; int rc; job_id = *( (int *) _job_id ); length = sizeof(gw_msg_t); free(_job_id); job = gw_job_pool_get(job_id, GW_FALSE); if (job == NULL) msg.rc = GW_RC_FAILED_BAD_JOB_ID; else msg.rc = GW_RC_FAILED_BAD_JOB_STATE; msg.msg_type = GW_MSG_RELEASE; msg.job_id = job_id; connection = gw_connection_list_get(&(gw_rm.connection_list), GW_MSG_RESUME, job_id); if ( connection == NULL ) gw_log_print("RM",'W',"Connection for job %i has been closed (RESUME_FAILED).\n", job_id); else while ( connection != NULL ) { rc = send(connection->socket_fs, (void *) &msg, length, 0); if ( rc == -1 ) gw_log_print("RM",'E',"Error sending message %s\n",strerror(errno)); free (connection); connection = gw_connection_list_get(&(gw_rm.connection_list), GW_MSG_RELEASE, job_id); } }
void gw_dm_stopped ( void *_job_id ) { gw_job_t * job; int job_id; /* ----------------------------------------------------------- */ /* 0.- Get job pointer */ /* ----------------------------------------------------------- */ if ( _job_id != NULL ) { job_id = *( (int *) _job_id ); job = gw_job_pool_get(job_id, GW_TRUE); if ( job == NULL ) { gw_log_print("DM",'E',"Job %i does not exist (JOB_STATE_STOPPED).\n",job_id); free(_job_id); return; } } else return; /* ----------------------------------------------------------- */ /* 1.- Update Job state */ /* ----------------------------------------------------------- */ gw_job_set_state(job, GW_JOB_STATE_STOPPED, GW_FALSE); /* -------- Update Host & User running jobs -------- */ gw_user_pool_dec_running_jobs(job->user_id); gw_host_dec_rjobs(job->history->host); /* ----------------------------------------------------------- */ /* 2.- Notify Request Manager */ /* ----------------------------------------------------------- */ gw_am_trigger(gw_dm.rm_am,"GW_RM_STOP_SUCCESS", _job_id); pthread_mutex_unlock(&(job->mutex)); }
void gw_dm_migr_cancel ( void *_job_id ) { gw_job_t * job; int job_id; /* ----------------------------------------------------------- */ /* 0.- Get job pointer */ /* ----------------------------------------------------------- */ if ( _job_id != NULL ) { job_id = *( (int *) _job_id ); job = gw_job_pool_get(job_id, GW_TRUE); if ( job == NULL ) { gw_log_print("DM",'E',"Job %i does not exist (JOB_STATE_MGR_CANCEL).\n",job_id); free(_job_id); return; } } else return; /* ----------------------------------------------------------- */ /* 1.- Check we still need to migrate this job */ /* ----------------------------------------------------------- */ if ( (job->job_state == GW_JOB_STATE_WRAPPER) && (job->em_state != GW_EM_STATE_DONE)) { job->history->stats[MIGRATION_START_TIME] = time(NULL); gw_job_set_state(job, GW_JOB_STATE_MIGR_CANCEL, GW_FALSE); gw_am_trigger(gw_dm.em_am, "GW_EM_CANCEL", _job_id); } else { gw_log_print("DM",'W',"Can't migrate %i to in current state.\n",job->id); free(_job_id); } pthread_mutex_unlock(&(job->mutex)); }
void gw_dm_wait (void *_job_id) { gw_job_t * job; int job_id; /* ----------------------------------------------------------- */ /* 0.- Get job pointer */ /* ----------------------------------------------------------- */ if ( _job_id != NULL ) { job_id = *( (int *) _job_id ); job = gw_job_pool_get(job_id, GW_TRUE); if ( job == NULL ) { gw_log_print("DM",'E',"Job %i does not exist (WAIT).\n",job_id); gw_am_trigger(gw_dm.rm_am,"GW_RM_WAIT_FAILED", _job_id); return; } } else return; /* ----------------------------------------------------------- */ /* 1.- Wait for the job */ /* ----------------------------------------------------------- */ job->client_waiting++; switch (job->job_state) { case GW_JOB_STATE_ZOMBIE: case GW_JOB_STATE_FAILED: gw_am_trigger(gw_dm.rm_am,"GW_RM_WAIT_SUCCESS", _job_id); break; default: free(_job_id); break; } pthread_mutex_unlock(&(job->mutex)); }
/*
 * Dispatch Manager action: enter GW_JOB_STATE_EPILOG_STD.
 *
 * _job_id points to the job id; NULL is ignored, and the pointer is freed
 * when the job cannot be found. For an existing job the state is set, the
 * EPILOG_START_TIME statistic of the current history record is stamped
 * with time(NULL), and the job's transfer list is destroyed and
 * re-initialised for 2 transfers with the template's number_of_retries.
 *
 * NOTE: the remainder of this definition is not visible in this view.
 */
void gw_dm_epilog_std ( void *_job_id ) { gw_job_t * job; int job_id; int index; int num_xfrs; /* ----------------------------------------------------------- */ /* 0.- Get job pointer */ /* ----------------------------------------------------------- */ if ( _job_id != NULL ) { job_id = *( (int *) _job_id ); job = gw_job_pool_get(job_id, GW_TRUE); if ( job == NULL ) { gw_log_print("DM",'E',"Job %i does not exist (JOB_STATE_EPILOG_STD).\n",job_id); free(_job_id); return; } } else return; /* ----------------------------------------------------------- */ /* 1.- Set state and times */ /* ----------------------------------------------------------- */ gw_job_set_state(job, GW_JOB_STATE_EPILOG_STD, GW_FALSE); job->history->stats[EPILOG_START_TIME] = time(NULL); /* ----------------------------------------------------------- */ /* 2.- Signal the Transfer Manager */ /* ----------------------------------------------------------- */ gw_xfr_destroy (&(job->xfrs)); index = 0; num_xfrs = 2; gw_xfr_init(&(job->xfrs), num_xfrs, job->template.number_of_retries);
void gw_dm_wrapper ( void *_job_id ) { gw_job_t * job; int job_id; /* ----------------------------------------------------------- */ /* 0.- Get job pointer */ /* ----------------------------------------------------------- */ if ( _job_id != NULL ) { job_id = *( (int *) _job_id ); job = gw_job_pool_get(job_id, GW_TRUE); if ( job == NULL ) { gw_log_print("DM",'E',"Job %i does not exist (JOB_STATE_WRAPPER).\n",job_id); free(_job_id); return; } } else return; /* ----------------------------------------------------------- */ /* 1.- Set state and times */ /* ----------------------------------------------------------- */ job->history->stats[WRAPPER_START_TIME] = time(NULL); gw_job_set_state(job, GW_JOB_STATE_WRAPPER, GW_FALSE); /* ----------------------------------------------------------- */ /* 2.- Signal the Execution Manager */ /* ----------------------------------------------------------- */ job->em_state = GW_EM_STATE_INIT; gw_am_trigger(gw_dm.em_am, "GW_EM_SUBMIT", _job_id); pthread_mutex_unlock(&(job->mutex)); }
void gw_rm_job_history(int client_socket, int job_id) { gw_job_t * job; gw_msg_history_t msg; int rc; int length; gw_history_t * tmp; job = gw_job_pool_get(job_id, GW_TRUE); length = sizeof(gw_msg_history_t); msg.msg_type = GW_MSG_JOB_HISTORY; if ( job == NULL ) { msg.rc = GW_RC_FAILED_BAD_JOB_ID; rc = send(client_socket,(void *) &msg,length,0); return; } tmp = job->history; while (tmp != NULL ) { gw_rm_history_to_msg (tmp, &msg); rc = send(client_socket,(void *) &msg,length,0); if ( rc == -1 ) gw_log_print("RM",'E',"Error sending message %s\n",strerror(errno)); tmp = tmp->next; } pthread_mutex_unlock(&(job->mutex)); msg.msg_type = GW_MSG_END; msg.rc = GW_RC_SUCCESS; rc = send(client_socket,(void *) &msg,length,0); if ( rc == -1 ) gw_log_print("RM",'E',"Error sending message %s\n",strerror(errno)); }
void gw_em_timer() { int i; gw_job_t *job; time_t now; static int mark = 0; int *_job_id; gw_em_mad_t *mad; mark = mark + GW_EM_TIMER_PERIOD; if ( mark >= 300 ) { gw_log_print("EM",'I',"-- MARK --\n"); mark = 0; } now = time(NULL); for (i= 0; i< gw_conf.number_of_jobs ; i++) { job = gw_job_pool_get(i, GW_TRUE); if ( job != NULL ) { if ( job->history == NULL ) { pthread_mutex_unlock(&(job->mutex)); continue; } if ( (job->job_state == GW_JOB_STATE_PRE_WRAPPER) || (job->job_state == GW_JOB_STATE_WRAPPER) || (job->job_state == GW_JOB_STATE_MIGR_CANCEL) || (job->job_state == GW_JOB_STATE_STOP_CANCEL) || (job->job_state == GW_JOB_STATE_KILL_CANCEL)) { if (issubmitted(job->em_state)) { if ( now >= job->next_poll_time ) { gw_log_print("EM",'I',"Checking execution state of job %i.\n", i); mad = job->history->em_mad; /* Warning! When in Migration Cancel, the previous MAD should be used */ if (job->job_state == GW_JOB_STATE_MIGR_CANCEL) { if (job->history->next == NULL) { gw_log_print("EM",'E',"Previous history record of job %i no longer exists\n", i); pthread_mutex_unlock(&(job->mutex)); continue; } else mad = job->history->next->em_mad; } gw_em_mad_poll(mad, i); job->next_poll_time += gw_conf.poll_interval; /* Wait for next poll */ } } else if ((job->em_state == GW_EM_STATE_FAILED) && (job->history->counter != -1)) { job->history->counter--; if (job->history->counter == 0) { job->history->counter = -1; _job_id = (int *) malloc (sizeof(int)); *(_job_id) = i; gw_am_trigger(&(gw_em.am),"GW_EM_SUBMIT", _job_id); } } } pthread_mutex_unlock(&(job->mutex)); } } }
void gw_em_submit(void *_job_id) { int job_id; gw_job_t *job; char *rsl=NULL; char *contact; gw_job_state_t state; char rsl_filename[2048]; FILE *fd; time_t now; /* ----------------------------------------------------------- */ /* 0.- Get job pointer, check if it exits and lock mutex */ /* ----------------------------------------------------------- */ if ( _job_id != NULL ) { job_id = *( (int *) _job_id ); job = gw_job_pool_get(job_id, GW_TRUE); if ( job == NULL ) { gw_log_print("EM",'E',"Job %s no longer exists (PENDING).\n", job_id); return; } } else return; if (job->history == NULL) { gw_log_print("EM",'E',"History of job %s doesn't exists\n", job_id); free(_job_id); pthread_mutex_unlock(&(job->mutex)); return; } state = job->job_state; /* ----------------------------------------------------------- */ /* 1.- Get execution MAD for this host */ /* ----------------------------------------------------------- */ job->em_state = GW_EM_STATE_INIT; job->history->counter = -1; if ( job->job_state == GW_JOB_STATE_PRE_WRAPPER ) { contact = job->history->em_fork_rc; rsl = (char *) job->history->em_mad->pre_wrapper_rsl((void *) job); } else { contact = job->history->em_rc; rsl = (char *) job->history->em_mad->wrapper_rsl((void *) job); } if ( rsl == NULL ) { job->em_state = GW_EM_STATE_FAILED; gw_log_print("EM",'E',"Job %i failed, could not generate RSL.\n", job_id); gw_am_trigger(gw_em.dm_am, "GW_DM_WRAPPER_FAILED", _job_id); pthread_mutex_unlock(&(job->mutex)); return; } sprintf(rsl_filename, "%s/job.rsl.%i", job->directory,job->restarted); fd = fopen(rsl_filename,"w"); if (fd != NULL ) { gw_job_print(job,"EM",'I',"Submitting wrapper to %s, RSL used is in %s.\n",contact,rsl_filename); fprintf(fd,"%s",rsl); fclose(fd); } else { job->em_state = GW_EM_STATE_FAILED; gw_log_print("EM",'E',"Job %i failed, could not open RSL file.\n", job_id); gw_job_print(job,"EM",'E',"Job failed, could not open RSL file %s.\n",rsl_filename); gw_am_trigger(gw_em.dm_am, "GW_DM_WRAPPER_FAILED", 
_job_id); pthread_mutex_unlock(&(job->mutex)); return; } /* -------------------------------------------------------------------- */ now = time(NULL); job->next_poll_time = now + gw_conf.poll_interval/2 + gw_rand(gw_conf.poll_interval); /* randomize polls */ gw_job_print(job,"EM",'I',"Job will be polled in %d seconds.\n", job->next_poll_time-now); job->last_checkpoint_time = 0; job->history->stats[LAST_SUSPENSION_TIME] = now; job->history->stats[SUSPENSION_TIME] = 0; job->history->stats[ACTIVE_TIME] = 0; job->history->tries++; /* -------------------------------------------------------------------- */ pthread_mutex_unlock(&(job->mutex)); gw_em_mad_submit(job->history->em_mad, job_id, contact, rsl_filename); /* -------------------------------------------------------------------- */ free(_job_id); free(rsl); }
void gw_dm_reschedule (void *_job_id) { gw_job_t * job; int job_id; /* ----------------------------------------------------------- */ /* 0.- Get job pointer */ /* ----------------------------------------------------------- */ if ( _job_id != NULL ) { job_id = *( (int *) _job_id ); job = gw_job_pool_get(job_id, GW_TRUE); if ( job == NULL ) { gw_log_print("DM",'E',"Job %i does not exist (RE-SCHEDULE).\n",job_id); gw_am_trigger(gw_dm.rm_am,"GW_RM_RESCHEDULE_FAILED", _job_id); return; } } else return; /* ----------------------------------------------------------- */ /* 1.- re-schedule job */ /* ----------------------------------------------------------- */ switch (job->job_state) { case GW_JOB_STATE_WRAPPER: job->reschedule = GW_TRUE; job->history->reason = GW_REASON_USER_REQUESTED; gw_log_print("DM",'I',"Job %i will be re-scheduled.\n", job_id); gw_am_trigger(gw_dm.rm_am,"GW_RM_RESCHEDULE_SUCCESS",_job_id); gw_dm_mad_job_schedule(&gw_dm.dm_mad[0], job_id, job->array_id, job->user_id, GW_REASON_USER_REQUESTED); break; case GW_JOB_STATE_FAILED: gw_job_set_state(job, GW_JOB_STATE_PENDING, GW_FALSE); job->tm_state = GW_TM_STATE_INIT; job->em_state = GW_EM_STATE_INIT; job->history->reason = GW_REASON_USER_REQUESTED; job->restarted++; gw_log_print("DM",'I',"Job %i will be re-scheduled.\n", job_id); gw_am_trigger(gw_dm.rm_am,"GW_RM_RESCHEDULE_SUCCESS", _job_id); gw_dm_mad_job_schedule(&gw_dm.dm_mad[0], job_id, job->array_id, job->user_id, GW_REASON_NONE); break; default: gw_log_print("DM",'I',"Job %i can not be re-scheduled in current state.\n", job_id); gw_am_trigger(gw_dm.rm_am,"GW_RM_RESCHEDULE_FAILED", _job_id); break; } pthread_mutex_unlock(&(job->mutex)); }
void gw_dm_pending( void *_job_id ) { gw_job_t *job; int job_id; gw_boolean_t failed; gw_migration_reason_t reason; /* ----------------------------------------------------------- */ /* 0.- Get job pointer */ /* ----------------------------------------------------------- */ if ( _job_id != NULL ) { job_id = *( (int *) _job_id ); job = gw_job_pool_get(job_id, GW_TRUE); if ( job == NULL ) { gw_log_print("DM",'E',"Job %i does not exist (JOB_STATE_PENDING).\n",job_id); free(_job_id); return; } } else return; /* ----------------------------------------------------------- */ /* 1.- Set state */ /* ----------------------------------------------------------- */ gw_job_print(job,"DM",'I',"Rescheduling job.\n"); gw_log_print("DM",'I',"Rescheduling job %d.\n", job->id); gw_job_set_state (job, GW_JOB_STATE_PENDING, GW_FALSE); /* -------- Update Host & User running jobs -------- */ gw_user_pool_dec_running_jobs(job->user_id); gw_host_dec_rjobs(job->history->host); /* ------------- Restart counter --------------- */ job->restarted++; /* ------------- Notify the Scheduler --------------- */ if (job->history != NULL) { reason = GW_REASON_NONE; failed = (job->history->reason == GW_REASON_EXECUTION_ERROR) || (job->history->reason == GW_REASON_PERFORMANCE); if (failed) { gw_dm_mad_job_failed(&gw_dm.dm_mad[0], job->history->host->host_id, job->user_id, job->history->reason); } gw_dm_mad_job_schedule(&gw_dm.dm_mad[0], job_id, job->array_id, job->user_id, reason); } else gw_log_print("DM",'E',"Rescheduling job %d, but no history records found.\n", job->id); /* ------------------------------------------------- */ free(_job_id); pthread_mutex_unlock(&(job->mutex)); }
int gw_array_pool_array_allocate (const gw_msg_t * msg, int number_of_tasks, int * array_id) { int jid, i, tid; int found, tries, rc; gw_job_t *job; gw_array_t *array; pthread_mutex_lock(&(gw_array_pool.mutex)); /* ------- Check if there is enough space for the array -------- */ if ( number_of_tasks + gw_job_pool_get_num_jobs() > gw_conf.number_of_jobs ) { pthread_mutex_unlock(&(gw_array_pool.mutex)); return -1; } *array_id = ( gw_array_pool.last_array_id + 1 ) % gw_conf.number_of_arrays; found = 0; tries = 0; while(!found && (tries < gw_conf.number_of_arrays)) { found = gw_array_pool.pool[*array_id] == NULL; if(!found) { tries++; *array_id = (*array_id+1) % gw_conf.number_of_arrays; } } if (!found) { pthread_mutex_unlock(&(gw_array_pool.mutex)); return -1; } gw_array_pool.last_array_id = *array_id; gw_array_pool.pool[*array_id] = (gw_array_t *) malloc (sizeof (gw_array_t) ); array = gw_array_pool.pool[*array_id]; rc = gw_array_init (array, number_of_tasks, *array_id); if (rc != 0) { pthread_mutex_unlock(&(gw_array_pool.mutex)); return -1; } pthread_mutex_lock(&(array->mutex)); for (i = 0; i<number_of_tasks ; i++) { jid = gw_job_pool_allocate(); tid = gw_array_add_task(array, jid); job = gw_job_pool_get(jid, GW_TRUE); job->task_id = tid; job->array_id = *array_id; job->total_tasks = number_of_tasks; pthread_mutex_unlock(&(job->mutex)); } pthread_mutex_unlock(&(array->mutex)); gw_array_pool.number_of_arrays++; pthread_mutex_unlock(&(gw_array_pool.mutex)); return 0; }
void gw_dm_stop (void *_job_id) { gw_job_t * job; int job_id; /* ----------------------------------------------------------- */ /* 0.- Get job pointer */ /* ----------------------------------------------------------- */ if ( _job_id != NULL ) { job_id = *( (int *) _job_id ); job = gw_job_pool_get(job_id, GW_TRUE); if ( job == NULL ) { gw_log_print("DM",'E',"Job %i does not exist (STOP).\n",job_id); gw_am_trigger(gw_dm.rm_am,"GW_RM_STOP_FAILED", _job_id); return; } } else return; /* ----------------------------------------------------------- */ /* 1.- Stop the job */ /* ----------------------------------------------------------- */ switch (job->job_state) { case GW_JOB_STATE_STOPPED: gw_log_print("DM",'W',"Job %i already stopped.\n", job_id); gw_am_trigger(gw_dm.rm_am,"GW_RM_STOP_FAILED", _job_id); break; case GW_JOB_STATE_WRAPPER: if (job->history != NULL ) job->history->reason = GW_REASON_STOP_RESUME; gw_log_print("DM",'I',"Stopping job %i.\n", job_id); gw_job_set_state(job, GW_JOB_STATE_STOP_CANCEL, GW_FALSE); if ( job->reschedule == GW_TRUE ) { job->reschedule = GW_FALSE; gw_dm_mad_job_del(&gw_dm.dm_mad[0],job->id); } gw_am_trigger(gw_dm.em_am, "GW_EM_CANCEL", _job_id); break; default: gw_log_print("DM",'W',"Job %i can not be stopped in current state.\n", job_id); gw_am_trigger(gw_dm.rm_am,"GW_RM_STOP_FAILED", _job_id); break; } pthread_mutex_unlock(&(job->mutex)); }
/*
 * Execution Manager listener thread body.
 *
 * Enables asynchronous thread cancellation, allocates two arrays of
 * gw_conf.number_of_users * GW_MAX_MADS entries (descriptors and MAD
 * pointers) and then loops forever:
 *   1. gw_user_pool_set_em_pipes() fills the descriptor set, and select()
 *      waits for input with no timeout.
 *   2. For the control pipe gw_em.um_em_pipe_r one byte is read and
 *      discarded.
 *   3. For a MAD pipe one line is read a byte at a time, up to
 *      GW_EM_MAX_STRING-1 bytes. On a read error the MAD is reloaded with
 *      gw_em_mad_reload(); on success its jobs are recovered, on failure
 *      its mad_em_pipe is set to -1.
 *   4. The line is split with sscanf() into action, job id, result and
 *      info. Messages whose job id starts with '-' are ignored.
 *   5. The job is looked up. The message is dropped when the job does not
 *      exist, is not in an execution state, or has em_state ==
 *      GW_EM_STATE_HARD_KILL.
 *   6. The actions SUBMIT, CANCEL, POLL and RECOVER are translated into
 *      GW_EM_STATE_* triggers on gw_em.am. The heap-allocated job id is
 *      passed to the trigger, or freed on the paths that drop the message.
 *
 * NOTE(review): neither malloc() result is checked before use.
 * NOTE(review): in the visible CANCEL branch, and in the POLL/RECOVER
 * success branches when info matches none of the known states, job_id is
 * neither freed nor passed to a trigger -- check the unseen tail for a
 * leak.
 * NOTE(review): the return value of sscanf() is not examined, so a short
 * message leaves some of action/s_job_id/result/info unset.
 * NOTE: the remainder of this definition is not visible in this view.
 */
void gw_em_listener(void *arg) { fd_set in_pipes; int i,j; int *job_id; int greater, rc, rcm; char c; char info[GW_EM_MAX_INFO]; char s_job_id[GW_EM_MAX_JOB_ID]; char result[GW_EM_MAX_RESULT]; char action[GW_EM_MAX_ACTION]; char str[GW_EM_MAX_STRING]; int fd; gw_job_t *job; time_t now; char contact_file[PATH_MAX]; FILE *file; gw_em_mad_t *em_mad; char *ptmp; int *fds; int num_fds; gw_em_mad_t **em_mads; pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL); pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL); fds = (int *) malloc(sizeof(int)*gw_conf.number_of_users*GW_MAX_MADS); em_mads = (gw_em_mad_t **) malloc(sizeof(gw_em_mad_t *) * gw_conf.number_of_users * GW_MAX_MADS); while (1) { greater = gw_user_pool_set_em_pipes(&in_pipes, fds, &num_fds, em_mads, gw_em.um_em_pipe_r); rc = select( greater+1, &in_pipes, NULL, NULL, NULL); if ( rc <= 0 ) continue; for (i=0; i<num_fds; i++) { fd = fds[i]; if ( FD_ISSET(fd, &in_pipes) ) { if ( fd == gw_em.um_em_pipe_r ) { rc = read(fd, (void *) &c, sizeof(char)); #ifdef GWEMDEBUG gw_log_print("EM",'D',"Updating MAD pipes (action is %c)\n",c); #endif continue; } #ifdef GWEMDEBUG gw_log_print("EM",'D',"Reading from MAD pipe %i.\n",i); #endif j = 0; do { rc = read(fd, (void *) &c, sizeof(char)); str[j++] = c; } while ((rc > 0) && (c != '\n') && (j < (GW_EM_MAX_STRING-1))); str[j] = '\0'; if (rc <= 0) { gw_log_print("EM",'W',"Error reading MAD (%s) message\n", em_mads[i]->name); rcm = gw_em_mad_reload(em_mads[i]); if ( rcm == 0 ) { gw_log_print("EM",'I',"MAD (%s) successfully reloaded\n", em_mads[i]->name); gw_job_pool_em_recover(em_mads[i], &(gw_em.am)); } else { gw_log_print("EM",'E',"Error reloading MAD (%s)\n", em_mads[i]->name); em_mads[i]->mad_em_pipe = -1; } continue; } sscanf(str,"%" GW2STR(GW_EM_MAX_ACTION) "s %" GW2STR(GW_EM_MAX_JOB_ID) "s %" GW2STR(GW_EM_MAX_RESULT) "s %" GW2STR(GW_EM_MAX_INFO) "[^\n]", action, s_job_id, result, info); #ifdef GWEMDEBUG gw_log_print("EM",'D',"MAD message received:\"%s %s %s %s\".\n", 
action, s_job_id, result, info); #endif if (s_job_id[0] == '-') continue; job_id = (int *) malloc (sizeof(int)); *job_id = atoi(s_job_id); job = gw_job_pool_get(*job_id, GW_TRUE); if (job == NULL) { gw_log_print("EM",'W',"MAD message for job %s, but it does not exist: \"%s %s %s %s\".\n", s_job_id,action, s_job_id, result, info); free(job_id); continue; } if (job->job_state != GW_JOB_STATE_PRE_WRAPPER && job->job_state != GW_JOB_STATE_WRAPPER && job->job_state != GW_JOB_STATE_MIGR_CANCEL && job->job_state != GW_JOB_STATE_STOP_CANCEL && job->job_state != GW_JOB_STATE_KILL_CANCEL) { gw_log_print("EM",'W',"MAD message for job %i but not in an execution state.\n", *job_id); free(job_id); pthread_mutex_unlock(&(job->mutex)); continue; } else if ( job->em_state == GW_EM_STATE_HARD_KILL ) { gw_log_print("EM",'W',"MAD message for job %i but it is being killed (hard).\n", *job_id); free(job_id); pthread_mutex_unlock(&(job->mutex)); continue; } if (strcmp(action, "SUBMIT") == 0) { if (strcmp(result, "FAILURE") == 0) { gw_job_print(job, "EM",'E',"Job submission failed: %s\n", info); gw_log_print("EM",'E',"Submission of job %d failed: %s.\n", job->id, info); gw_am_trigger(&(gw_em.am), "GW_EM_STATE_FAILED", (void *) job_id); } else /* Save persistent job contact */ { snprintf(contact_file, PATH_MAX-1, GW_VAR_DIR "/%i/job.contact", job->id); file = fopen(contact_file, "w"); if ( file != NULL ) { fprintf(file, "%s\n", info); fclose(file); } gw_am_trigger(&(gw_em.am), "GW_EM_STATE_PENDING", (void *) job_id); } } else if (strcmp(action, "CANCEL") == 0) { if (strcmp(result, "SUCCESS") == 0) { gw_job_print(job, "EM",'I',"Job cancel succeeded.\n"); gw_log_print("EM",'I',"Cancel of job %i succeeded.\n", *job_id); } else { gw_job_print(job, "EM",'E',"Job cancel failed (%s).\n",info); gw_log_print("EM",'E',"Cancel of job %d failed: %s.\n",job->id, info); } } else if (strcmp(action, "POLL") == 0) { if (strcmp(result, "SUCCESS") == 0) { now = time(NULL); job->next_poll_time = now + 
gw_conf.poll_interval/2 + gw_rand(gw_conf.poll_interval); gw_job_print(job, "EM",'E',"Job poll OK (%s), will poll again in %d seconds.\n", info, job->next_poll_time - now); if (strcmp(info, "PENDING") == 0) gw_am_trigger(&(gw_em.am), "GW_EM_STATE_PENDING", (void *) job_id); else if (strcmp(info, "SUSPENDED") == 0) gw_am_trigger(&(gw_em.am), "GW_EM_STATE_SUSPENDED", (void *) job_id); else if (strcmp(info, "ACTIVE") == 0) gw_am_trigger(&(gw_em.am),"GW_EM_STATE_ACTIVE", (void *) job_id); else if (strstr(info, "DONE") != NULL) { ptmp = strstr(info,"DONE:"); if ((ptmp != NULL) && (strlen(ptmp+5) > 0))/*No-wrapper mode*/ job->exit_code=atoi(ptmp+5); gw_am_trigger(&(gw_em.am), "GW_EM_STATE_DONE", (void *) job_id); } else if (strcmp(info, "FAILED") == 0) gw_am_trigger(&(gw_em.am), "GW_EM_STATE_FAILED", (void *) job_id); } else { job->history->failed_polls++; em_mad = job->history->em_mad; if ( job->history->failed_polls == 3 ) { gw_job_print(job, "EM",'E',"Job poll failed (%s), assuming the job is done.\n",info); gw_am_trigger(&(gw_em.am), "GW_EM_STATE_DONE", (void *) job_id); } else { now = time(NULL); job->next_poll_time = now + gw_conf.poll_interval*job->history->failed_polls + gw_rand(gw_conf.poll_interval*job->history->failed_polls); gw_job_print(job, "EM",'E',"Job poll failed (%s), will poll again in %d seconds.\n", info, job->next_poll_time - now); free(job_id); } } } else if (strcmp(action, "RECOVER") == 0) { if (strcmp(result, "SUCCESS") == 0) { if (strcmp(info, "PENDING") == 0) gw_am_trigger(&(gw_em.am), "GW_EM_STATE_PENDING", (void *) job_id); else if (strcmp(info, "SUSPENDED") == 0) gw_am_trigger(&(gw_em.am), "GW_EM_STATE_SUSPENDED", (void *) job_id); else if (strcmp(info, "ACTIVE") == 0) gw_am_trigger(&(gw_em.am),"GW_EM_STATE_ACTIVE", (void *) job_id); else if (strstr(info, "DONE") != NULL) { ptmp = strstr(info,"DONE:"); if ((ptmp != NULL) && (strlen(ptmp+5) > 0))/*No-wrapper mode*/ job->exit_code = atoi(ptmp+5); gw_am_trigger(&(gw_em.am), "GW_EM_STATE_DONE", 
(void *) job_id); } else if (strcmp(info, "FAILED") == 0) { /* Do not retry */ job->history->tries= job->template.number_of_retries; gw_am_trigger(&(gw_em.am), "GW_EM_STATE_FAILED", (void *) job_id); } } else { gw_job_print(job,"EM",'E',"Job recover failed (%s), assuming the job is done.\n", info); gw_log_print("EM",'E',"Recover of job %i failed.\n", *job_id); gw_am_trigger(&(gw_em.am), "GW_EM_STATE_DONE", (void *) job_id); } }
void gw_dm_zombie ( void *_job_id ) { gw_job_t * job; gw_array_t * array; int job_id; int task_id; int array_id; int rt; char conf_filename[2048]; time_t prolog, epilog; /* ----------------------------------------------------------- */ /* 0.- Get job pointer */ /* ----------------------------------------------------------- */ if ( _job_id == NULL ) return; job_id = *( (int *) _job_id ); job = gw_job_pool_get(job_id, GW_TRUE); if ( job == NULL ) { gw_log_print("DM",'E',"Job %i does not exist (JOB_STATE_ZOMBIE).\n",job_id); free(_job_id); return; } /* ----------------------------------------------------------- */ /* 0.- Update Job state */ /* ----------------------------------------------------------- */ switch (job->job_state) { case GW_JOB_STATE_EPILOG: gw_job_set_state(job, GW_JOB_STATE_ZOMBIE, GW_FALSE); gw_log_print("DM",'I',"Job %i done, with exit code %i.\n",job->id, job->exit_code); job->history->reason = GW_REASON_NONE; job->exit_time = time(NULL); /* ------------- Print job history and send usage ------------ */ gw_job_print(job,"DM",'I',"Job done, history:\n"); gw_job_print_history(job); gw_job_send_usage(job); if ( job->client_waiting > 0 ) gw_am_trigger(gw_dm.rm_am,"GW_RM_WAIT_SUCCESS", _job_id); else { if (gw_conf.dispose == GW_TRUE) gw_am_trigger(&(gw_dm.am), "GW_DM_KILL", _job_id); else free(_job_id); } /* -------- Update User & Host running jobs -------- */ gw_user_pool_dec_running_jobs(job->user_id); gw_host_dec_rjobs(job->history->host); /* -------- Notify the scheduler -------- */ prolog = gw_job_history_get_prolog_time(job->history); epilog = gw_job_history_get_epilog_time(job->history); gw_dm_mad_job_success(&gw_dm.dm_mad[0], job->history->host->host_id, job->user_id, (prolog + epilog), job->history->stats[SUSPENSION_TIME], job->history->stats[ACTIVE_TIME]); pthread_mutex_unlock(&(job->mutex)); /* -------- Update other jobs dependencies -------- */ gw_job_pool_dep_check(job_id); break; case GW_JOB_STATE_KILL_EPILOG: gw_job_set_state(job, 
GW_JOB_STATE_ZOMBIE, GW_FALSE); job->exit_time = time(NULL); /* ------------- Print job history and send usage ------------ */ gw_job_print(job,"DM",'I',"Job killed, history:\n"); gw_job_print_history(job); gw_job_send_usage(job); /* ---------------- Free job & Notify RM ---------------- */ array_id = job->array_id; task_id = job->task_id; /* -------- Update User & Host running jobs -------- */ gw_user_pool_dec_running_jobs(job->user_id); gw_host_dec_rjobs(job->history->host); sprintf(conf_filename, "%s/job.conf", job->directory); unlink(conf_filename); pthread_mutex_unlock(&(job->mutex)); /* ------------------------------------------------- */ gw_job_pool_free(job_id); gw_log_print("DM",'I',"Job %i killed and freed.\n", job_id); if (array_id != -1) { array = gw_array_pool_get_array(array_id,GW_TRUE); if ( array != NULL ) { rt = gw_array_del_task(array,task_id); pthread_mutex_unlock(&(array->mutex)); if (rt == 0) { gw_array_pool_array_free(array_id); gw_log_print("DM",'I',"Array %i freed\n",array_id); } } else gw_log_print("DM",'E',"Could not delete task %i from array %i.\n", task_id, array_id); } gw_am_trigger(gw_dm.rm_am,"GW_RM_KILL_SUCCESS", _job_id); break; default: gw_log_print("DM",'E',"Zombie callback in wrong job (%i) state.\n", job_id); free(_job_id); pthread_mutex_unlock(&(job->mutex)); break; } }
void gw_dm_wrapper_done_cb ( void *_job_id ) { gw_job_t * job; int job_id; time_t total; time_t active; time_t suspension; /* ----------------------------------------------------------- */ /* 0.- Get job pointer */ /* ----------------------------------------------------------- */ if ( _job_id != NULL ) { job_id = *( (int *) _job_id ); job = gw_job_pool_get(job_id, GW_TRUE); if ( job == NULL ) { gw_log_print("DM",'E',"Job %i does not exist (JOB_STATE_WRAPPER_DONE_CB).\n",job_id); free(_job_id); return; } } else return; /* ----------------------------------------------------------- */ /* 1.- Set execution times & state transition */ /* ----------------------------------------------------------- */ job->em_state = GW_EM_STATE_INIT; switch (job->job_state) { case GW_JOB_STATE_PRE_WRAPPER: /* --------------- Update pre-wrapper stats -------------------- */ job->history->stats[PRE_WRAPPER_EXIT_TIME] = time(NULL); total = gw_job_history_get_pre_wrapper_time(job->history); active = job->history->stats[ACTIVE_TIME]; suspension = job->history->stats[SUSPENSION_TIME]; gw_job_print(job,"DM",'I',"Pre-Wrapper DONE:\n"); gw_job_print(job,"DM",'I',"\tActive time : %i\n", active); gw_job_print(job,"DM",'I',"\tSuspension time : %i\n", suspension); gw_job_print(job,"DM",'I',"\tTotal time : %i\n", total); /* -------------- Transition to Wrapper state ------------------ */ gw_am_trigger(&(gw_dm.am), "GW_DM_STATE_WRAPPER", _job_id); break; case GW_JOB_STATE_WRAPPER: /* ----------------- Update wrapper stats ---------------------- */ job->history->stats[WRAPPER_EXIT_TIME] = time(NULL); total = gw_job_history_get_wrapper_time(job->history); active = job->history->stats[ACTIVE_TIME]; suspension = job->history->stats[SUSPENSION_TIME]; gw_job_print(job,"DM",'I',"Wrapper DONE:\n"); gw_job_print(job,"DM",'I',"\tActive time : %i\n", active); gw_job_print(job,"DM",'I',"\tSuspension time : %i\n", suspension); gw_job_print(job,"DM",'I',"\tTotal time : %i\n", total); /* -------------- Free used 
slot from this host -------------- */ gw_host_dec_uslots(job->history->host); /* ---------- We do not need to re-schedule this job --------- */ if ( job->reschedule == GW_TRUE ) { job->reschedule = GW_FALSE; gw_dm_mad_job_del(&gw_dm.dm_mad[0],job->id); } /* -------------- Transition to Epilog state ------------------ */ gw_am_trigger(&(gw_dm.am), "GW_DM_STATE_EPILOG_STD", _job_id); break; case GW_JOB_STATE_STOP_CANCEL: /* ----------------- Update wrapper stats ---------------------- */ job->history->stats[WRAPPER_EXIT_TIME] = time(NULL); total = gw_job_history_get_wrapper_time(job->history); active = job->history->stats[ACTIVE_TIME]; suspension = job->history->stats[SUSPENSION_TIME]; gw_job_print(job,"DM",'I',"Wrapper CANCELED:\n"); gw_job_print(job,"DM",'I',"\tActive time : %i\n", active); gw_job_print(job,"DM",'I',"\tSuspension time : %i\n", suspension); gw_job_print(job,"DM",'I',"\tTotal time : %i\n", total); /* -------------- Free used slot from this host -------------- */ gw_host_dec_uslots(job->history->host); /* ------------ Transition to Stop Epilog state --------------- */ gw_am_trigger(&(gw_dm.am), "GW_DM_STATE_STOP_EPILOG", _job_id); break; case GW_JOB_STATE_KILL_CANCEL: /* ----------------- Update wrapper stats ---------------------- */ job->history->stats[WRAPPER_EXIT_TIME] = time(NULL); total = gw_job_history_get_wrapper_time(job->history); active = job->history->stats[ACTIVE_TIME]; suspension = job->history->stats[SUSPENSION_TIME]; gw_job_print(job,"DM",'I',"Wrapper CANCELED:\n"); gw_job_print(job,"DM",'I',"\tActive time : %i\n", active); gw_job_print(job,"DM",'I',"\tSuspension time : %i\n", suspension); gw_job_print(job,"DM",'I',"\tTotal time : %i\n", total); /* -------------- Free used slot from this host -------------- */ gw_host_dec_uslots(job->history->host); /* ------------ Transition to Kill Epilog state ---------------- */ gw_am_trigger(&(gw_dm.am), "GW_DM_STATE_KILL_EPILOG", _job_id); break; case GW_JOB_STATE_MIGR_CANCEL: /* ----------- 
Update previous wrapper stats ------------------- */ job->history->next->stats[WRAPPER_EXIT_TIME] = time(NULL); active = job->history->next->stats[ACTIVE_TIME]; suspension = job->history->next->stats[SUSPENSION_TIME]; gw_job_print(job,"DM",'I',"Wrapper CANCELED:\n"); gw_job_print(job,"DM",'I',"\tActive time : %i\n", active); gw_job_print(job,"DM",'I',"\tSuspension time : %i\n", suspension); /* -------------- Free used slot from previous host ------------ */ gw_host_dec_uslots(job->history->next->host); /* ---------- Transition to Migration Prolog state ------------ */ gw_am_trigger(&(gw_dm.am), "GW_DM_STATE_MIGR_PROLOG", _job_id); break; default: gw_log_print("DM",'E',"Wrapper done callback in wrong job (%i) state.\n", job_id); free(_job_id); break; } pthread_mutex_unlock(&(job->mutex)); }
void gw_dm_aalloc (void *_msg) { gw_msg_t *msg; gw_array_t *array; gw_job_t *job; gw_boolean_t useradd; gw_job_state_t init_state; int tasks; int rc; int array_id, i, jid, uid; msg = (gw_msg_t *) _msg; useradd = gw_user_pool_exists (msg->owner, &uid) == GW_FALSE; if (useradd) { rc = gw_user_pool_user_allocate (msg->owner, &uid); if ( rc != 0 ) { gw_log_print("DM",'E',"Could not register user %s.\n", msg->owner); msg->rc = GW_RC_FAILED_USER; gw_am_trigger(gw_dm.rm_am,"GW_RM_SUBMIT",_msg); return; } } rc = gw_array_pool_array_allocate(msg, msg->number_of_tasks, &array_id); if ( rc != 0 ) { gw_log_print("DM",'E',"Could not allocate array.\n"); msg->rc = GW_RC_FAILED; gw_am_trigger(gw_dm.rm_am,"GW_RM_SUBMIT",_msg); return; } array = gw_array_pool_get_array(array_id, GW_TRUE); if ( array == NULL ) { msg->rc = GW_RC_FAILED; gw_am_trigger(gw_dm.rm_am,"GW_RM_SUBMIT",_msg); return; } init_state = msg->init_state; tasks = msg->number_of_tasks; for (i=0; i<msg->number_of_tasks; i++) { jid = array->job_ids[i]; job = gw_job_pool_get(jid, GW_TRUE); gw_job_fill(job, msg); if ( job == NULL ) { pthread_mutex_unlock(&(array->mutex)); msg->rc = GW_RC_FAILED; gw_am_trigger(gw_dm.rm_am,"GW_RM_SUBMIT",_msg); return; } if ( init_state == GW_JOB_STATE_PENDING ) gw_job_set_state(job, GW_JOB_STATE_PENDING, GW_FALSE); else gw_job_set_state(job, GW_JOB_STATE_HOLD, GW_FALSE); job->tm_state = GW_TM_STATE_INIT; job->em_state = GW_EM_STATE_INIT; job->user_id = uid; job->pstart = msg->pstart; job->pinc = msg->pinc; pthread_mutex_unlock (&(job->mutex)); if ( msg->jt.job_deps[0] != -1 ) gw_job_pool_dep_set(jid, msg->jt.job_deps); } gw_user_pool_inc_jobs(uid,msg->number_of_tasks - useradd); pthread_mutex_unlock(&(array->mutex)); msg->rc = GW_RC_SUCCESS; msg->array_id = array_id; gw_am_trigger(gw_dm.rm_am,"GW_RM_SUBMIT",_msg); gw_log_print("DM",'I',"New array %i allocated and initialized.\n",array_id); if ( init_state == GW_JOB_STATE_PENDING ) gw_dm_mad_array_schedule(&gw_dm.dm_mad[0], -1, array_id, 
GW_REASON_NONE, 0, uid, tasks); }
void gw_dm_kill_hard (void *_job_id) { gw_job_t * job; int job_id; int rt; int array_id; int task_id; gw_array_t * array; char conf_filename[2048]; /* ----------------------------------------------------------- */ /* 0.- Get job pointer */ /* ----------------------------------------------------------- */ if ( _job_id != NULL ) { job_id = *( (int *) _job_id ); job = gw_job_pool_get(job_id, GW_TRUE); if ( job == NULL ) { gw_log_print("DM",'E',"Job %i does not exist (KILL_HARD).\n",job_id); gw_am_trigger(gw_dm.rm_am,"GW_RM_KILL_FAILED",_job_id); return; } } else return; /* ----------------------------------------------------------- */ /* 1.- Hard Kill the job */ /* ----------------------------------------------------------- */ switch (job->job_state) { case GW_JOB_STATE_MIGR_PROLOG: case GW_JOB_STATE_MIGR_EPILOG: pthread_mutex_lock(&(job->history->next->host->mutex)); job->history->next->host->running_jobs--; pthread_mutex_unlock(&(job->history->next->host->mutex)); job->history->next->stats[EXIT_TIME] = time(NULL); case GW_JOB_STATE_PROLOG: pthread_mutex_lock(&(job->history->host->mutex)); job->history->host->used_slots--; pthread_mutex_unlock(&(job->history->host->mutex)); case GW_JOB_STATE_EPILOG: case GW_JOB_STATE_EPILOG_STD: case GW_JOB_STATE_EPILOG_RESTART: case GW_JOB_STATE_EPILOG_FAIL: job->history->reason = GW_REASON_KILL; case GW_JOB_STATE_STOP_EPILOG: case GW_JOB_STATE_KILL_EPILOG: pthread_mutex_lock(&(job->history->host->mutex)); job->history->host->running_jobs--; pthread_mutex_unlock(&(job->history->host->mutex)); job->exit_time = time(NULL); job->history->stats[EXIT_TIME] = time(NULL); job->tm_state = GW_TM_STATE_HARD_KILL; if (job->history != NULL) { gw_log_print("DM",'I',"Cancelling prolog/epilog transfers of job %i.\n", job_id); gw_tm_mad_end(job->history->tm_mad, job->id); } break; case GW_JOB_STATE_PRE_WRAPPER: case GW_JOB_STATE_WRAPPER: job->history->reason = GW_REASON_KILL; pthread_mutex_lock(&(job->history->host->mutex)); 
job->history->host->used_slots--; job->history->host->running_jobs--; pthread_mutex_unlock(&(job->history->host->mutex)); job->exit_time = time(NULL); job->history->stats[EXIT_TIME] = time(NULL); job->em_state = GW_EM_STATE_HARD_KILL; if (job->history != NULL) { gw_log_print("DM",'I',"Cancelling execution of job %i.\n", job_id); gw_em_mad_cancel(job->history->em_mad, job_id); } break; case GW_JOB_STATE_MIGR_CANCEL: pthread_mutex_lock(&(job->history->next->host->mutex)); job->history->next->host->used_slots--; job->history->next->host->running_jobs--; pthread_mutex_unlock(&(job->history->next->host->mutex)); job->history->next->stats[EXIT_TIME] = time(NULL); job->history->reason = GW_REASON_KILL; case GW_JOB_STATE_STOP_CANCEL: case GW_JOB_STATE_KILL_CANCEL: pthread_mutex_lock(&(job->history->host->mutex)); job->history->host->used_slots--; job->history->host->running_jobs--; pthread_mutex_unlock(&(job->history->host->mutex)); job->exit_time = time(NULL); job->history->stats[EXIT_TIME] = time(NULL); job->em_state = GW_EM_STATE_HARD_KILL; break; case GW_JOB_STATE_INIT: case GW_JOB_STATE_PENDING: case GW_JOB_STATE_HOLD: case GW_JOB_STATE_STOPPED: job->exit_time = time(NULL); break; case GW_JOB_STATE_FAILED: case GW_JOB_STATE_ZOMBIE: break; default: gw_log_print("DM",'W',"Job %i can not be killed in current state.\n", job_id); gw_am_trigger(gw_dm.rm_am,"GW_RM_KILL_FAILED", _job_id); pthread_mutex_unlock(&(job->mutex)); return; } array_id = job->array_id; task_id = job->task_id; sprintf(conf_filename, "%s/job.conf", job->directory); unlink(conf_filename); pthread_mutex_unlock(&(job->mutex)); gw_job_pool_free(job_id); gw_log_print("DM",'I',"Job %i killed (hard) and freed.\n", job_id); if (array_id != -1) { array = gw_array_pool_get_array(array_id, GW_TRUE); if ( array != NULL ) { rt = gw_array_del_task(array,task_id); pthread_mutex_unlock(&(array->mutex)); if (rt == 0) { gw_array_pool_array_free(array_id); gw_log_print("DM",'I',"Array %i freed.\n",array_id); } } else 
gw_log_print("DM",'E',"Array %i does not exisit (KILL - task %i).\n", array_id, task_id); } gw_am_trigger(gw_dm.rm_am,"GW_RM_KILL_SUCCESS", _job_id); }
void gw_dm_kill (void *_job_id) { gw_job_t * job; int job_id; int rt; int array_id; int task_id; gw_array_t * array; char conf_filename[2048]; /* ----------------------------------------------------------- */ /* 0.- Get job pointer */ /* ----------------------------------------------------------- */ if ( _job_id != NULL ) { job_id = *( (int *) _job_id ); job = gw_job_pool_get(job_id, GW_TRUE); if ( job == NULL ) { gw_log_print("DM",'E',"Job %i does not exist (KILL).\n",job_id); gw_am_trigger(gw_dm.rm_am,"GW_RM_KILL_FAILED", _job_id); return; } } else return; /* ----------------------------------------------------------- */ /* 1.- Kill the job */ /* ----------------------------------------------------------- */ switch (job->job_state) { case GW_JOB_STATE_INIT: case GW_JOB_STATE_PENDING: case GW_JOB_STATE_HOLD: case GW_JOB_STATE_STOPPED: job->exit_time = time(NULL); case GW_JOB_STATE_FAILED: case GW_JOB_STATE_ZOMBIE: array_id = job->array_id; task_id = job->task_id; sprintf(conf_filename, "%s/job.conf", job->directory); unlink(conf_filename); pthread_mutex_unlock(&(job->mutex)); gw_job_pool_free(job_id); gw_log_print("DM",'I',"Job %i killed and freed.\n", job_id); if (array_id != -1) { array = gw_array_pool_get_array(array_id, GW_TRUE); if ( array != NULL ) { rt = gw_array_del_task(array,task_id); pthread_mutex_unlock(&(array->mutex)); if (rt == 0) { gw_array_pool_array_free(array_id); gw_log_print("DM",'I',"Array %i freed.\n",array_id); } } else gw_log_print("DM",'E',"Array %i does not exisit (KILL - task %i).\n", array_id, task_id); } gw_am_trigger(gw_dm.rm_am,"GW_RM_KILL_SUCCESS", _job_id); break; case GW_JOB_STATE_WRAPPER: if (job->history != NULL ) job->history->reason = GW_REASON_KILL; gw_log_print("DM",'I',"Killing job %i.\n", job_id); gw_job_set_state(job, GW_JOB_STATE_KILL_CANCEL, GW_FALSE); gw_am_trigger(gw_dm.em_am, "GW_EM_CANCEL", _job_id); pthread_mutex_unlock(&(job->mutex)); break; default: gw_log_print("DM",'W',"Job %i can not be killed in current 
state.\n", job_id); gw_am_trigger(gw_dm.rm_am,"GW_RM_KILL_FAILED", _job_id); pthread_mutex_unlock(&(job->mutex)); break; } }
void gw_dm_wrapper_done_cb ( void *_job_id ) { gw_job_t * job; int job_id; time_t total; time_t active; time_t suspension; /* ----------------------------------------------------------- */ /* 0.- Get job pointer */ /* ----------------------------------------------------------- */ if ( _job_id != NULL ) { job_id = *( (int *) _job_id ); job = gw_job_pool_get(job_id, GW_TRUE); if ( job == NULL ) { gw_log_print("DM",'E',"Job %i does not exist (JOB_STATE_WRAPPER_DONE_CB).\n",job_id); free(_job_id); return; } } else return; /* ----------------------------------------------------------- */ /* 1.- Set execution times & state transition */ /* ----------------------------------------------------------- */ job->em_state = GW_EM_STATE_INIT; switch (job->job_state) { case GW_JOB_STATE_PRE_WRAPPER: /* --------------- Update pre-wrapper stats -------------------- */ job->history->stats[PRE_WRAPPER_EXIT_TIME] = time(NULL); total = gw_job_history_get_pre_wrapper_time(job->history); active = job->history->stats[ACTIVE_TIME]; suspension = job->history->stats[SUSPENSION_TIME]; gw_job_print(job,"DM",'I',"Pre-Wrapper DONE:\n"); gw_job_print(job,"DM",'I',"\tActive time : %i\n", active); gw_job_print(job,"DM",'I',"\tSuspension time : %i\n", suspension); gw_job_print(job,"DM",'I',"\tTotal time : %i\n", total); /* -------------- Transition to Wrapper state ------------------ */ gw_am_trigger(&(gw_dm.am), "GW_DM_STATE_WRAPPER", _job_id); break; case GW_JOB_STATE_WRAPPER: /* ----------------- Update wrapper stats ---------------------- */ job->history->stats[WRAPPER_EXIT_TIME] = time(NULL); total = gw_job_history_get_wrapper_time(job->history); active = job->history->stats[ACTIVE_TIME]; suspension = job->history->stats[SUSPENSION_TIME]; gw_job_print(job,"DM",'I',"Wrapper DONE:\n"); gw_job_print(job,"DM",'I',"\tActive time : %i\n", active); gw_job_print(job,"DM",'I',"\tSuspension time : %i\n", suspension); gw_job_print(job,"DM",'I',"\tTotal time : %i\n", total); /* -------------- Free used 
slot from this host -------------- */ gw_host_dec_uslots(job->history->host, job->template.np);
void gw_dm_wrapper_failed_cb ( void *_job_id ) { gw_job_t * job; int job_id; time_t total; /* ----------------------------------------------------------- */ /* 0.- Get job pointer */ /* ----------------------------------------------------------- */ if ( _job_id != NULL ) { job_id = *( (int *) _job_id ); job = gw_job_pool_get(job_id, GW_TRUE); if ( job == NULL ) { gw_log_print("DM",'E',"Job %i does not exist (JOB_STATE_WRAPPER_FAILED_CB).\n",job_id); free(_job_id); return; } } else return; /* ----------------------------------------------------------- */ /* 1.- Set execution times & state transition */ /* ----------------------------------------------------------- */ job->em_state = GW_EM_STATE_INIT; switch (job->job_state) { case GW_JOB_STATE_PRE_WRAPPER: /* --------------- Update pre-wrapper stats -------------------- */ job->history->stats[PRE_WRAPPER_EXIT_TIME] = time(NULL); total = gw_job_history_get_pre_wrapper_time(job->history); gw_job_print(job,"DM",'E',"Pre-Wrapper failed:\n"); gw_job_print(job,"DM",'E',"\tTotal time : %i\n", total); break; case GW_JOB_STATE_WRAPPER: /* ----------------- Update wrapper stats ---------------------- */ job->history->stats[WRAPPER_EXIT_TIME] = time(NULL); total = gw_job_history_get_wrapper_time(job->history); gw_job_print(job,"DM",'E',"Wrapper failed:\n"); gw_job_print(job,"DM",'E',"\tTotal time : %i\n", total); /* ---------- We do not need to re-schedule this job --------- */ if ( job->reschedule == GW_TRUE ) { job->reschedule = GW_FALSE; gw_dm_mad_job_del(&gw_dm.dm_mad[0],job->id); } break; default: gw_log_print("DM",'E',"Wrapper failed callback in wrong job (%i) state.\n", job_id); break; } /* ----------------------------------------------------------- */ /* 1.- State transtition */ /* ----------------------------------------------------------- */ /* -------------- Free used slot from this host -------------- */ if (job->history != NULL) { job->history->reason = GW_REASON_EXECUTION_ERROR; 
gw_host_dec_uslots(job->history->host); } /* ----------------------------------------------------------- */ gw_am_trigger(&(gw_dm.am), "GW_DM_STATE_EPILOG_FAIL", _job_id); pthread_mutex_unlock(&(job->mutex)); }
void gw_dm_jalloc (void *_msg) { gw_msg_t * msg; gw_job_t * job; int jid; int uid; int rc; gw_boolean_t useradd; gw_job_state_t init_state; msg = (gw_msg_t *) _msg; /* ------------- Check if user is already registered ------------ */ useradd = gw_user_pool_exists (msg->owner, &uid) == GW_FALSE; if (useradd) { rc = gw_user_pool_user_allocate (msg->owner, &uid); if ( rc != 0 ) { gw_log_print("DM",'E',"Could not register user %s.\n", msg->owner); msg->rc = GW_RC_FAILED_USER; gw_am_trigger(gw_dm.rm_am,"GW_RM_SUBMIT",_msg); return; } } /* ------------- Allocate job structure ------------ */ jid = gw_job_pool_allocate(); if ( jid == -1 ) { gw_log_print("DM",'E',"Could not allocate job.\n"); msg->rc = GW_RC_FAILED; gw_am_trigger(gw_dm.rm_am,"GW_RM_SUBMIT",_msg); return; } /* ------------------------------------------ */ /* Update job data */ /* ------------------------------------------ */ job = gw_job_pool_get(jid, GW_TRUE); if ( job == NULL ) { msg->rc = GW_RC_FAILED; gw_am_trigger(gw_dm.rm_am,"GW_RM_SUBMIT",_msg); return; } /* ------ Fill data using the template ------- */ rc = gw_job_fill(job, msg); if ( rc == -1 ) { gw_log_print("DM",'E',"Could not initialize job.\n"); pthread_mutex_unlock(&(job->mutex)); gw_job_pool_free(jid); msg->rc = GW_RC_FAILED; gw_am_trigger(gw_dm.rm_am,"GW_RM_SUBMIT",_msg); return; } /* --------- Set the initial state ---------- */ init_state = msg->init_state; if ( init_state == GW_JOB_STATE_PENDING ) gw_job_set_state(job, GW_JOB_STATE_PENDING, GW_FALSE); else gw_job_set_state(job, GW_JOB_STATE_HOLD, GW_FALSE); job->tm_state = GW_TM_STATE_INIT; job->em_state = GW_EM_STATE_INIT; job->user_id = uid; pthread_mutex_unlock(&(job->mutex)); if (!useradd) gw_user_pool_inc_jobs(uid,1); if ( msg->jt.job_deps[0] != -1 ) gw_job_pool_dep_set(jid, msg->jt.job_deps); /* ------------- Callback msg ------------ */ msg->rc = GW_RC_SUCCESS; msg->array_id = -1; msg->job_id = jid; gw_am_trigger(gw_dm.rm_am,"GW_RM_SUBMIT",_msg); /* ------------- Notify the 
scheduler ------------ */ if ( init_state == GW_JOB_STATE_PENDING ) gw_dm_mad_job_schedule(&gw_dm.dm_mad[0], jid, -1, GW_REASON_NONE, job->nice, uid); gw_log_print("DM",'I',"New job %i allocated and initialized.\n", jid); }