void gw_dm_hold (void *_job_id) { gw_job_t * job; int job_id; /* ----------------------------------------------------------- */ /* 0.- Get job pointer */ /* ----------------------------------------------------------- */ if ( _job_id != NULL ) { job_id = *( (int *) _job_id ); job = gw_job_pool_get(job_id, GW_TRUE); if ( job == NULL ) { gw_log_print("DM",'E',"Job %i does not exist (HOLD).\n",job_id); gw_am_trigger(gw_dm.rm_am,"GW_RM_HOLD_FAILED", _job_id); return; } } else return; /* ----------------------------------------------------------- */ /* 1.- Hold the job */ /* ----------------------------------------------------------- */ switch (job->job_state) { case GW_JOB_STATE_INIT: case GW_JOB_STATE_PENDING: gw_job_set_state(job, GW_JOB_STATE_HOLD, GW_FALSE); gw_log_print("DM",'I',"Job %i held.\n", job_id); gw_am_trigger(gw_dm.rm_am,"GW_RM_HOLD_SUCCESS", _job_id); break; default: gw_log_print("DM",'W',"Job %i can not be held in current state.\n", job_id); gw_am_trigger(gw_dm.rm_am,"GW_RM_HOLD_FAILED", _job_id); break; } pthread_mutex_unlock(&(job->mutex)); }
void gw_dm_failed ( void *_job_id ) { gw_job_t * job; int job_id; /* ----------------------------------------------------------- */ /* 0.- Get job pointer */ /* ----------------------------------------------------------- */ if ( _job_id != NULL ) { job_id = *( (int *) _job_id ); job = gw_job_pool_get(job_id, GW_TRUE); if ( job == NULL ) { gw_log_print("DM",'E',"Job %i does not exist (JOB_STATE_FAILED).\n",job_id); free(_job_id); return; } } else return; /* ----------------------------------------------------------- */ /* 1.- Set state */ /* ----------------------------------------------------------- */ gw_log_print("DM",'I',"Job %i failed.\n",job->id); gw_job_set_state(job, GW_JOB_STATE_FAILED, GW_FALSE); gw_job_print(job,"DM",'I',"Job failed, history:\n"); gw_job_print_history(job); job->exit_time = time(NULL); if (job->history != NULL) job->history->reason = GW_REASON_EXECUTION_ERROR; if ( job->client_waiting > 0 ) gw_am_trigger(gw_dm.rm_am,"GW_RM_WAIT_SUCCESS", _job_id); else free(_job_id); /* -------- Update Host & User running jobs -------- */ gw_user_pool_dec_running_jobs(job->user_id); pthread_mutex_lock(&(job->history->host->mutex)); job->history->host->running_jobs--; pthread_mutex_unlock(&(job->history->host->mutex)); pthread_mutex_unlock(&(job->mutex)); }
void gw_dm_wait (void *_job_id) { gw_job_t * job; int job_id; /* ----------------------------------------------------------- */ /* 0.- Get job pointer */ /* ----------------------------------------------------------- */ if ( _job_id != NULL ) { job_id = *( (int *) _job_id ); job = gw_job_pool_get(job_id, GW_TRUE); if ( job == NULL ) { gw_log_print("DM",'E',"Job %i does not exist (WAIT).\n",job_id); gw_am_trigger(gw_dm.rm_am,"GW_RM_WAIT_FAILED", _job_id); return; } } else return; /* ----------------------------------------------------------- */ /* 1.- Wait for the job */ /* ----------------------------------------------------------- */ job->client_waiting++; switch (job->job_state) { case GW_JOB_STATE_ZOMBIE: case GW_JOB_STATE_FAILED: gw_am_trigger(gw_dm.rm_am,"GW_RM_WAIT_SUCCESS", _job_id); break; default: free(_job_id); break; } pthread_mutex_unlock(&(job->mutex)); }
void gw_dm_kill_hard (void *_job_id) { gw_job_t * job; int job_id; int rt; int array_id; int task_id; gw_array_t * array; char conf_filename[GW_MSG_STRING_LONG]; /* ----------------------------------------------------------- */ /* 0.- Get job pointer */ /* ----------------------------------------------------------- */ if ( _job_id != NULL ) { job_id = *( (int *) _job_id ); job = gw_job_pool_get(job_id, GW_TRUE); if ( job == NULL ) { gw_log_print("DM",'E',"Job %i does not exist (KILL_HARD).\n",job_id); gw_am_trigger(gw_dm.rm_am,"GW_RM_KILL_FAILED",_job_id); return; } } else return; /* ----------------------------------------------------------- */ /* 1.- Hard Kill the job */ /* ----------------------------------------------------------- */ switch (job->job_state) { case GW_JOB_STATE_MIGR_PROLOG: case GW_JOB_STATE_MIGR_EPILOG: gw_host_dec_rjobs(job->history->next->host); job->history->next->stats[EXIT_TIME] = time(NULL); case GW_JOB_STATE_PROLOG: gw_host_dec_uslots(job->history->host, job->template.np);
void gw_dm_stopped ( void *_job_id ) { gw_job_t * job; int job_id; /* ----------------------------------------------------------- */ /* 0.- Get job pointer */ /* ----------------------------------------------------------- */ if ( _job_id != NULL ) { job_id = *( (int *) _job_id ); job = gw_job_pool_get(job_id, GW_TRUE); if ( job == NULL ) { gw_log_print("DM",'E',"Job %i does not exist (JOB_STATE_STOPPED).\n",job_id); free(_job_id); return; } } else return; /* ----------------------------------------------------------- */ /* 1.- Update Job state */ /* ----------------------------------------------------------- */ gw_job_set_state(job, GW_JOB_STATE_STOPPED, GW_FALSE); /* -------- Update Host & User running jobs -------- */ gw_user_pool_dec_running_jobs(job->user_id); gw_host_dec_rjobs(job->history->host); /* ----------------------------------------------------------- */ /* 2.- Notify Request Manager */ /* ----------------------------------------------------------- */ gw_am_trigger(gw_dm.rm_am,"GW_RM_STOP_SUCCESS", _job_id); pthread_mutex_unlock(&(job->mutex)); }
void gw_dm_migr_cancel ( void *_job_id ) { gw_job_t * job; int job_id; /* ----------------------------------------------------------- */ /* 0.- Get job pointer */ /* ----------------------------------------------------------- */ if ( _job_id != NULL ) { job_id = *( (int *) _job_id ); job = gw_job_pool_get(job_id, GW_TRUE); if ( job == NULL ) { gw_log_print("DM",'E',"Job %i does not exist (JOB_STATE_MGR_CANCEL).\n",job_id); free(_job_id); return; } } else return; /* ----------------------------------------------------------- */ /* 1.- Check we still need to migrate this job */ /* ----------------------------------------------------------- */ if ( (job->job_state == GW_JOB_STATE_WRAPPER) && (job->em_state != GW_EM_STATE_DONE)) { job->history->stats[MIGRATION_START_TIME] = time(NULL); gw_job_set_state(job, GW_JOB_STATE_MIGR_CANCEL, GW_FALSE); gw_am_trigger(gw_dm.em_am, "GW_EM_CANCEL", _job_id); } else { gw_log_print("DM",'W',"Can't migrate %i to in current state.\n",job->id); free(_job_id); } pthread_mutex_unlock(&(job->mutex)); }
void gw_dm_wrapper ( void *_job_id ) { gw_job_t * job; int job_id; /* ----------------------------------------------------------- */ /* 0.- Get job pointer */ /* ----------------------------------------------------------- */ if ( _job_id != NULL ) { job_id = *( (int *) _job_id ); job = gw_job_pool_get(job_id, GW_TRUE); if ( job == NULL ) { gw_log_print("DM",'E',"Job %i does not exist (JOB_STATE_WRAPPER).\n",job_id); free(_job_id); return; } } else return; /* ----------------------------------------------------------- */ /* 1.- Set state and times */ /* ----------------------------------------------------------- */ job->history->stats[WRAPPER_START_TIME] = time(NULL); gw_job_set_state(job, GW_JOB_STATE_WRAPPER, GW_FALSE); /* ----------------------------------------------------------- */ /* 2.- Signal the Execution Manager */ /* ----------------------------------------------------------- */ job->em_state = GW_EM_STATE_INIT; gw_am_trigger(gw_dm.em_am, "GW_EM_SUBMIT", _job_id); pthread_mutex_unlock(&(job->mutex)); }
void gw_dm_wrapper_failed_cb ( void *_job_id ) { gw_job_t * job; int job_id; time_t total; /* ----------------------------------------------------------- */ /* 0.- Get job pointer */ /* ----------------------------------------------------------- */ if ( _job_id != NULL ) { job_id = *( (int *) _job_id ); job = gw_job_pool_get(job_id, GW_TRUE); if ( job == NULL ) { gw_log_print("DM",'E',"Job %i does not exist (JOB_STATE_WRAPPER_FAILED_CB).\n",job_id); free(_job_id); return; } } else return; /* ----------------------------------------------------------- */ /* 1.- Set execution times & state transition */ /* ----------------------------------------------------------- */ job->em_state = GW_EM_STATE_INIT; switch (job->job_state) { case GW_JOB_STATE_PRE_WRAPPER: /* --------------- Update pre-wrapper stats -------------------- */ job->history->stats[PRE_WRAPPER_EXIT_TIME] = time(NULL); total = gw_job_history_get_pre_wrapper_time(job->history); gw_job_print(job,"DM",'E',"Pre-Wrapper failed:\n"); gw_job_print(job,"DM",'E',"\tTotal time : %i\n", total); break; case GW_JOB_STATE_WRAPPER: /* ----------------- Update wrapper stats ---------------------- */ job->history->stats[WRAPPER_EXIT_TIME] = time(NULL); total = gw_job_history_get_wrapper_time(job->history); gw_job_print(job,"DM",'E',"Wrapper failed:\n"); gw_job_print(job,"DM",'E',"\tTotal time : %i\n", total); /* ---------- We do not need to re-schedule this job --------- */ if ( job->reschedule == GW_TRUE ) { job->reschedule = GW_FALSE; gw_dm_mad_job_del(&gw_dm.dm_mad[0],job->id); } break; default: gw_log_print("DM",'E',"Wrapper failed callback in wrong job (%i) state.\n", job_id); break; } /* ----------------------------------------------------------- */ /* 1.- State transtition */ /* ----------------------------------------------------------- */ /* -------------- Free used slot from this host -------------- */ if (job->history != NULL) { job->history->reason = GW_REASON_EXECUTION_ERROR; gw_host_dec_uslots(job->history->host); } /* ----------------------------------------------------------- */ gw_am_trigger(&(gw_dm.am), "GW_DM_STATE_EPILOG_FAIL", _job_id); pthread_mutex_unlock(&(job->mutex)); }
void gw_dm_kill (void *_job_id) { gw_job_t * job; int job_id; int rt; int array_id; int task_id; gw_array_t * array; char conf_filename[2048]; /* ----------------------------------------------------------- */ /* 0.- Get job pointer */ /* ----------------------------------------------------------- */ if ( _job_id != NULL ) { job_id = *( (int *) _job_id ); job = gw_job_pool_get(job_id, GW_TRUE); if ( job == NULL ) { gw_log_print("DM",'E',"Job %i does not exist (KILL).\n",job_id); gw_am_trigger(gw_dm.rm_am,"GW_RM_KILL_FAILED", _job_id); return; } } else return; /* ----------------------------------------------------------- */ /* 1.- Kill the job */ /* ----------------------------------------------------------- */ switch (job->job_state) { case GW_JOB_STATE_INIT: case GW_JOB_STATE_PENDING: case GW_JOB_STATE_HOLD: case GW_JOB_STATE_STOPPED: job->exit_time = time(NULL); case GW_JOB_STATE_FAILED: case GW_JOB_STATE_ZOMBIE: array_id = job->array_id; task_id = job->task_id; sprintf(conf_filename, "%s/job.conf", job->directory); unlink(conf_filename); pthread_mutex_unlock(&(job->mutex)); gw_job_pool_free(job_id); gw_log_print("DM",'I',"Job %i killed and freed.\n", job_id); if (array_id != -1) { array = gw_array_pool_get_array(array_id, GW_TRUE); if ( array != NULL ) { rt = gw_array_del_task(array,task_id); pthread_mutex_unlock(&(array->mutex)); if (rt == 0) { gw_array_pool_array_free(array_id); gw_log_print("DM",'I',"Array %i freed.\n",array_id); } } else gw_log_print("DM",'E',"Array %i does not exisit (KILL - task %i).\n", array_id, task_id); } gw_am_trigger(gw_dm.rm_am,"GW_RM_KILL_SUCCESS", _job_id); break; case GW_JOB_STATE_WRAPPER: if (job->history != NULL ) job->history->reason = GW_REASON_KILL; gw_log_print("DM",'I',"Killing job %i.\n", job_id); gw_job_set_state(job, GW_JOB_STATE_KILL_CANCEL, GW_FALSE); gw_am_trigger(gw_dm.em_am, "GW_EM_CANCEL", _job_id); pthread_mutex_unlock(&(job->mutex)); break; default: gw_log_print("DM",'W',"Job %i can not be killed in current state.\n", job_id); gw_am_trigger(gw_dm.rm_am,"GW_RM_KILL_FAILED", _job_id); pthread_mutex_unlock(&(job->mutex)); break; } }
void gw_dm_kill_hard (void *_job_id) { gw_job_t * job; int job_id; int rt; int array_id; int task_id; gw_array_t * array; char conf_filename[2048]; /* ----------------------------------------------------------- */ /* 0.- Get job pointer */ /* ----------------------------------------------------------- */ if ( _job_id != NULL ) { job_id = *( (int *) _job_id ); job = gw_job_pool_get(job_id, GW_TRUE); if ( job == NULL ) { gw_log_print("DM",'E',"Job %i does not exist (KILL_HARD).\n",job_id); gw_am_trigger(gw_dm.rm_am,"GW_RM_KILL_FAILED",_job_id); return; } } else return; /* ----------------------------------------------------------- */ /* 1.- Hard Kill the job */ /* ----------------------------------------------------------- */ switch (job->job_state) { case GW_JOB_STATE_MIGR_PROLOG: case GW_JOB_STATE_MIGR_EPILOG: pthread_mutex_lock(&(job->history->next->host->mutex)); job->history->next->host->running_jobs--; pthread_mutex_unlock(&(job->history->next->host->mutex)); job->history->next->stats[EXIT_TIME] = time(NULL); case GW_JOB_STATE_PROLOG: pthread_mutex_lock(&(job->history->host->mutex)); job->history->host->used_slots--; pthread_mutex_unlock(&(job->history->host->mutex)); case GW_JOB_STATE_EPILOG: case GW_JOB_STATE_EPILOG_STD: case GW_JOB_STATE_EPILOG_RESTART: case GW_JOB_STATE_EPILOG_FAIL: job->history->reason = GW_REASON_KILL; case GW_JOB_STATE_STOP_EPILOG: case GW_JOB_STATE_KILL_EPILOG: pthread_mutex_lock(&(job->history->host->mutex)); job->history->host->running_jobs--; pthread_mutex_unlock(&(job->history->host->mutex)); job->exit_time = time(NULL); job->history->stats[EXIT_TIME] = time(NULL); job->tm_state = GW_TM_STATE_HARD_KILL; if (job->history != NULL) { gw_log_print("DM",'I',"Cancelling prolog/epilog transfers of job %i.\n", job_id); gw_tm_mad_end(job->history->tm_mad, job->id); } break; case GW_JOB_STATE_PRE_WRAPPER: case GW_JOB_STATE_WRAPPER: job->history->reason = GW_REASON_KILL; pthread_mutex_lock(&(job->history->host->mutex)); job->history->host->used_slots--; job->history->host->running_jobs--; pthread_mutex_unlock(&(job->history->host->mutex)); job->exit_time = time(NULL); job->history->stats[EXIT_TIME] = time(NULL); job->em_state = GW_EM_STATE_HARD_KILL; if (job->history != NULL) { gw_log_print("DM",'I',"Cancelling execution of job %i.\n", job_id); gw_em_mad_cancel(job->history->em_mad, job_id); } break; case GW_JOB_STATE_MIGR_CANCEL: pthread_mutex_lock(&(job->history->next->host->mutex)); job->history->next->host->used_slots--; job->history->next->host->running_jobs--; pthread_mutex_unlock(&(job->history->next->host->mutex)); job->history->next->stats[EXIT_TIME] = time(NULL); job->history->reason = GW_REASON_KILL; case GW_JOB_STATE_STOP_CANCEL: case GW_JOB_STATE_KILL_CANCEL: pthread_mutex_lock(&(job->history->host->mutex)); job->history->host->used_slots--; job->history->host->running_jobs--; pthread_mutex_unlock(&(job->history->host->mutex)); job->exit_time = time(NULL); job->history->stats[EXIT_TIME] = time(NULL); job->em_state = GW_EM_STATE_HARD_KILL; break; case GW_JOB_STATE_INIT: case GW_JOB_STATE_PENDING: case GW_JOB_STATE_HOLD: case GW_JOB_STATE_STOPPED: job->exit_time = time(NULL); break; case GW_JOB_STATE_FAILED: case GW_JOB_STATE_ZOMBIE: break; default: gw_log_print("DM",'W',"Job %i can not be killed in current state.\n", job_id); gw_am_trigger(gw_dm.rm_am,"GW_RM_KILL_FAILED", _job_id); pthread_mutex_unlock(&(job->mutex)); return; } array_id = job->array_id; task_id = job->task_id; sprintf(conf_filename, "%s/job.conf", job->directory); unlink(conf_filename); pthread_mutex_unlock(&(job->mutex)); gw_job_pool_free(job_id); gw_log_print("DM",'I',"Job %i killed (hard) and freed.\n", job_id); if (array_id != -1) { array = gw_array_pool_get_array(array_id, GW_TRUE); if ( array != NULL ) { rt = gw_array_del_task(array,task_id); pthread_mutex_unlock(&(array->mutex)); if (rt == 0) { gw_array_pool_array_free(array_id); gw_log_print("DM",'I',"Array %i freed.\n",array_id); } } else gw_log_print("DM",'E',"Array %i does not exisit (KILL - task %i).\n", array_id, task_id); } gw_am_trigger(gw_dm.rm_am,"GW_RM_KILL_SUCCESS", _job_id); }
void gw_dm_jalloc(void *_msg) { gw_msg_submit_t * msg_submit; gw_job_t * job; int jid; int uid; int rc; gw_boolean_t useradd; gw_job_state_t init_state; msg_submit = (gw_msg_submit_t *) _msg; /* ------------- Check if user is already registered ------------ */ useradd = gw_user_pool_exists (msg_submit->msg.owner, msg_submit->msg.proxy_path, &uid) == GW_FALSE; if (useradd) { rc = gw_user_pool_user_allocate (msg_submit->msg.owner, msg_submit->msg.proxy_path, &uid); #ifdef GWDMDEBUG gw_log_print("DM",'D',"User %s registered with UID %i.\n", msg_submit->msg.owner, uid); #endif if ( rc != 0 ) { gw_log_print("DM",'E',"Could not register user %s.\n", msg_submit->msg.owner); msg_submit->msg.rc = GW_RC_FAILED_USER; gw_am_trigger(gw_dm.rm_am,"GW_RM_SUBMIT",&(msg_submit->msg)); return; } } /* ------------- Allocate job structure ------------ */ jid = gw_job_pool_allocate(); if ( jid == -1 ) { gw_log_print("DM",'E',"Could not allocate job.\n"); msg_submit->msg.rc = GW_RC_FAILED_NO_MEMORY; gw_am_trigger(gw_dm.rm_am,"GW_RM_SUBMIT",&(msg_submit->msg)); return; } /* ------------------------------------------ */ /* Update job data */ /* ------------------------------------------ */ job = gw_job_pool_get(jid, GW_TRUE); if ( job == NULL ) { msg_submit->msg.rc = GW_RC_FAILED_BAD_JOB_ID; gw_am_trigger(gw_dm.rm_am,"GW_RM_SUBMIT",&(msg_submit->msg)); return; } /* ------ Fill data using the template ------- */ rc = gw_job_fill(job, msg_submit); if ( rc == -1 ) { gw_log_print("DM",'E',"Could not initialize job.\n"); pthread_mutex_unlock(&(job->mutex)); gw_job_pool_free(jid); msg_submit->msg.rc = GW_RC_FAILED; gw_am_trigger(gw_dm.rm_am,"GW_RM_SUBMIT",&(msg_submit->msg)); return; } /* --------- Set the initial state ---------- */ init_state = msg_submit->msg.init_state; if ( init_state == GW_JOB_STATE_PENDING ) gw_job_set_state(job, GW_JOB_STATE_PENDING, GW_FALSE); else gw_job_set_state(job, GW_JOB_STATE_HOLD, GW_FALSE); job->tm_state = GW_TM_STATE_INIT; job->em_state = GW_EM_STATE_INIT; job->user_id = uid; /* --------- Set the initial priority ---------- */ job->fixed_priority = msg_submit->msg.fixed_priority; pthread_mutex_unlock(&(job->mutex)); /* ------------- Set job dependencies ------------ */ if ( msg_submit->jt.job_deps[0] != -1 ) gw_job_pool_dep_set(jid, msg_submit->jt.job_deps); if (!useradd) gw_user_pool_inc_jobs(uid,1); /* ------------- Callback msg ------------ */ msg_submit->msg.rc = GW_RC_SUCCESS; msg_submit->msg.array_id = -1; msg_submit->msg.job_id = jid; gw_am_trigger(gw_dm.rm_am,"GW_RM_SUBMIT",&(msg_submit->msg)); /* ------------- Notify the scheduler ------------ */ if ( init_state == GW_JOB_STATE_PENDING ) gw_dm_mad_job_schedule(&gw_dm.dm_mad[0], jid, -1, uid, GW_REASON_NONE); gw_log_print("DM",'I',"New job %i allocated and initialized.\n", jid); }
void gw_dm_reschedule (void *_job_id) { gw_job_t * job; int job_id; /* ----------------------------------------------------------- */ /* 0.- Get job pointer */ /* ----------------------------------------------------------- */ if ( _job_id != NULL ) { job_id = *( (int *) _job_id ); job = gw_job_pool_get(job_id, GW_TRUE); if ( job == NULL ) { gw_log_print("DM",'E',"Job %i does not exist (RE-SCHEDULE).\n",job_id); gw_am_trigger(gw_dm.rm_am,"GW_RM_RESCHEDULE_FAILED", _job_id); return; } } else return; /* ----------------------------------------------------------- */ /* 1.- re-schedule job */ /* ----------------------------------------------------------- */ switch (job->job_state) { case GW_JOB_STATE_WRAPPER: job->reschedule = GW_TRUE; job->history->reason = GW_REASON_USER_REQUESTED; gw_log_print("DM",'I',"Job %i will be re-scheduled.\n", job_id); gw_am_trigger(gw_dm.rm_am,"GW_RM_RESCHEDULE_SUCCESS",_job_id); gw_dm_mad_job_schedule(&gw_dm.dm_mad[0], job_id, job->array_id, job->user_id, GW_REASON_USER_REQUESTED); break; case GW_JOB_STATE_FAILED: gw_job_set_state(job, GW_JOB_STATE_PENDING, GW_FALSE); job->tm_state = GW_TM_STATE_INIT; job->em_state = GW_EM_STATE_INIT; job->history->reason = GW_REASON_USER_REQUESTED; job->restarted++; gw_log_print("DM",'I',"Job %i will be re-scheduled.\n", job_id); gw_am_trigger(gw_dm.rm_am,"GW_RM_RESCHEDULE_SUCCESS", _job_id); gw_dm_mad_job_schedule(&gw_dm.dm_mad[0], job_id, job->array_id, job->user_id, GW_REASON_NONE); break; default: gw_log_print("DM",'I',"Job %i can not be re-scheduled in current state.\n", job_id); gw_am_trigger(gw_dm.rm_am,"GW_RM_RESCHEDULE_FAILED", _job_id); break; } pthread_mutex_unlock(&(job->mutex)); }
void gw_dm_wrapper_done_cb ( void *_job_id ) { gw_job_t * job; int job_id; time_t total; time_t active; time_t suspension; /* ----------------------------------------------------------- */ /* 0.- Get job pointer */ /* ----------------------------------------------------------- */ if ( _job_id != NULL ) { job_id = *( (int *) _job_id ); job = gw_job_pool_get(job_id, GW_TRUE); if ( job == NULL ) { gw_log_print("DM",'E',"Job %i does not exist (JOB_STATE_WRAPPER_DONE_CB).\n",job_id); free(_job_id); return; } } else return; /* ----------------------------------------------------------- */ /* 1.- Set execution times & state transition */ /* ----------------------------------------------------------- */ job->em_state = GW_EM_STATE_INIT; switch (job->job_state) { case GW_JOB_STATE_PRE_WRAPPER: /* --------------- Update pre-wrapper stats -------------------- */ job->history->stats[PRE_WRAPPER_EXIT_TIME] = time(NULL); total = gw_job_history_get_pre_wrapper_time(job->history); active = job->history->stats[ACTIVE_TIME]; suspension = job->history->stats[SUSPENSION_TIME]; gw_job_print(job,"DM",'I',"Pre-Wrapper DONE:\n"); gw_job_print(job,"DM",'I',"\tActive time : %i\n", active); gw_job_print(job,"DM",'I',"\tSuspension time : %i\n", suspension); gw_job_print(job,"DM",'I',"\tTotal time : %i\n", total); /* -------------- Transition to Wrapper state ------------------ */ gw_am_trigger(&(gw_dm.am), "GW_DM_STATE_WRAPPER", _job_id); break; case GW_JOB_STATE_WRAPPER: /* ----------------- Update wrapper stats ---------------------- */ job->history->stats[WRAPPER_EXIT_TIME] = time(NULL); total = gw_job_history_get_wrapper_time(job->history); active = job->history->stats[ACTIVE_TIME]; suspension = job->history->stats[SUSPENSION_TIME]; gw_job_print(job,"DM",'I',"Wrapper DONE:\n"); gw_job_print(job,"DM",'I',"\tActive time : %i\n", active); gw_job_print(job,"DM",'I',"\tSuspension time : %i\n", suspension); gw_job_print(job,"DM",'I',"\tTotal time : %i\n", total); /* -------------- Free used slot from this host -------------- */ gw_host_dec_uslots(job->history->host, job->template.np);
void gw_dm_stop (void *_job_id) { gw_job_t * job; int job_id; /* ----------------------------------------------------------- */ /* 0.- Get job pointer */ /* ----------------------------------------------------------- */ if ( _job_id != NULL ) { job_id = *( (int *) _job_id ); job = gw_job_pool_get(job_id, GW_TRUE); if ( job == NULL ) { gw_log_print("DM",'E',"Job %i does not exist (STOP).\n",job_id); gw_am_trigger(gw_dm.rm_am,"GW_RM_STOP_FAILED", _job_id); return; } } else return; /* ----------------------------------------------------------- */ /* 1.- Stop the job */ /* ----------------------------------------------------------- */ switch (job->job_state) { case GW_JOB_STATE_STOPPED: gw_log_print("DM",'W',"Job %i already stopped.\n", job_id); gw_am_trigger(gw_dm.rm_am,"GW_RM_STOP_FAILED", _job_id); break; case GW_JOB_STATE_WRAPPER: if (job->history != NULL ) job->history->reason = GW_REASON_STOP_RESUME; gw_log_print("DM",'I',"Stopping job %i.\n", job_id); gw_job_set_state(job, GW_JOB_STATE_STOP_CANCEL, GW_FALSE); if ( job->reschedule == GW_TRUE ) { job->reschedule = GW_FALSE; gw_dm_mad_job_del(&gw_dm.dm_mad[0],job->id); } gw_am_trigger(gw_dm.em_am, "GW_EM_CANCEL", _job_id); break; default: gw_log_print("DM",'W',"Job %i can not be stopped in current state.\n", job_id); gw_am_trigger(gw_dm.rm_am,"GW_RM_STOP_FAILED", _job_id); break; } pthread_mutex_unlock(&(job->mutex)); }
void gw_em_timer() { int i; gw_job_t *job; time_t now; static int mark = 0; int *_job_id; gw_em_mad_t *mad; mark = mark + GW_EM_TIMER_PERIOD; if ( mark >= 300 ) { gw_log_print("EM",'I',"-- MARK --\n"); mark = 0; } now = time(NULL); for (i= 0; i< gw_conf.number_of_jobs ; i++) { job = gw_job_pool_get(i, GW_TRUE); if ( job != NULL ) { if ( job->history == NULL ) { pthread_mutex_unlock(&(job->mutex)); continue; } if ( (job->job_state == GW_JOB_STATE_PRE_WRAPPER) || (job->job_state == GW_JOB_STATE_WRAPPER) || (job->job_state == GW_JOB_STATE_MIGR_CANCEL) || (job->job_state == GW_JOB_STATE_STOP_CANCEL) || (job->job_state == GW_JOB_STATE_KILL_CANCEL)) { if (issubmitted(job->em_state)) { if ( now >= job->next_poll_time ) { gw_log_print("EM",'I',"Checking execution state of job %i.\n", i); mad = job->history->em_mad; /* Warning! When in Migration Cancel, the previous MAD should be used */ if (job->job_state == GW_JOB_STATE_MIGR_CANCEL) { if (job->history->next == NULL) { gw_log_print("EM",'E',"Previous history record of job %i no longer exists\n", i); pthread_mutex_unlock(&(job->mutex)); continue; } else mad = job->history->next->em_mad; } gw_em_mad_poll(mad, i); job->next_poll_time += gw_conf.poll_interval; /* Wait for next poll */ } } else if ((job->em_state == GW_EM_STATE_FAILED) && (job->history->counter != -1)) { job->history->counter--; if (job->history->counter == 0) { job->history->counter = -1; _job_id = (int *) malloc (sizeof(int)); *(_job_id) = i; gw_am_trigger(&(gw_em.am),"GW_EM_SUBMIT", _job_id); } } } pthread_mutex_unlock(&(job->mutex)); } } }
void gw_dm_aalloc (void *_msg) { gw_msg_submit_t *msg_submit; gw_array_t *array; gw_job_t *job; gw_boolean_t useradd; gw_job_state_t init_state; int fixed; int tasks; int rc; int array_id, i, jid, uid; msg_submit = (gw_msg_submit_t *) _msg; useradd = gw_user_pool_exists (msg_submit->msg.owner, msg_submit->msg.proxy_path, &uid) == GW_FALSE; if (useradd) { rc = gw_user_pool_user_allocate (msg_submit->msg.owner, msg_submit->msg.proxy_path, &uid); if ( rc != 0 ) { gw_log_print("DM",'E',"Could not register user %s.\n", msg_submit->msg.owner); msg_submit->msg.rc = GW_RC_FAILED_USER; gw_am_trigger(gw_dm.rm_am,"GW_RM_SUBMIT",&(msg_submit->msg)); return; } } rc = gw_array_pool_array_allocate(&(msg_submit->msg), msg_submit->msg.number_of_tasks, &array_id); if ( rc != 0 ) { gw_log_print("DM",'E',"Could not allocate array.\n"); msg_submit->msg.rc = GW_RC_FAILED_NO_MEMORY; gw_am_trigger(gw_dm.rm_am,"GW_RM_SUBMIT",&(msg_submit->msg)); return; } array = gw_array_pool_get_array(array_id, GW_TRUE); if ( array == NULL ) { msg_submit->msg.rc = GW_RC_FAILED_BAD_ARRAY_ID; gw_am_trigger(gw_dm.rm_am,"GW_RM_SUBMIT",&(msg_submit->msg)); return; } init_state = msg_submit->msg.init_state; tasks = msg_submit->msg.number_of_tasks; for (i=0; i<msg_submit->msg.number_of_tasks; i++) { jid = array->job_ids[i]; job = gw_job_pool_get(jid, GW_TRUE); fixed = msg_submit->msg.fixed_priority; gw_job_fill(job, msg_submit); if ( job == NULL ) { pthread_mutex_unlock(&(array->mutex)); msg_submit->msg.rc = GW_RC_FAILED; gw_am_trigger(gw_dm.rm_am,"GW_RM_SUBMIT",&(msg_submit->msg)); return; } /* --------- Set the initial state ---------- */ if ( init_state == GW_JOB_STATE_PENDING ) gw_job_set_state(job, GW_JOB_STATE_PENDING, GW_FALSE); else gw_job_set_state(job, GW_JOB_STATE_HOLD, GW_FALSE); job->tm_state = GW_TM_STATE_INIT; job->em_state = GW_EM_STATE_INIT; job->user_id = uid; /* --------- Set the parameter values ---------- */ job->pstart = msg_submit->msg.pstart; job->pinc = msg_submit->msg.pinc; /* --------- Set the initial priority ---------- */ job->fixed_priority = msg_submit->msg.fixed_priority; pthread_mutex_unlock (&(job->mutex)); /* ------------- Notify the scheduler ------------ */ if ( init_state == GW_JOB_STATE_PENDING ) gw_dm_mad_job_schedule(&gw_dm.dm_mad[0], jid, array_id, uid, GW_REASON_NONE); /* ------------- Set job dependencies ------------ */ if ( msg_submit->jt.job_deps[0] != -1 ) gw_job_pool_dep_set(jid, msg_submit->jt.job_deps); } gw_user_pool_inc_jobs(uid,msg_submit->msg.number_of_tasks - useradd); pthread_mutex_unlock(&(array->mutex)); msg_submit->msg.rc = GW_RC_SUCCESS; msg_submit->msg.array_id = array_id; gw_am_trigger(gw_dm.rm_am,"GW_RM_SUBMIT",&(msg_submit->msg)); gw_log_print("DM",'I',"New array %i allocated and initialized.\n",array_id); }
void gw_em_listener(void *arg) { fd_set in_pipes; int i,j; int *job_id; int greater, rc, rcm; char c; char info[GW_EM_MAX_INFO]; char s_job_id[GW_EM_MAX_JOB_ID]; char result[GW_EM_MAX_RESULT]; char action[GW_EM_MAX_ACTION]; char str[GW_EM_MAX_STRING]; int fd; gw_job_t *job; time_t now; char contact_file[PATH_MAX]; FILE *file; gw_em_mad_t *em_mad; char *ptmp; int *fds; int num_fds; gw_em_mad_t **em_mads; pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL); pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL); fds = (int *) malloc(sizeof(int)*gw_conf.number_of_users*GW_MAX_MADS); em_mads = (gw_em_mad_t **) malloc(sizeof(gw_em_mad_t *) * gw_conf.number_of_users * GW_MAX_MADS); while (1) { greater = gw_user_pool_set_em_pipes(&in_pipes, fds, &num_fds, em_mads, gw_em.um_em_pipe_r); rc = select( greater+1, &in_pipes, NULL, NULL, NULL); if ( rc <= 0 ) continue; for (i=0; i<num_fds; i++) { fd = fds[i]; if ( FD_ISSET(fd, &in_pipes) ) { if ( fd == gw_em.um_em_pipe_r ) { rc = read(fd, (void *) &c, sizeof(char)); #ifdef GWEMDEBUG gw_log_print("EM",'D',"Updating MAD pipes (action is %c)\n",c); #endif continue; } #ifdef GWEMDEBUG gw_log_print("EM",'D',"Reading from MAD pipe %i.\n",i); #endif j = 0; do { rc = read(fd, (void *) &c, sizeof(char)); str[j++] = c; } while ((rc > 0) && (c != '\n') && (j < (GW_EM_MAX_STRING-1))); str[j] = '\0'; if (rc <= 0) { gw_log_print("EM",'W',"Error reading MAD (%s) message\n", em_mads[i]->name); rcm = gw_em_mad_reload(em_mads[i]); if ( rcm == 0 ) { gw_log_print("EM",'I',"MAD (%s) successfully reloaded\n", em_mads[i]->name); gw_job_pool_em_recover(em_mads[i], &(gw_em.am)); } else { gw_log_print("EM",'E',"Error reloading MAD (%s)\n", em_mads[i]->name); em_mads[i]->mad_em_pipe = -1; } continue; } sscanf(str,"%" GW2STR(GW_EM_MAX_ACTION) "s %" GW2STR(GW_EM_MAX_JOB_ID) "s %" GW2STR(GW_EM_MAX_RESULT) "s %" GW2STR(GW_EM_MAX_INFO) "[^\n]", action, s_job_id, result, info); #ifdef GWEMDEBUG gw_log_print("EM",'D',"MAD message received:\"%s %s %s %s\".\n", action, s_job_id, result, info); #endif if (s_job_id[0] == '-') continue; job_id = (int *) malloc (sizeof(int)); *job_id = atoi(s_job_id); job = gw_job_pool_get(*job_id, GW_TRUE); if (job == NULL) { gw_log_print("EM",'W',"MAD message for job %s, but it does not exist: \"%s %s %s %s\".\n", s_job_id,action, s_job_id, result, info); free(job_id); continue; } if (job->job_state != GW_JOB_STATE_PRE_WRAPPER && job->job_state != GW_JOB_STATE_WRAPPER && job->job_state != GW_JOB_STATE_MIGR_CANCEL && job->job_state != GW_JOB_STATE_STOP_CANCEL && job->job_state != GW_JOB_STATE_KILL_CANCEL) { gw_log_print("EM",'W',"MAD message for job %i but not in an execution state.\n", *job_id); free(job_id); pthread_mutex_unlock(&(job->mutex)); continue; } else if ( job->em_state == GW_EM_STATE_HARD_KILL ) { gw_log_print("EM",'W',"MAD message for job %i but it is being killed (hard).\n", *job_id); free(job_id); pthread_mutex_unlock(&(job->mutex)); continue; } if (strcmp(action, "SUBMIT") == 0) { if (strcmp(result, "FAILURE") == 0) { gw_job_print(job, "EM",'E',"Job submission failed: %s\n", info); gw_log_print("EM",'E',"Submission of job %d failed: %s.\n", job->id, info); gw_am_trigger(&(gw_em.am), "GW_EM_STATE_FAILED", (void *) job_id); } else /* Save persistent job contact */ { snprintf(contact_file, PATH_MAX-1, GW_VAR_DIR "/%i/job.contact", job->id); file = fopen(contact_file, "w"); if ( file != NULL ) { fprintf(file, "%s\n", info); fclose(file); } gw_am_trigger(&(gw_em.am), "GW_EM_STATE_PENDING", (void *) job_id); } } else if (strcmp(action, "CANCEL") == 0) { if (strcmp(result, "SUCCESS") == 0) { gw_job_print(job, "EM",'I',"Job cancel succeeded.\n"); gw_log_print("EM",'I',"Cancel of job %i succeeded.\n", *job_id); } else { gw_job_print(job, "EM",'E',"Job cancel failed (%s).\n",info); gw_log_print("EM",'E',"Cancel of job %d failed: %s.\n",job->id, info); } } else if (strcmp(action, "POLL") == 0) { if (strcmp(result, "SUCCESS") == 0) { now = time(NULL); job->next_poll_time = now + gw_conf.poll_interval/2 + gw_rand(gw_conf.poll_interval); gw_job_print(job, "EM",'E',"Job poll OK (%s), will poll again in %d seconds.\n", info, job->next_poll_time - now); if (strcmp(info, "PENDING") == 0) gw_am_trigger(&(gw_em.am), "GW_EM_STATE_PENDING", (void *) job_id); else if (strcmp(info, "SUSPENDED") == 0) gw_am_trigger(&(gw_em.am), "GW_EM_STATE_SUSPENDED", (void *) job_id); else if (strcmp(info, "ACTIVE") == 0) gw_am_trigger(&(gw_em.am),"GW_EM_STATE_ACTIVE", (void *) job_id); else if (strstr(info, "DONE") != NULL) { ptmp = strstr(info,"DONE:"); if ((ptmp != NULL) && (strlen(ptmp+5) > 0))/*No-wrapper mode*/ job->exit_code=atoi(ptmp+5); gw_am_trigger(&(gw_em.am), "GW_EM_STATE_DONE", (void *) job_id); } else if (strcmp(info, "FAILED") == 0) gw_am_trigger(&(gw_em.am), "GW_EM_STATE_FAILED", (void *) job_id); } else { job->history->failed_polls++; em_mad = job->history->em_mad; if ( job->history->failed_polls == 3 ) { gw_job_print(job, "EM",'E',"Job poll failed (%s), assuming the job is done.\n",info); gw_am_trigger(&(gw_em.am), "GW_EM_STATE_DONE", (void *) job_id); } else { now = time(NULL); job->next_poll_time = now + gw_conf.poll_interval*job->history->failed_polls + gw_rand(gw_conf.poll_interval*job->history->failed_polls); gw_job_print(job, "EM",'E',"Job poll failed (%s), will poll again in %d seconds.\n", info, job->next_poll_time - now); free(job_id); } } } else if (strcmp(action, "RECOVER") == 0) { if (strcmp(result, "SUCCESS") == 0) { if (strcmp(info, "PENDING") == 0) gw_am_trigger(&(gw_em.am), "GW_EM_STATE_PENDING", (void *) job_id); else if (strcmp(info, "SUSPENDED") == 0) gw_am_trigger(&(gw_em.am), "GW_EM_STATE_SUSPENDED", (void *) job_id); else if (strcmp(info, "ACTIVE") == 0) gw_am_trigger(&(gw_em.am),"GW_EM_STATE_ACTIVE", (void *) job_id); else if (strstr(info, "DONE") != NULL) { ptmp = strstr(info,"DONE:"); if ((ptmp != NULL) && (strlen(ptmp+5) > 0))/*No-wrapper mode*/ job->exit_code = atoi(ptmp+5); gw_am_trigger(&(gw_em.am), "GW_EM_STATE_DONE", (void *) job_id); } else if (strcmp(info, "FAILED") == 0) { /* Do not retry */ job->history->tries= job->template.number_of_retries; gw_am_trigger(&(gw_em.am), "GW_EM_STATE_FAILED", (void *) job_id); } } else { gw_job_print(job,"EM",'E',"Job recover failed (%s), assuming the job is done.\n", info); gw_log_print("EM",'E',"Recover of job %i failed.\n", *job_id); gw_am_trigger(&(gw_em.am), "GW_EM_STATE_DONE", (void *) job_id); } }
void gw_dm_wrapper_done_cb ( void *_job_id ) { gw_job_t * job; int job_id; time_t total; time_t active; time_t suspension; /* ----------------------------------------------------------- */ /* 0.- Get job pointer */ /* ----------------------------------------------------------- */ if ( _job_id != NULL ) { job_id = *( (int *) _job_id ); job = gw_job_pool_get(job_id, GW_TRUE); if ( job == NULL ) { gw_log_print("DM",'E',"Job %i does not exist (JOB_STATE_WRAPPER_DONE_CB).\n",job_id); free(_job_id); return; } } else return; /* ----------------------------------------------------------- */ /* 1.- Set execution times & state transition */ /* ----------------------------------------------------------- */ job->em_state = GW_EM_STATE_INIT; switch (job->job_state) { case GW_JOB_STATE_PRE_WRAPPER: /* --------------- Update pre-wrapper stats -------------------- */ job->history->stats[PRE_WRAPPER_EXIT_TIME] = time(NULL); total = gw_job_history_get_pre_wrapper_time(job->history); active = job->history->stats[ACTIVE_TIME]; suspension = job->history->stats[SUSPENSION_TIME]; gw_job_print(job,"DM",'I',"Pre-Wrapper DONE:\n"); gw_job_print(job,"DM",'I',"\tActive time : %i\n", active); gw_job_print(job,"DM",'I',"\tSuspension time : %i\n", suspension); gw_job_print(job,"DM",'I',"\tTotal time : %i\n", total); /* -------------- Transition to Wrapper state ------------------ */ gw_am_trigger(&(gw_dm.am), "GW_DM_STATE_WRAPPER", _job_id); break; case GW_JOB_STATE_WRAPPER: /* ----------------- Update wrapper stats ---------------------- */ job->history->stats[WRAPPER_EXIT_TIME] = time(NULL); total = gw_job_history_get_wrapper_time(job->history); active = job->history->stats[ACTIVE_TIME]; suspension = job->history->stats[SUSPENSION_TIME]; gw_job_print(job,"DM",'I',"Wrapper DONE:\n"); gw_job_print(job,"DM",'I',"\tActive time : %i\n", active); gw_job_print(job,"DM",'I',"\tSuspension time : %i\n", suspension); gw_job_print(job,"DM",'I',"\tTotal time : %i\n", total); /* -------------- Free used slot from this host -------------- */ gw_host_dec_uslots(job->history->host); /* ---------- We do not need to re-schedule this job --------- */ if ( job->reschedule == GW_TRUE ) { job->reschedule = GW_FALSE; gw_dm_mad_job_del(&gw_dm.dm_mad[0],job->id); } /* -------------- Transition to Epilog state ------------------ */ gw_am_trigger(&(gw_dm.am), "GW_DM_STATE_EPILOG_STD", _job_id); break; case GW_JOB_STATE_STOP_CANCEL: /* ----------------- Update wrapper stats ---------------------- */ job->history->stats[WRAPPER_EXIT_TIME] = time(NULL); total = gw_job_history_get_wrapper_time(job->history); active = job->history->stats[ACTIVE_TIME]; suspension = job->history->stats[SUSPENSION_TIME]; gw_job_print(job,"DM",'I',"Wrapper CANCELED:\n"); gw_job_print(job,"DM",'I',"\tActive time : %i\n", active); gw_job_print(job,"DM",'I',"\tSuspension time : %i\n", suspension); gw_job_print(job,"DM",'I',"\tTotal time : %i\n", total); /* -------------- Free used slot from this host -------------- */ gw_host_dec_uslots(job->history->host); /* ------------ Transition to Stop Epilog state --------------- */ gw_am_trigger(&(gw_dm.am), "GW_DM_STATE_STOP_EPILOG", _job_id); break; case GW_JOB_STATE_KILL_CANCEL: /* ----------------- Update wrapper stats ---------------------- */ job->history->stats[WRAPPER_EXIT_TIME] = time(NULL); total = gw_job_history_get_wrapper_time(job->history); active = job->history->stats[ACTIVE_TIME]; suspension = job->history->stats[SUSPENSION_TIME]; gw_job_print(job,"DM",'I',"Wrapper CANCELED:\n"); gw_job_print(job,"DM",'I',"\tActive time : %i\n", active); gw_job_print(job,"DM",'I',"\tSuspension time : %i\n", suspension); gw_job_print(job,"DM",'I',"\tTotal time : %i\n", total); /* -------------- Free used slot from this host -------------- */ gw_host_dec_uslots(job->history->host); /* ------------ Transition to Kill Epilog state ---------------- */ gw_am_trigger(&(gw_dm.am), "GW_DM_STATE_KILL_EPILOG", _job_id); break; case GW_JOB_STATE_MIGR_CANCEL: /* ----------- Update previous wrapper stats ------------------- */ job->history->next->stats[WRAPPER_EXIT_TIME] = time(NULL); active = job->history->next->stats[ACTIVE_TIME]; suspension = job->history->next->stats[SUSPENSION_TIME]; gw_job_print(job,"DM",'I',"Wrapper CANCELED:\n"); gw_job_print(job,"DM",'I',"\tActive time : %i\n", active); gw_job_print(job,"DM",'I',"\tSuspension time : %i\n", suspension); /* -------------- Free used slot from previous host ------------ */ gw_host_dec_uslots(job->history->next->host); /* ---------- Transition to Migration Prolog state ------------ */ gw_am_trigger(&(gw_dm.am), "GW_DM_STATE_MIGR_PROLOG", _job_id); break; default: gw_log_print("DM",'E',"Wrapper done callback in wrong job (%i) state.\n", job_id); free(_job_id); break; } pthread_mutex_unlock(&(job->mutex)); }
void gw_em_submit(void *_job_id) { int job_id; gw_job_t *job; char *rsl=NULL; char *contact; gw_job_state_t state; char rsl_filename[2048]; FILE *fd; time_t now; /* ----------------------------------------------------------- */ /* 0.- Get job pointer, check if it exits and lock mutex */ /* ----------------------------------------------------------- */ if ( _job_id != NULL ) { job_id = *( (int *) _job_id ); job = gw_job_pool_get(job_id, GW_TRUE); if ( job == NULL ) { gw_log_print("EM",'E',"Job %s no longer exists (PENDING).\n", job_id); return; } } else return; if (job->history == NULL) { gw_log_print("EM",'E',"History of job %s doesn't exists\n", job_id); free(_job_id); pthread_mutex_unlock(&(job->mutex)); return; } state = job->job_state; /* ----------------------------------------------------------- */ /* 1.- Get execution MAD for this host */ /* ----------------------------------------------------------- */ job->em_state = GW_EM_STATE_INIT; job->history->counter = -1; if ( job->job_state == GW_JOB_STATE_PRE_WRAPPER ) { contact = job->history->em_fork_rc; rsl = (char *) job->history->em_mad->pre_wrapper_rsl((void *) job); } else { contact = job->history->em_rc; rsl = (char *) job->history->em_mad->wrapper_rsl((void *) job); } if ( rsl == NULL ) { job->em_state = GW_EM_STATE_FAILED; gw_log_print("EM",'E',"Job %i failed, could not generate RSL.\n", job_id); gw_am_trigger(gw_em.dm_am, "GW_DM_WRAPPER_FAILED", _job_id); pthread_mutex_unlock(&(job->mutex)); return; } sprintf(rsl_filename, "%s/job.rsl.%i", job->directory,job->restarted); fd = fopen(rsl_filename,"w"); if (fd != NULL ) { gw_job_print(job,"EM",'I',"Submitting wrapper to %s, RSL used is in %s.\n",contact,rsl_filename); fprintf(fd,"%s",rsl); fclose(fd); } else { job->em_state = GW_EM_STATE_FAILED; gw_log_print("EM",'E',"Job %i failed, could not open RSL file.\n", job_id); gw_job_print(job,"EM",'E',"Job failed, could not open RSL file %s.\n",rsl_filename); gw_am_trigger(gw_em.dm_am, "GW_DM_WRAPPER_FAILED", _job_id); pthread_mutex_unlock(&(job->mutex)); return; } /* -------------------------------------------------------------------- */ now = time(NULL); job->next_poll_time = now + gw_conf.poll_interval/2 + gw_rand(gw_conf.poll_interval); /* randomize polls */ gw_job_print(job,"EM",'I',"Job will be polled in %d seconds.\n", job->next_poll_time-now); job->last_checkpoint_time = 0; job->history->stats[LAST_SUSPENSION_TIME] = now; job->history->stats[SUSPENSION_TIME] = 0; job->history->stats[ACTIVE_TIME] = 0; job->history->tries++; /* -------------------------------------------------------------------- */ pthread_mutex_unlock(&(job->mutex)); gw_em_mad_submit(job->history->em_mad, job_id, contact, rsl_filename); /* -------------------------------------------------------------------- */ free(_job_id); free(rsl); }
void gw_dm_jalloc (void *_msg) { gw_msg_t * msg; gw_job_t * job; int jid; int uid; int rc; gw_boolean_t useradd; gw_job_state_t init_state; msg = (gw_msg_t *) _msg; /* ------------- Check if user is already registered ------------ */ useradd = gw_user_pool_exists (msg->owner, &uid) == GW_FALSE; if (useradd) { rc = gw_user_pool_user_allocate (msg->owner, &uid); if ( rc != 0 ) { gw_log_print("DM",'E',"Could not register user %s.\n", msg->owner); msg->rc = GW_RC_FAILED_USER; gw_am_trigger(gw_dm.rm_am,"GW_RM_SUBMIT",_msg); return; } } /* ------------- Allocate job structure ------------ */ jid = gw_job_pool_allocate(); if ( jid == -1 ) { gw_log_print("DM",'E',"Could not allocate job.\n"); msg->rc = GW_RC_FAILED; gw_am_trigger(gw_dm.rm_am,"GW_RM_SUBMIT",_msg); return; } /* ------------------------------------------ */ /* Update job data */ /* ------------------------------------------ */ job = gw_job_pool_get(jid, GW_TRUE); if ( job == NULL ) { msg->rc = GW_RC_FAILED; gw_am_trigger(gw_dm.rm_am,"GW_RM_SUBMIT",_msg); return; } /* ------ Fill data using the template ------- */ rc = gw_job_fill(job, msg); if ( rc == -1 ) { gw_log_print("DM",'E',"Could not initialize job.\n"); pthread_mutex_unlock(&(job->mutex)); gw_job_pool_free(jid); msg->rc = GW_RC_FAILED; gw_am_trigger(gw_dm.rm_am,"GW_RM_SUBMIT",_msg); return; } /* --------- Set the initial state ---------- */ init_state = msg->init_state; if ( init_state == GW_JOB_STATE_PENDING ) gw_job_set_state(job, GW_JOB_STATE_PENDING, GW_FALSE); else gw_job_set_state(job, GW_JOB_STATE_HOLD, GW_FALSE); job->tm_state = GW_TM_STATE_INIT; job->em_state = GW_EM_STATE_INIT; job->user_id = uid; pthread_mutex_unlock(&(job->mutex)); if (!useradd) gw_user_pool_inc_jobs(uid,1); if ( msg->jt.job_deps[0] != -1 ) gw_job_pool_dep_set(jid, msg->jt.job_deps); /* ------------- Callback msg ------------ */ msg->rc = GW_RC_SUCCESS; msg->array_id = -1; msg->job_id = jid; gw_am_trigger(gw_dm.rm_am,"GW_RM_SUBMIT",_msg); /* ------------- Notify the scheduler ------------ */ if ( init_state == GW_JOB_STATE_PENDING ) gw_dm_mad_job_schedule(&gw_dm.dm_mad[0], jid, -1, GW_REASON_NONE, job->nice, uid); gw_log_print("DM",'I',"New job %i allocated and initialized.\n", jid); }
void gw_dm_aalloc (void *_msg) { gw_msg_t *msg; gw_array_t *array; gw_job_t *job; gw_boolean_t useradd; gw_job_state_t init_state; int tasks; int rc; int array_id, i, jid, uid; msg = (gw_msg_t *) _msg; useradd = gw_user_pool_exists (msg->owner, &uid) == GW_FALSE; if (useradd) { rc = gw_user_pool_user_allocate (msg->owner, &uid); if ( rc != 0 ) { gw_log_print("DM",'E',"Could not register user %s.\n", msg->owner); msg->rc = GW_RC_FAILED_USER; gw_am_trigger(gw_dm.rm_am,"GW_RM_SUBMIT",_msg); return; } } rc = gw_array_pool_array_allocate(msg, msg->number_of_tasks, &array_id); if ( rc != 0 ) { gw_log_print("DM",'E',"Could not allocate array.\n"); msg->rc = GW_RC_FAILED; gw_am_trigger(gw_dm.rm_am,"GW_RM_SUBMIT",_msg); return; } array = gw_array_pool_get_array(array_id, GW_TRUE); if ( array == NULL ) { msg->rc = GW_RC_FAILED; gw_am_trigger(gw_dm.rm_am,"GW_RM_SUBMIT",_msg); return; } init_state = msg->init_state; tasks = msg->number_of_tasks; for (i=0; i<msg->number_of_tasks; i++) { jid = array->job_ids[i]; job = gw_job_pool_get(jid, GW_TRUE); gw_job_fill(job, msg); if ( job == NULL ) { pthread_mutex_unlock(&(array->mutex)); msg->rc = GW_RC_FAILED; gw_am_trigger(gw_dm.rm_am,"GW_RM_SUBMIT",_msg); return; } if ( init_state == GW_JOB_STATE_PENDING ) gw_job_set_state(job, GW_JOB_STATE_PENDING, GW_FALSE); else gw_job_set_state(job, GW_JOB_STATE_HOLD, GW_FALSE); job->tm_state = GW_TM_STATE_INIT; job->em_state = GW_EM_STATE_INIT; job->user_id = uid; job->pstart = msg->pstart; job->pinc = msg->pinc; pthread_mutex_unlock (&(job->mutex)); if ( msg->jt.job_deps[0] != -1 ) gw_job_pool_dep_set(jid, msg->jt.job_deps); } gw_user_pool_inc_jobs(uid,msg->number_of_tasks - useradd); pthread_mutex_unlock(&(array->mutex)); msg->rc = GW_RC_SUCCESS; msg->array_id = array_id; gw_am_trigger(gw_dm.rm_am,"GW_RM_SUBMIT",_msg); gw_log_print("DM",'I',"New array %i allocated and initialized.\n",array_id); if ( init_state == GW_JOB_STATE_PENDING ) gw_dm_mad_array_schedule(&gw_dm.dm_mad[0], -1, array_id, GW_REASON_NONE, 0, uid, tasks); }
void gw_dm_zombie ( void *_job_id ) { gw_job_t * job; gw_array_t * array; int job_id; int task_id; int array_id; int rt; char conf_filename[2048]; time_t prolog, epilog; /* ----------------------------------------------------------- */ /* 0.- Get job pointer */ /* ----------------------------------------------------------- */ if ( _job_id == NULL ) return; job_id = *( (int *) _job_id ); job = gw_job_pool_get(job_id, GW_TRUE); if ( job == NULL ) { gw_log_print("DM",'E',"Job %i does not exist (JOB_STATE_ZOMBIE).\n",job_id); free(_job_id); return; } /* ----------------------------------------------------------- */ /* 0.- Update Job state */ /* ----------------------------------------------------------- */ switch (job->job_state) { case GW_JOB_STATE_EPILOG: gw_job_set_state(job, GW_JOB_STATE_ZOMBIE, GW_FALSE); gw_log_print("DM",'I',"Job %i done, with exit code %i.\n",job->id, job->exit_code); job->history->reason = GW_REASON_NONE; job->exit_time = time(NULL); /* ------------- Print job history and send usage ------------ */ gw_job_print(job,"DM",'I',"Job done, history:\n"); gw_job_print_history(job); gw_job_send_usage(job); if ( job->client_waiting > 0 ) gw_am_trigger(gw_dm.rm_am,"GW_RM_WAIT_SUCCESS", _job_id); else { if (gw_conf.dispose == GW_TRUE) gw_am_trigger(&(gw_dm.am), "GW_DM_KILL", _job_id); else free(_job_id); } /* -------- Update User & Host running jobs -------- */ gw_user_pool_dec_running_jobs(job->user_id); gw_host_dec_rjobs(job->history->host); /* -------- Notify the scheduler -------- */ prolog = gw_job_history_get_prolog_time(job->history); epilog = gw_job_history_get_epilog_time(job->history); gw_dm_mad_job_success(&gw_dm.dm_mad[0], job->history->host->host_id, job->user_id, (prolog + epilog), job->history->stats[SUSPENSION_TIME], job->history->stats[ACTIVE_TIME]); pthread_mutex_unlock(&(job->mutex)); /* -------- Update other jobs dependencies -------- */ gw_job_pool_dep_check(job_id); break; case GW_JOB_STATE_KILL_EPILOG: gw_job_set_state(job, GW_JOB_STATE_ZOMBIE, GW_FALSE); job->exit_time = time(NULL); /* ------------- Print job history and send usage ------------ */ gw_job_print(job,"DM",'I',"Job killed, history:\n"); gw_job_print_history(job); gw_job_send_usage(job); /* ---------------- Free job & Notify RM ---------------- */ array_id = job->array_id; task_id = job->task_id; /* -------- Update User & Host running jobs -------- */ gw_user_pool_dec_running_jobs(job->user_id); gw_host_dec_rjobs(job->history->host); sprintf(conf_filename, "%s/job.conf", job->directory); unlink(conf_filename); pthread_mutex_unlock(&(job->mutex)); /* ------------------------------------------------- */ gw_job_pool_free(job_id); gw_log_print("DM",'I',"Job %i killed and freed.\n", job_id); if (array_id != -1) { array = gw_array_pool_get_array(array_id,GW_TRUE); if ( array != NULL ) { rt = gw_array_del_task(array,task_id); pthread_mutex_unlock(&(array->mutex)); if (rt == 0) { gw_array_pool_array_free(array_id); gw_log_print("DM",'I',"Array %i freed\n",array_id); } } else gw_log_print("DM",'E',"Could not delete task %i from array %i.\n", task_id, array_id); } gw_am_trigger(gw_dm.rm_am,"GW_RM_KILL_SUCCESS", _job_id); break; default: gw_log_print("DM",'E',"Zombie callback in wrong job (%i) state.\n", job_id); free(_job_id); pthread_mutex_unlock(&(job->mutex)); break; } }