void gw_dm_failed ( void *_job_id ) { gw_job_t * job; int job_id; /* ----------------------------------------------------------- */ /* 0.- Get job pointer */ /* ----------------------------------------------------------- */ if ( _job_id != NULL ) { job_id = *( (int *) _job_id ); job = gw_job_pool_get(job_id, GW_TRUE); if ( job == NULL ) { gw_log_print("DM",'E',"Job %i does not exist (JOB_STATE_FAILED).\n",job_id); free(_job_id); return; } } else return; /* ----------------------------------------------------------- */ /* 1.- Set state */ /* ----------------------------------------------------------- */ gw_log_print("DM",'I',"Job %i failed.\n",job->id); gw_job_set_state(job, GW_JOB_STATE_FAILED, GW_FALSE); gw_job_print(job,"DM",'I',"Job failed, history:\n"); gw_job_print_history(job); job->exit_time = time(NULL); if (job->history != NULL) job->history->reason = GW_REASON_EXECUTION_ERROR; if ( job->client_waiting > 0 ) gw_am_trigger(gw_dm.rm_am,"GW_RM_WAIT_SUCCESS", _job_id); else free(_job_id); /* -------- Update Host & User running jobs -------- */ gw_user_pool_dec_running_jobs(job->user_id); pthread_mutex_lock(&(job->history->host->mutex)); job->history->host->running_jobs--; pthread_mutex_unlock(&(job->history->host->mutex)); pthread_mutex_unlock(&(job->mutex)); }
void gw_em_cancel(void *_job_id) { int job_id; gw_job_t *job; gw_em_mad_t *mad; gw_em_state_t current_em_state; if ( _job_id != NULL ) { job_id = *( (int *) _job_id ); free(_job_id); job = gw_job_pool_get(job_id, GW_TRUE); if ( job == NULL ) { gw_log_print("EM",'E',"Job %s no longer exists (CANCEL).\n", job_id); return; } } else return; /* -------------------------------------------------------------------- */ current_em_state = job->em_state; if ( issubmitted(current_em_state) ) { gw_job_print (job,"EM",'I',"Cancelling job.\n"); mad = job->history->em_mad; /* Warning! When in Migration Cancel, the previous MAD should be used */ if (job->job_state == GW_JOB_STATE_MIGR_CANCEL) { if (job->history->next == NULL) { gw_log_print("EM",'E',"Previous history record of job %i no longer exists\n", job_id); pthread_mutex_unlock(&(job->mutex)); return; } else mad = job->history->next->em_mad; } gw_em_mad_cancel(mad, job_id); gw_log_print ("EM",'I',"Cancelling job %i.\n", job_id); } else gw_log_print ("EM",'W',"Ignoring cancel request for job %i, will re-try.\n", job_id); /* -------------------------------------------------------------------- */ pthread_mutex_unlock(&(job->mutex)); }
int gw_tm_prolog_build_urls(gw_job_t * job, char * src, char * dst, char ** src_url, char ** dst_url) { int is_gsiftp; char url_buffer[1024]; char *tmp; if ( src[0] == '/' ) { gw_job_print(job,"TM",'W',"\tSkipping file %s, absolute path.\n",src); return 1; } is_gsiftp = strstr(src,"gsiftp://") != NULL; if ( is_gsiftp ) { *src_url = gw_job_substitute (src,job); if (*src_url == NULL ) { gw_job_print(job,"TM",'E',"\tSkipping file %s, parse error.\n",src); return -1; } if ( dst == NULL ) { tmp = strrchr(src,'/'); if ( tmp == NULL ) { gw_job_print(job,"TM",'E',"\tSkipping file %s, no file in url.\n",src); free(*src_url); return -1; } else tmp++; } else tmp = dst; snprintf(url_buffer,sizeof(char)*1024,"%s%s",job->history->rdir,tmp); *dst_url = gw_job_substitute (url_buffer, job); if ( dst_url == NULL ) { gw_job_print(job,"TM",'E',"\tSkipping file %s, parse error.\n",url_buffer); free(*src_url); return -1; } } else { if ( strstr(src,"file://") != NULL ) { strncpy(url_buffer, src, sizeof(char) * 1024); if ( dst == NULL ) { tmp = strrchr(src,'/'); if ( tmp == NULL ) { gw_job_print(job,"TM",'E',"\tSkipping file %s, no file in url.\n",src); return -1; } else tmp++; } else { tmp = dst; } } else { snprintf(url_buffer,sizeof(char)*1024,"file://%s/%s",job->template.job_home, src); if ( dst == NULL ) tmp = src; else tmp = dst; }
void gw_dm_wrapper_done_cb ( void *_job_id ) { gw_job_t * job; int job_id; time_t total; time_t active; time_t suspension; /* ----------------------------------------------------------- */ /* 0.- Get job pointer */ /* ----------------------------------------------------------- */ if ( _job_id != NULL ) { job_id = *( (int *) _job_id ); job = gw_job_pool_get(job_id, GW_TRUE); if ( job == NULL ) { gw_log_print("DM",'E',"Job %i does not exist (JOB_STATE_WRAPPER_DONE_CB).\n",job_id); free(_job_id); return; } } else return; /* ----------------------------------------------------------- */ /* 1.- Set execution times & state transition */ /* ----------------------------------------------------------- */ job->em_state = GW_EM_STATE_INIT; switch (job->job_state) { case GW_JOB_STATE_PRE_WRAPPER: /* --------------- Update pre-wrapper stats -------------------- */ job->history->stats[PRE_WRAPPER_EXIT_TIME] = time(NULL); total = gw_job_history_get_pre_wrapper_time(job->history); active = job->history->stats[ACTIVE_TIME]; suspension = job->history->stats[SUSPENSION_TIME]; gw_job_print(job,"DM",'I',"Pre-Wrapper DONE:\n"); gw_job_print(job,"DM",'I',"\tActive time : %i\n", active); gw_job_print(job,"DM",'I',"\tSuspension time : %i\n", suspension); gw_job_print(job,"DM",'I',"\tTotal time : %i\n", total); /* -------------- Transition to Wrapper state ------------------ */ gw_am_trigger(&(gw_dm.am), "GW_DM_STATE_WRAPPER", _job_id); break; case GW_JOB_STATE_WRAPPER: /* ----------------- Update wrapper stats ---------------------- */ job->history->stats[WRAPPER_EXIT_TIME] = time(NULL); total = gw_job_history_get_wrapper_time(job->history); active = job->history->stats[ACTIVE_TIME]; suspension = job->history->stats[SUSPENSION_TIME]; gw_job_print(job,"DM",'I',"Wrapper DONE:\n"); gw_job_print(job,"DM",'I',"\tActive time : %i\n", active); gw_job_print(job,"DM",'I',"\tSuspension time : %i\n", suspension); gw_job_print(job,"DM",'I',"\tTotal time : %i\n", total); /* -------------- Free used slot from this host -------------- */ gw_host_dec_uslots(job->history->host, job->template.np);
void gw_em_listener(void *arg) { fd_set in_pipes; int i,j; int *job_id; int greater, rc, rcm; char c; char info[GW_EM_MAX_INFO]; char s_job_id[GW_EM_MAX_JOB_ID]; char result[GW_EM_MAX_RESULT]; char action[GW_EM_MAX_ACTION]; char str[GW_EM_MAX_STRING]; int fd; gw_job_t *job; time_t now; char contact_file[PATH_MAX]; FILE *file; gw_em_mad_t *em_mad; char *ptmp; int *fds; int num_fds; gw_em_mad_t **em_mads; pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL); pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL); fds = (int *) malloc(sizeof(int)*gw_conf.number_of_users*GW_MAX_MADS); em_mads = (gw_em_mad_t **) malloc(sizeof(gw_em_mad_t *) * gw_conf.number_of_users * GW_MAX_MADS); while (1) { greater = gw_user_pool_set_em_pipes(&in_pipes, fds, &num_fds, em_mads, gw_em.um_em_pipe_r); rc = select( greater+1, &in_pipes, NULL, NULL, NULL); if ( rc <= 0 ) continue; for (i=0; i<num_fds; i++) { fd = fds[i]; if ( FD_ISSET(fd, &in_pipes) ) { if ( fd == gw_em.um_em_pipe_r ) { rc = read(fd, (void *) &c, sizeof(char)); #ifdef GWEMDEBUG gw_log_print("EM",'D',"Updating MAD pipes (action is %c)\n",c); #endif continue; } #ifdef GWEMDEBUG gw_log_print("EM",'D',"Reading from MAD pipe %i.\n",i); #endif j = 0; do { rc = read(fd, (void *) &c, sizeof(char)); str[j++] = c; } while ((rc > 0) && (c != '\n') && (j < (GW_EM_MAX_STRING-1))); str[j] = '\0'; if (rc <= 0) { gw_log_print("EM",'W',"Error reading MAD (%s) message\n", em_mads[i]->name); rcm = gw_em_mad_reload(em_mads[i]); if ( rcm == 0 ) { gw_log_print("EM",'I',"MAD (%s) successfully reloaded\n", em_mads[i]->name); gw_job_pool_em_recover(em_mads[i], &(gw_em.am)); } else { gw_log_print("EM",'E',"Error reloading MAD (%s)\n", em_mads[i]->name); em_mads[i]->mad_em_pipe = -1; } continue; } sscanf(str,"%" GW2STR(GW_EM_MAX_ACTION) "s %" GW2STR(GW_EM_MAX_JOB_ID) "s %" GW2STR(GW_EM_MAX_RESULT) "s %" GW2STR(GW_EM_MAX_INFO) "[^\n]", action, s_job_id, result, info); #ifdef GWEMDEBUG gw_log_print("EM",'D',"MAD message received:\"%s %s %s %s\".\n", action, s_job_id, result, info); #endif if (s_job_id[0] == '-') continue; job_id = (int *) malloc (sizeof(int)); *job_id = atoi(s_job_id); job = gw_job_pool_get(*job_id, GW_TRUE); if (job == NULL) { gw_log_print("EM",'W',"MAD message for job %s, but it does not exist: \"%s %s %s %s\".\n", s_job_id,action, s_job_id, result, info); free(job_id); continue; } if (job->job_state != GW_JOB_STATE_PRE_WRAPPER && job->job_state != GW_JOB_STATE_WRAPPER && job->job_state != GW_JOB_STATE_MIGR_CANCEL && job->job_state != GW_JOB_STATE_STOP_CANCEL && job->job_state != GW_JOB_STATE_KILL_CANCEL) { gw_log_print("EM",'W',"MAD message for job %i but not in an execution state.\n", *job_id); free(job_id); pthread_mutex_unlock(&(job->mutex)); continue; } else if ( job->em_state == GW_EM_STATE_HARD_KILL ) { gw_log_print("EM",'W',"MAD message for job %i but it is being killed (hard).\n", *job_id); free(job_id); pthread_mutex_unlock(&(job->mutex)); continue; } if (strcmp(action, "SUBMIT") == 0) { if (strcmp(result, "FAILURE") == 0) { gw_job_print(job, "EM",'E',"Job submission failed: %s\n", info); gw_log_print("EM",'E',"Submission of job %d failed: %s.\n", job->id, info); gw_am_trigger(&(gw_em.am), "GW_EM_STATE_FAILED", (void *) job_id); } else /* Save persistent job contact */ { snprintf(contact_file, PATH_MAX-1, GW_VAR_DIR "/%i/job.contact", job->id); file = fopen(contact_file, "w"); if ( file != NULL ) { fprintf(file, "%s\n", info); fclose(file); } gw_am_trigger(&(gw_em.am), "GW_EM_STATE_PENDING", (void *) job_id); } } else if (strcmp(action, "CANCEL") == 0) { if (strcmp(result, "SUCCESS") == 0) { gw_job_print(job, "EM",'I',"Job cancel succeeded.\n"); gw_log_print("EM",'I',"Cancel of job %i succeeded.\n", *job_id); } else { gw_job_print(job, "EM",'E',"Job cancel failed (%s).\n",info); gw_log_print("EM",'E',"Cancel of job %d failed: %s.\n",job->id, info); } } else if (strcmp(action, "POLL") == 0) { if (strcmp(result, "SUCCESS") == 0) { now = time(NULL); job->next_poll_time = now + gw_conf.poll_interval/2 + gw_rand(gw_conf.poll_interval); gw_job_print(job, "EM",'E',"Job poll OK (%s), will poll again in %d seconds.\n", info, job->next_poll_time - now); if (strcmp(info, "PENDING") == 0) gw_am_trigger(&(gw_em.am), "GW_EM_STATE_PENDING", (void *) job_id); else if (strcmp(info, "SUSPENDED") == 0) gw_am_trigger(&(gw_em.am), "GW_EM_STATE_SUSPENDED", (void *) job_id); else if (strcmp(info, "ACTIVE") == 0) gw_am_trigger(&(gw_em.am),"GW_EM_STATE_ACTIVE", (void *) job_id); else if (strstr(info, "DONE") != NULL) { ptmp = strstr(info,"DONE:"); if ((ptmp != NULL) && (strlen(ptmp+5) > 0))/*No-wrapper mode*/ job->exit_code=atoi(ptmp+5); gw_am_trigger(&(gw_em.am), "GW_EM_STATE_DONE", (void *) job_id); } else if (strcmp(info, "FAILED") == 0) gw_am_trigger(&(gw_em.am), "GW_EM_STATE_FAILED", (void *) job_id); } else { job->history->failed_polls++; em_mad = job->history->em_mad; if ( job->history->failed_polls == 3 ) { gw_job_print(job, "EM",'E',"Job poll failed (%s), assuming the job is done.\n",info); gw_am_trigger(&(gw_em.am), "GW_EM_STATE_DONE", (void *) job_id); } else { now = time(NULL); job->next_poll_time = now + gw_conf.poll_interval*job->history->failed_polls + gw_rand(gw_conf.poll_interval*job->history->failed_polls); gw_job_print(job, "EM",'E',"Job poll failed (%s), will poll again in %d seconds.\n", info, job->next_poll_time - now); free(job_id); } } } else if (strcmp(action, "RECOVER") == 0) { if (strcmp(result, "SUCCESS") == 0) { if (strcmp(info, "PENDING") == 0) gw_am_trigger(&(gw_em.am), "GW_EM_STATE_PENDING", (void *) job_id); else if (strcmp(info, "SUSPENDED") == 0) gw_am_trigger(&(gw_em.am), "GW_EM_STATE_SUSPENDED", (void *) job_id); else if (strcmp(info, "ACTIVE") == 0) gw_am_trigger(&(gw_em.am),"GW_EM_STATE_ACTIVE", (void *) job_id); else if (strstr(info, "DONE") != NULL) { ptmp = strstr(info,"DONE:"); if ((ptmp != NULL) && (strlen(ptmp+5) > 0))/*No-wrapper mode*/ job->exit_code = atoi(ptmp+5); gw_am_trigger(&(gw_em.am), "GW_EM_STATE_DONE", (void *) job_id); } else if (strcmp(info, "FAILED") == 0) { /* Do not retry */ job->history->tries= job->template.number_of_retries; gw_am_trigger(&(gw_em.am), "GW_EM_STATE_FAILED", (void *) job_id); } } else { gw_job_print(job,"EM",'E',"Job recover failed (%s), assuming the job is done.\n", info); gw_log_print("EM",'E',"Recover of job %i failed.\n", *job_id); gw_am_trigger(&(gw_em.am), "GW_EM_STATE_DONE", (void *) job_id); } }
void gw_dm_wrapper_failed_cb ( void *_job_id ) { gw_job_t * job; int job_id; time_t total; /* ----------------------------------------------------------- */ /* 0.- Get job pointer */ /* ----------------------------------------------------------- */ if ( _job_id != NULL ) { job_id = *( (int *) _job_id ); job = gw_job_pool_get(job_id, GW_TRUE); if ( job == NULL ) { gw_log_print("DM",'E',"Job %i does not exist (JOB_STATE_WRAPPER_FAILED_CB).\n",job_id); free(_job_id); return; } } else return; /* ----------------------------------------------------------- */ /* 1.- Set execution times & state transition */ /* ----------------------------------------------------------- */ job->em_state = GW_EM_STATE_INIT; switch (job->job_state) { case GW_JOB_STATE_PRE_WRAPPER: /* --------------- Update pre-wrapper stats -------------------- */ job->history->stats[PRE_WRAPPER_EXIT_TIME] = time(NULL); total = gw_job_history_get_pre_wrapper_time(job->history); gw_job_print(job,"DM",'E',"Pre-Wrapper failed:\n"); gw_job_print(job,"DM",'E',"\tTotal time : %i\n", total); break; case GW_JOB_STATE_WRAPPER: /* ----------------- Update wrapper stats ---------------------- */ job->history->stats[WRAPPER_EXIT_TIME] = time(NULL); total = gw_job_history_get_wrapper_time(job->history); gw_job_print(job,"DM",'E',"Wrapper failed:\n"); gw_job_print(job,"DM",'E',"\tTotal time : %i\n", total); /* ---------- We do not need to re-schedule this job --------- */ if ( job->reschedule == GW_TRUE ) { job->reschedule = GW_FALSE; gw_dm_mad_job_del(&gw_dm.dm_mad[0],job->id); } break; default: gw_log_print("DM",'E',"Wrapper failed callback in wrong job (%i) state.\n", job_id); break; } /* ----------------------------------------------------------- */ /* 1.- State transtition */ /* ----------------------------------------------------------- */ /* -------------- Free used slot from this host -------------- */ if (job->history != NULL) { job->history->reason = GW_REASON_EXECUTION_ERROR; gw_host_dec_uslots(job->history->host); } /* ----------------------------------------------------------- */ gw_am_trigger(&(gw_dm.am), "GW_DM_STATE_EPILOG_FAIL", _job_id); pthread_mutex_unlock(&(job->mutex)); }
void gw_dm_wrapper_done_cb ( void *_job_id ) { gw_job_t * job; int job_id; time_t total; time_t active; time_t suspension; /* ----------------------------------------------------------- */ /* 0.- Get job pointer */ /* ----------------------------------------------------------- */ if ( _job_id != NULL ) { job_id = *( (int *) _job_id ); job = gw_job_pool_get(job_id, GW_TRUE); if ( job == NULL ) { gw_log_print("DM",'E',"Job %i does not exist (JOB_STATE_WRAPPER_DONE_CB).\n",job_id); free(_job_id); return; } } else return; /* ----------------------------------------------------------- */ /* 1.- Set execution times & state transition */ /* ----------------------------------------------------------- */ job->em_state = GW_EM_STATE_INIT; switch (job->job_state) { case GW_JOB_STATE_PRE_WRAPPER: /* --------------- Update pre-wrapper stats -------------------- */ job->history->stats[PRE_WRAPPER_EXIT_TIME] = time(NULL); total = gw_job_history_get_pre_wrapper_time(job->history); active = job->history->stats[ACTIVE_TIME]; suspension = job->history->stats[SUSPENSION_TIME]; gw_job_print(job,"DM",'I',"Pre-Wrapper DONE:\n"); gw_job_print(job,"DM",'I',"\tActive time : %i\n", active); gw_job_print(job,"DM",'I',"\tSuspension time : %i\n", suspension); gw_job_print(job,"DM",'I',"\tTotal time : %i\n", total); /* -------------- Transition to Wrapper state ------------------ */ gw_am_trigger(&(gw_dm.am), "GW_DM_STATE_WRAPPER", _job_id); break; case GW_JOB_STATE_WRAPPER: /* ----------------- Update wrapper stats ---------------------- */ job->history->stats[WRAPPER_EXIT_TIME] = time(NULL); total = gw_job_history_get_wrapper_time(job->history); active = job->history->stats[ACTIVE_TIME]; suspension = job->history->stats[SUSPENSION_TIME]; gw_job_print(job,"DM",'I',"Wrapper DONE:\n"); gw_job_print(job,"DM",'I',"\tActive time : %i\n", active); gw_job_print(job,"DM",'I',"\tSuspension time : %i\n", suspension); gw_job_print(job,"DM",'I',"\tTotal time : %i\n", total); /* -------------- Free used slot from this host -------------- */ gw_host_dec_uslots(job->history->host); /* ---------- We do not need to re-schedule this job --------- */ if ( job->reschedule == GW_TRUE ) { job->reschedule = GW_FALSE; gw_dm_mad_job_del(&gw_dm.dm_mad[0],job->id); } /* -------------- Transition to Epilog state ------------------ */ gw_am_trigger(&(gw_dm.am), "GW_DM_STATE_EPILOG_STD", _job_id); break; case GW_JOB_STATE_STOP_CANCEL: /* ----------------- Update wrapper stats ---------------------- */ job->history->stats[WRAPPER_EXIT_TIME] = time(NULL); total = gw_job_history_get_wrapper_time(job->history); active = job->history->stats[ACTIVE_TIME]; suspension = job->history->stats[SUSPENSION_TIME]; gw_job_print(job,"DM",'I',"Wrapper CANCELED:\n"); gw_job_print(job,"DM",'I',"\tActive time : %i\n", active); gw_job_print(job,"DM",'I',"\tSuspension time : %i\n", suspension); gw_job_print(job,"DM",'I',"\tTotal time : %i\n", total); /* -------------- Free used slot from this host -------------- */ gw_host_dec_uslots(job->history->host); /* ------------ Transition to Stop Epilog state --------------- */ gw_am_trigger(&(gw_dm.am), "GW_DM_STATE_STOP_EPILOG", _job_id); break; case GW_JOB_STATE_KILL_CANCEL: /* ----------------- Update wrapper stats ---------------------- */ job->history->stats[WRAPPER_EXIT_TIME] = time(NULL); total = gw_job_history_get_wrapper_time(job->history); active = job->history->stats[ACTIVE_TIME]; suspension = job->history->stats[SUSPENSION_TIME]; gw_job_print(job,"DM",'I',"Wrapper CANCELED:\n"); gw_job_print(job,"DM",'I',"\tActive time : %i\n", active); gw_job_print(job,"DM",'I',"\tSuspension time : %i\n", suspension); gw_job_print(job,"DM",'I',"\tTotal time : %i\n", total); /* -------------- Free used slot from this host -------------- */ gw_host_dec_uslots(job->history->host); /* ------------ Transition to Kill Epilog state ---------------- */ gw_am_trigger(&(gw_dm.am), "GW_DM_STATE_KILL_EPILOG", _job_id); break; case GW_JOB_STATE_MIGR_CANCEL: /* ----------- Update previous wrapper stats ------------------- */ job->history->next->stats[WRAPPER_EXIT_TIME] = time(NULL); active = job->history->next->stats[ACTIVE_TIME]; suspension = job->history->next->stats[SUSPENSION_TIME]; gw_job_print(job,"DM",'I',"Wrapper CANCELED:\n"); gw_job_print(job,"DM",'I',"\tActive time : %i\n", active); gw_job_print(job,"DM",'I',"\tSuspension time : %i\n", suspension); /* -------------- Free used slot from previous host ------------ */ gw_host_dec_uslots(job->history->next->host); /* ---------- Transition to Migration Prolog state ------------ */ gw_am_trigger(&(gw_dm.am), "GW_DM_STATE_MIGR_PROLOG", _job_id); break; default: gw_log_print("DM",'E',"Wrapper done callback in wrong job (%i) state.\n", job_id); free(_job_id); break; } pthread_mutex_unlock(&(job->mutex)); }
void gw_dm_pending( void *_job_id ) { gw_job_t *job; int job_id; gw_boolean_t failed; gw_migration_reason_t reason; /* ----------------------------------------------------------- */ /* 0.- Get job pointer */ /* ----------------------------------------------------------- */ if ( _job_id != NULL ) { job_id = *( (int *) _job_id ); job = gw_job_pool_get(job_id, GW_TRUE); if ( job == NULL ) { gw_log_print("DM",'E',"Job %i does not exist (JOB_STATE_PENDING).\n",job_id); free(_job_id); return; } } else return; /* ----------------------------------------------------------- */ /* 1.- Set state */ /* ----------------------------------------------------------- */ gw_job_print(job,"DM",'I',"Rescheduling job.\n"); gw_log_print("DM",'I',"Rescheduling job %d.\n", job->id); gw_job_set_state (job, GW_JOB_STATE_PENDING, GW_FALSE); /* -------- Update Host & User running jobs -------- */ gw_user_pool_dec_running_jobs(job->user_id); gw_host_dec_rjobs(job->history->host); /* ------------- Restart counter --------------- */ job->restarted++; /* ------------- Notify the Scheduler --------------- */ if (job->history != NULL) { reason = GW_REASON_NONE; failed = (job->history->reason == GW_REASON_EXECUTION_ERROR) || (job->history->reason == GW_REASON_PERFORMANCE); if (failed) { gw_dm_mad_job_failed(&gw_dm.dm_mad[0], job->history->host->host_id, job->user_id, job->history->reason); } gw_dm_mad_job_schedule(&gw_dm.dm_mad[0], job_id, job->array_id, job->user_id, reason); } else gw_log_print("DM",'E',"Rescheduling job %d, but no history records found.\n", job->id); /* ------------------------------------------------- */ free(_job_id); pthread_mutex_unlock(&(job->mutex)); }
void gw_dm_zombie ( void *_job_id ) { gw_job_t * job; gw_array_t * array; int job_id; int task_id; int array_id; int rt; char conf_filename[2048]; time_t prolog, epilog; /* ----------------------------------------------------------- */ /* 0.- Get job pointer */ /* ----------------------------------------------------------- */ if ( _job_id == NULL ) return; job_id = *( (int *) _job_id ); job = gw_job_pool_get(job_id, GW_TRUE); if ( job == NULL ) { gw_log_print("DM",'E',"Job %i does not exist (JOB_STATE_ZOMBIE).\n",job_id); free(_job_id); return; } /* ----------------------------------------------------------- */ /* 0.- Update Job state */ /* ----------------------------------------------------------- */ switch (job->job_state) { case GW_JOB_STATE_EPILOG: gw_job_set_state(job, GW_JOB_STATE_ZOMBIE, GW_FALSE); gw_log_print("DM",'I',"Job %i done, with exit code %i.\n",job->id, job->exit_code); job->history->reason = GW_REASON_NONE; job->exit_time = time(NULL); /* ------------- Print job history and send usage ------------ */ gw_job_print(job,"DM",'I',"Job done, history:\n"); gw_job_print_history(job); gw_job_send_usage(job); if ( job->client_waiting > 0 ) gw_am_trigger(gw_dm.rm_am,"GW_RM_WAIT_SUCCESS", _job_id); else { if (gw_conf.dispose == GW_TRUE) gw_am_trigger(&(gw_dm.am), "GW_DM_KILL", _job_id); else free(_job_id); } /* -------- Update User & Host running jobs -------- */ gw_user_pool_dec_running_jobs(job->user_id); gw_host_dec_rjobs(job->history->host); /* -------- Notify the scheduler -------- */ prolog = gw_job_history_get_prolog_time(job->history); epilog = gw_job_history_get_epilog_time(job->history); gw_dm_mad_job_success(&gw_dm.dm_mad[0], job->history->host->host_id, job->user_id, (prolog + epilog), job->history->stats[SUSPENSION_TIME], job->history->stats[ACTIVE_TIME]); pthread_mutex_unlock(&(job->mutex)); /* -------- Update other jobs dependencies -------- */ gw_job_pool_dep_check(job_id); break; case GW_JOB_STATE_KILL_EPILOG: gw_job_set_state(job, GW_JOB_STATE_ZOMBIE, GW_FALSE); job->exit_time = time(NULL); /* ------------- Print job history and send usage ------------ */ gw_job_print(job,"DM",'I',"Job killed, history:\n"); gw_job_print_history(job); gw_job_send_usage(job); /* ---------------- Free job & Notify RM ---------------- */ array_id = job->array_id; task_id = job->task_id; /* -------- Update User & Host running jobs -------- */ gw_user_pool_dec_running_jobs(job->user_id); gw_host_dec_rjobs(job->history->host); sprintf(conf_filename, "%s/job.conf", job->directory); unlink(conf_filename); pthread_mutex_unlock(&(job->mutex)); /* ------------------------------------------------- */ gw_job_pool_free(job_id); gw_log_print("DM",'I',"Job %i killed and freed.\n", job_id); if (array_id != -1) { array = gw_array_pool_get_array(array_id,GW_TRUE); if ( array != NULL ) { rt = gw_array_del_task(array,task_id); pthread_mutex_unlock(&(array->mutex)); if (rt == 0) { gw_array_pool_array_free(array_id); gw_log_print("DM",'I',"Array %i freed\n",array_id); } } else gw_log_print("DM",'E',"Could not delete task %i from array %i.\n", task_id, array_id); } gw_am_trigger(gw_dm.rm_am,"GW_RM_KILL_SUCCESS", _job_id); break; default: gw_log_print("DM",'E',"Zombie callback in wrong job (%i) state.\n", job_id); free(_job_id); pthread_mutex_unlock(&(job->mutex)); break; } }
void gw_em_submit(void *_job_id) { int job_id; gw_job_t *job; char *rsl=NULL; char *contact; gw_job_state_t state; char rsl_filename[2048]; FILE *fd; time_t now; /* ----------------------------------------------------------- */ /* 0.- Get job pointer, check if it exits and lock mutex */ /* ----------------------------------------------------------- */ if ( _job_id != NULL ) { job_id = *( (int *) _job_id ); job = gw_job_pool_get(job_id, GW_TRUE); if ( job == NULL ) { gw_log_print("EM",'E',"Job %s no longer exists (PENDING).\n", job_id); return; } } else return; if (job->history == NULL) { gw_log_print("EM",'E',"History of job %s doesn't exists\n", job_id); free(_job_id); pthread_mutex_unlock(&(job->mutex)); return; } state = job->job_state; /* ----------------------------------------------------------- */ /* 1.- Get execution MAD for this host */ /* ----------------------------------------------------------- */ job->em_state = GW_EM_STATE_INIT; job->history->counter = -1; if ( job->job_state == GW_JOB_STATE_PRE_WRAPPER ) { contact = job->history->em_fork_rc; rsl = (char *) job->history->em_mad->pre_wrapper_rsl((void *) job); } else { contact = job->history->em_rc; rsl = (char *) job->history->em_mad->wrapper_rsl((void *) job); } if ( rsl == NULL ) { job->em_state = GW_EM_STATE_FAILED; gw_log_print("EM",'E',"Job %i failed, could not generate RSL.\n", job_id); gw_am_trigger(gw_em.dm_am, "GW_DM_WRAPPER_FAILED", _job_id); pthread_mutex_unlock(&(job->mutex)); return; } sprintf(rsl_filename, "%s/job.rsl.%i", job->directory,job->restarted); fd = fopen(rsl_filename,"w"); if (fd != NULL ) { gw_job_print(job,"EM",'I',"Submitting wrapper to %s, RSL used is in %s.\n",contact,rsl_filename); fprintf(fd,"%s",rsl); fclose(fd); } else { job->em_state = GW_EM_STATE_FAILED; gw_log_print("EM",'E',"Job %i failed, could not open RSL file.\n", job_id); gw_job_print(job,"EM",'E',"Job failed, could not open RSL file %s.\n",rsl_filename); gw_am_trigger(gw_em.dm_am, "GW_DM_WRAPPER_FAILED", _job_id); pthread_mutex_unlock(&(job->mutex)); return; } /* -------------------------------------------------------------------- */ now = time(NULL); job->next_poll_time = now + gw_conf.poll_interval/2 + gw_rand(gw_conf.poll_interval); /* randomize polls */ gw_job_print(job,"EM",'I',"Job will be polled in %d seconds.\n", job->next_poll_time-now); job->last_checkpoint_time = 0; job->history->stats[LAST_SUSPENSION_TIME] = now; job->history->stats[SUSPENSION_TIME] = 0; job->history->stats[ACTIVE_TIME] = 0; job->history->tries++; /* -------------------------------------------------------------------- */ pthread_mutex_unlock(&(job->mutex)); gw_em_mad_submit(job->history->em_mad, job_id, contact, rsl_filename); /* -------------------------------------------------------------------- */ free(_job_id); free(rsl); }