Exemple #1
0
void gw_dm_failed ( void *_job_id )
{
    gw_job_t * job;
    int        job_id;
    
	/* ----------------------------------------------------------- */  
    /* 0.- Get job pointer                                         */
    /* ----------------------------------------------------------- */  
    
	if ( _job_id != NULL )
	{
		job_id = *( (int *) _job_id );

		job = gw_job_pool_get(job_id, GW_TRUE);

		if ( job == NULL )
		{
			gw_log_print("DM",'E',"Job %i does not exist (JOB_STATE_FAILED).\n",job_id);

			free(_job_id);
			return;
		}
	}
	else
		return;

    /* ----------------------------------------------------------- */  
    /* 1.- Set state                                               */
    /* ----------------------------------------------------------- */  

   	gw_log_print("DM",'I',"Job %i failed.\n",job->id);
   	                    
    gw_job_set_state(job, GW_JOB_STATE_FAILED, GW_FALSE);
    gw_job_print(job,"DM",'I',"Job failed, history:\n");
        
    gw_job_print_history(job);
    
    job->exit_time = time(NULL);

    if (job->history != NULL)
	    job->history->reason = GW_REASON_EXECUTION_ERROR;

                    
    if ( job->client_waiting > 0 )
       	gw_am_trigger(gw_dm.rm_am,"GW_RM_WAIT_SUCCESS", _job_id);
    else
       	free(_job_id);

	/* -------- Update Host & User running jobs -------- */       	
    
    gw_user_pool_dec_running_jobs(job->user_id);

   	pthread_mutex_lock(&(job->history->host->mutex));

	job->history->host->running_jobs--;

	pthread_mutex_unlock(&(job->history->host->mutex));            
       	    				    
    pthread_mutex_unlock(&(job->mutex));
}
Exemple #2
0
void gw_em_cancel(void *_job_id)
{
    int           job_id;
    gw_job_t      *job;
    gw_em_mad_t   *mad;
    gw_em_state_t current_em_state;

	if ( _job_id != NULL )
	{
		job_id = *( (int *) _job_id );
		free(_job_id);
		
		job = gw_job_pool_get(job_id, GW_TRUE);
	
		if ( job == NULL )
		{
			gw_log_print("EM",'E',"Job %s no longer exists (CANCEL).\n", job_id);
			return;
		}
	}
	else
		return;
    
    /* -------------------------------------------------------------------- */
            
    current_em_state = job->em_state;
        
    if ( issubmitted(current_em_state) )
    {
        gw_job_print (job,"EM",'I',"Cancelling job.\n");
        
        mad = job->history->em_mad;

        /* Warning! When in Migration Cancel, the previous MAD should be used */
        if (job->job_state == GW_JOB_STATE_MIGR_CANCEL)
        {
            if (job->history->next == NULL) 
            {
                gw_log_print("EM",'E',"Previous history record of job %i no longer exists\n", job_id);
				pthread_mutex_unlock(&(job->mutex));                        
                return;
            } 
            else
                mad = job->history->next->em_mad;
        }
        
        gw_em_mad_cancel(mad, job_id);
        
        gw_log_print ("EM",'I',"Cancelling job %i.\n", job_id);

    }
    else
        gw_log_print ("EM",'W',"Ignoring cancel request for job %i, will re-try.\n",
                job_id);    

    /* -------------------------------------------------------------------- */
            
    pthread_mutex_unlock(&(job->mutex));        
}
Exemple #3
0
int gw_tm_prolog_build_urls(gw_job_t *   job,
                            char *       src,
                            char *       dst,
                            char **      src_url,
                            char **      dst_url)
{
    int   is_gsiftp;
    char  url_buffer[1024];
    char  *tmp;

    if ( src[0] == '/' )
    {
        gw_job_print(job,"TM",'W',"\tSkipping file %s, absolute path.\n",src);

        return 1;
    }

    is_gsiftp = strstr(src,"gsiftp://") != NULL;

    if ( is_gsiftp )
    {
        *src_url = gw_job_substitute (src,job);
        if (*src_url == NULL )
        {
            gw_job_print(job,"TM",'E',"\tSkipping file %s, parse error.\n",src);
            return -1;
        }

        if ( dst == NULL )
        {
            tmp = strrchr(src,'/');
            if ( tmp == NULL )
            {
                gw_job_print(job,"TM",'E',"\tSkipping file %s, no file in url.\n",src);
                free(*src_url);
                return -1;
            }
            else
                tmp++;
        }
        else
            tmp = dst;

        snprintf(url_buffer,sizeof(char)*1024,"%s%s",job->history->rdir,tmp);

        *dst_url = gw_job_substitute (url_buffer, job);
        if ( dst_url == NULL )
        {
            gw_job_print(job,"TM",'E',"\tSkipping file %s, parse error.\n",url_buffer);

            free(*src_url);
            return -1;
        }

    }
    else
    {
        if ( strstr(src,"file://") != NULL )
        {
            strncpy(url_buffer, src, sizeof(char) * 1024);

            if ( dst == NULL )
            {
                tmp = strrchr(src,'/');

                if ( tmp == NULL )
                {
                    gw_job_print(job,"TM",'E',"\tSkipping file %s, no file in url.\n",src);
                    return -1;
                }
                else
                    tmp++;
            }
            else
            {
                tmp = dst;
            }
        }
        else
        {
            snprintf(url_buffer,sizeof(char)*1024,"file://%s/%s",job->template.job_home, src);

            if ( dst == NULL )
                tmp = src;
            else
                tmp = dst;
        }
void gw_dm_wrapper_done_cb ( void *_job_id )
{
	gw_job_t *     job;
    int            job_id;
    time_t         total;
    time_t         active;
    time_t         suspension;

	/* ----------------------------------------------------------- */  
    /* 0.- Get job pointer                                         */
    /* ----------------------------------------------------------- */  
    
	if ( _job_id != NULL )
	{
		job_id = *( (int *) _job_id );

		job = gw_job_pool_get(job_id, GW_TRUE);

		if ( job == NULL )
		{
			gw_log_print("DM",'E',"Job %i does not exist (JOB_STATE_WRAPPER_DONE_CB).\n",job_id);
			
			free(_job_id);
			return;
		}
	}
	else
		return;
    
    /* ----------------------------------------------------------- */  
    /* 1.- Set execution times & state transition                  */
    /* ----------------------------------------------------------- */  
    
    job->em_state = GW_EM_STATE_INIT;
    
    switch (job->job_state)
    {
    	case GW_JOB_STATE_PRE_WRAPPER:
    		
    		/* --------------- Update pre-wrapper stats -------------------- */
    	
		    job->history->stats[PRE_WRAPPER_EXIT_TIME] = time(NULL);
		    
		    total      = gw_job_history_get_pre_wrapper_time(job->history);
		    active     = job->history->stats[ACTIVE_TIME];
    		suspension = job->history->stats[SUSPENSION_TIME];

		    gw_job_print(job,"DM",'I',"Pre-Wrapper DONE:\n");
		    gw_job_print(job,"DM",'I',"\tActive time     : %i\n", active);
		    gw_job_print(job,"DM",'I',"\tSuspension time : %i\n", suspension);
		    gw_job_print(job,"DM",'I',"\tTotal time      : %i\n", total);

    		/* -------------- Transition to Wrapper state ------------------ */
    				    
		    gw_am_trigger(&(gw_dm.am), "GW_DM_STATE_WRAPPER", _job_id);		    
    		break;
    		
    	case GW_JOB_STATE_WRAPPER:
    	
    		/* ----------------- Update wrapper stats ---------------------- */
    		
		    job->history->stats[WRAPPER_EXIT_TIME] = time(NULL);
		    
		    total      = gw_job_history_get_wrapper_time(job->history);
		    active     = job->history->stats[ACTIVE_TIME];
    		suspension = job->history->stats[SUSPENSION_TIME];

		    gw_job_print(job,"DM",'I',"Wrapper DONE:\n");
		    gw_job_print(job,"DM",'I',"\tActive time     : %i\n", active);
		    gw_job_print(job,"DM",'I',"\tSuspension time : %i\n", suspension);
		    gw_job_print(job,"DM",'I',"\tTotal time      : %i\n", total);

    		/* -------------- Free used slot from this host -------------- */

            gw_host_dec_uslots(job->history->host, job->template.np);
void gw_em_listener(void *arg)
{
    fd_set in_pipes;
    int i,j;
    int *job_id;
    int greater, rc, rcm;
    char c;

    char info[GW_EM_MAX_INFO];
    char s_job_id[GW_EM_MAX_JOB_ID];
    char result[GW_EM_MAX_RESULT];
    char action[GW_EM_MAX_ACTION];
    char str[GW_EM_MAX_STRING];

    int fd;
    gw_job_t *job;
    time_t now;

    char contact_file[PATH_MAX];
    FILE *file;

    gw_em_mad_t *em_mad;

    char *ptmp;

    int *fds;
    int num_fds;
    gw_em_mad_t **em_mads;

    pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL);
    pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL);

    fds = (int *) malloc(sizeof(int)*gw_conf.number_of_users*GW_MAX_MADS);

    em_mads = (gw_em_mad_t **) malloc(sizeof(gw_em_mad_t *) *
                                      gw_conf.number_of_users * GW_MAX_MADS);
    while (1)
    {
        greater = gw_user_pool_set_em_pipes(&in_pipes, fds, &num_fds, em_mads, gw_em.um_em_pipe_r);

        rc = select( greater+1, &in_pipes, NULL, NULL, NULL);

        if ( rc <= 0 )
            continue;

        for (i=0; i<num_fds; i++)
        {
            fd = fds[i];

            if ( FD_ISSET(fd, &in_pipes) )
            {
                if ( fd == gw_em.um_em_pipe_r )
                {
                    rc = read(fd, (void *) &c, sizeof(char));
#ifdef GWEMDEBUG
                    gw_log_print("EM",'D',"Updating MAD pipes (action is %c)\n",c);
#endif
                    continue;
                }

#ifdef GWEMDEBUG
                gw_log_print("EM",'D',"Reading from MAD pipe %i.\n",i);
#endif

                j = 0;

                do
                {
                    rc = read(fd, (void *) &c, sizeof(char));
                    str[j++] = c;
                }
                while ((rc > 0) && (c != '\n') && (j < (GW_EM_MAX_STRING-1)));

                str[j] = '\0';

                if (rc <= 0)
                {
                    gw_log_print("EM",'W',"Error reading MAD (%s) message\n",
                                 em_mads[i]->name);

                    rcm = gw_em_mad_reload(em_mads[i]);

                    if ( rcm == 0 )
                    {
                        gw_log_print("EM",'I',"MAD (%s) successfully reloaded\n",
                                     em_mads[i]->name);

                        gw_job_pool_em_recover(em_mads[i], &(gw_em.am));
                    }
                    else
                    {
                        gw_log_print("EM",'E',"Error reloading MAD (%s)\n",
                                     em_mads[i]->name);

                        em_mads[i]->mad_em_pipe = -1;
                    }
                    continue;
                }

                sscanf(str,"%" GW2STR(GW_EM_MAX_ACTION) "s %"
                       GW2STR(GW_EM_MAX_JOB_ID) "s %"
                       GW2STR(GW_EM_MAX_RESULT) "s %"
                       GW2STR(GW_EM_MAX_INFO) "[^\n]",
                       action, s_job_id, result, info);

#ifdef GWEMDEBUG
                gw_log_print("EM",'D',"MAD message received:\"%s %s %s %s\".\n",
                             action, s_job_id, result, info);
#endif
                if (s_job_id[0] == '-')
                    continue;

                job_id = (int *) malloc (sizeof(int));

                *job_id = atoi(s_job_id);

                job = gw_job_pool_get(*job_id, GW_TRUE);

                if (job == NULL)
                {
                    gw_log_print("EM",'W',"MAD message for job %s, but it does not exist: \"%s %s %s %s\".\n",
                                 s_job_id,action, s_job_id, result, info);
                    free(job_id);

                    continue;
                }
                if (job->job_state != GW_JOB_STATE_PRE_WRAPPER
                        && job->job_state != GW_JOB_STATE_WRAPPER
                        && job->job_state != GW_JOB_STATE_MIGR_CANCEL
                        && job->job_state != GW_JOB_STATE_STOP_CANCEL
                        && job->job_state != GW_JOB_STATE_KILL_CANCEL)
                {
                    gw_log_print("EM",'W',"MAD message for job %i but not in an execution state.\n",
                                 *job_id);

                    free(job_id);
                    pthread_mutex_unlock(&(job->mutex));
                    continue;
                }
                else if ( job->em_state == GW_EM_STATE_HARD_KILL )
                {
                    gw_log_print("EM",'W',"MAD message for job %i but it is being killed (hard).\n",
                                 *job_id);

                    free(job_id);
                    pthread_mutex_unlock(&(job->mutex));
                    continue;
                }

                if (strcmp(action, "SUBMIT") == 0)
                {
                    if (strcmp(result, "FAILURE") == 0)
                    {
                        gw_job_print(job, "EM",'E',"Job submission failed: %s\n",
                                     info);
                        gw_log_print("EM",'E',"Submission of job %d failed: %s.\n",
                                     job->id, info);

                        gw_am_trigger(&(gw_em.am), "GW_EM_STATE_FAILED",
                                      (void *) job_id);
                    }
                    else /* Save persistent job contact */
                    {
                        snprintf(contact_file, PATH_MAX-1,
                                 GW_VAR_DIR "/%i/job.contact",
                                 job->id);

                        file = fopen(contact_file, "w");

                        if ( file != NULL )
                        {
                            fprintf(file, "%s\n", info);
                            fclose(file);
                        }

                        gw_am_trigger(&(gw_em.am), "GW_EM_STATE_PENDING",
                                      (void *) job_id);
                    }
                }
                else if (strcmp(action, "CANCEL") == 0)
                {
                    if (strcmp(result, "SUCCESS") == 0)
                    {
                        gw_job_print(job, "EM",'I',"Job cancel succeeded.\n");
                        gw_log_print("EM",'I',"Cancel of job %i succeeded.\n", *job_id);
                    }
                    else
                    {
                        gw_job_print(job, "EM",'E',"Job cancel failed (%s).\n",info);
                        gw_log_print("EM",'E',"Cancel of job %d failed: %s.\n",job->id, info);
                    }
                }
                else if (strcmp(action, "POLL") == 0)
                {
                    if (strcmp(result, "SUCCESS") == 0)
                    {
                        now = time(NULL);
                        job->next_poll_time = now + gw_conf.poll_interval/2
                                              + gw_rand(gw_conf.poll_interval);

                        gw_job_print(job, "EM",'E',"Job poll OK (%s), will poll again in %d seconds.\n",
                                     info, job->next_poll_time - now);

                        if (strcmp(info, "PENDING") == 0)
                            gw_am_trigger(&(gw_em.am), "GW_EM_STATE_PENDING",
                                          (void *) job_id);

                        else if (strcmp(info, "SUSPENDED") == 0)
                            gw_am_trigger(&(gw_em.am), "GW_EM_STATE_SUSPENDED",
                                          (void *) job_id);

                        else if (strcmp(info, "ACTIVE") == 0)
                            gw_am_trigger(&(gw_em.am),"GW_EM_STATE_ACTIVE",
                                          (void *) job_id);

                        else if (strstr(info, "DONE") != NULL)
                        {
                            ptmp = strstr(info,"DONE:");

                            if ((ptmp != NULL) && (strlen(ptmp+5) > 0))/*No-wrapper mode*/
                                job->exit_code=atoi(ptmp+5);

                            gw_am_trigger(&(gw_em.am), "GW_EM_STATE_DONE",
                                          (void *) job_id);
                        }
                        else if (strcmp(info, "FAILED") == 0)
                            gw_am_trigger(&(gw_em.am), "GW_EM_STATE_FAILED",
                                          (void *) job_id);
                    }
                    else
                    {
                        job->history->failed_polls++;

                        em_mad = job->history->em_mad;

                        if ( job->history->failed_polls == 3 )
                        {
                            gw_job_print(job, "EM",'E',"Job poll failed (%s), assuming the job is done.\n",info);

                            gw_am_trigger(&(gw_em.am), "GW_EM_STATE_DONE", (void *) job_id);
                        }
                        else
                        {
                            now = time(NULL);
                            job->next_poll_time = now + gw_conf.poll_interval*job->history->failed_polls
                                                  + gw_rand(gw_conf.poll_interval*job->history->failed_polls);

                            gw_job_print(job, "EM",'E',"Job poll failed (%s), will poll again in %d seconds.\n",
                                         info, job->next_poll_time - now);

                            free(job_id);
                        }
                    }
                }
                else if (strcmp(action, "RECOVER") == 0)
                {
                    if (strcmp(result, "SUCCESS") == 0)
                    {
                        if (strcmp(info, "PENDING") == 0)
                            gw_am_trigger(&(gw_em.am), "GW_EM_STATE_PENDING",
                                          (void *) job_id);
                        else if (strcmp(info, "SUSPENDED") == 0)
                            gw_am_trigger(&(gw_em.am), "GW_EM_STATE_SUSPENDED",
                                          (void *) job_id);
                        else if (strcmp(info, "ACTIVE") == 0)
                            gw_am_trigger(&(gw_em.am),"GW_EM_STATE_ACTIVE",
                                          (void *) job_id);
                        else if (strstr(info, "DONE") != NULL)
                        {
                            ptmp = strstr(info,"DONE:");

                            if ((ptmp != NULL) && (strlen(ptmp+5) > 0))/*No-wrapper mode*/
                                job->exit_code = atoi(ptmp+5);

                            gw_am_trigger(&(gw_em.am), "GW_EM_STATE_DONE",
                                          (void *) job_id);
                        }
                        else if (strcmp(info, "FAILED") == 0)
                        {
                            /* Do not retry */
                            job->history->tries= job->template.number_of_retries;

                            gw_am_trigger(&(gw_em.am), "GW_EM_STATE_FAILED",
                                          (void *) job_id);
                        }
                    }
                    else
                    {
                        gw_job_print(job,"EM",'E',"Job recover failed (%s), assuming the job is done.\n", info);
                        gw_log_print("EM",'E',"Recover of job %i failed.\n", *job_id);

                        gw_am_trigger(&(gw_em.am), "GW_EM_STATE_DONE", (void *) job_id);
                    }
                }
void gw_dm_wrapper_failed_cb ( void *_job_id )
{
	gw_job_t * job;
    int        job_id;
    time_t     total;
	
	/* ----------------------------------------------------------- */  
    /* 0.- Get job pointer                                         */
    /* ----------------------------------------------------------- */  
    
	if ( _job_id != NULL )
	{
		job_id = *( (int *) _job_id );

		job = gw_job_pool_get(job_id, GW_TRUE);

		if ( job == NULL )
		{
			gw_log_print("DM",'E',"Job %i does not exist (JOB_STATE_WRAPPER_FAILED_CB).\n",job_id);

			free(_job_id);
			return;
		}
	}
	else
		return;

    /* ----------------------------------------------------------- */  
    /* 1.- Set execution times & state transition                  */
    /* ----------------------------------------------------------- */  
   
   job->em_state = GW_EM_STATE_INIT;
    
    switch (job->job_state)
    {
    	case GW_JOB_STATE_PRE_WRAPPER:

    		/* --------------- Update pre-wrapper stats -------------------- */
    	
		    job->history->stats[PRE_WRAPPER_EXIT_TIME] = time(NULL);
		    total = gw_job_history_get_pre_wrapper_time(job->history);

		    gw_job_print(job,"DM",'E',"Pre-Wrapper failed:\n");
		    gw_job_print(job,"DM",'E',"\tTotal time      : %i\n", total);
    		break;
    		
    	case GW_JOB_STATE_WRAPPER:
    	
    		/* ----------------- Update wrapper stats ---------------------- */
    		
		    job->history->stats[WRAPPER_EXIT_TIME] = time(NULL);		    
		    total = gw_job_history_get_wrapper_time(job->history);

		    gw_job_print(job,"DM",'E',"Wrapper failed:\n");
		    gw_job_print(job,"DM",'E',"\tTotal time      : %i\n", total);		    
		    
    		/* ---------- We do not need to re-schedule this job --------- */
    				    
			if ( job->reschedule == GW_TRUE )
			{
			    job->reschedule = GW_FALSE;
			    gw_dm_mad_job_del(&gw_dm.dm_mad[0],job->id);				
			}
					    		   
    		break;
  	
    	default:
			gw_log_print("DM",'E',"Wrapper failed callback in wrong job (%i) state.\n", job_id);
    		break;    	
    }
    

    /* ----------------------------------------------------------- */  
    /* 1.- State transtition                                       */
    /* ----------------------------------------------------------- */  
    
	/* -------------- Free used slot from this host -------------- */
	
	if (job->history != NULL)
	{
		job->history->reason = GW_REASON_EXECUTION_ERROR;
        gw_host_dec_uslots(job->history->host);
	}		
   	
   	/* ----------------------------------------------------------- */

	gw_am_trigger(&(gw_dm.am), "GW_DM_STATE_EPILOG_FAIL", _job_id);

    pthread_mutex_unlock(&(job->mutex));
}
void gw_dm_wrapper_done_cb ( void *_job_id )
{
	gw_job_t *     job;
    int            job_id;
    time_t         total;
    time_t         active;
    time_t         suspension;

	/* ----------------------------------------------------------- */  
    /* 0.- Get job pointer                                         */
    /* ----------------------------------------------------------- */  
    
	if ( _job_id != NULL )
	{
		job_id = *( (int *) _job_id );

		job = gw_job_pool_get(job_id, GW_TRUE);

		if ( job == NULL )
		{
			gw_log_print("DM",'E',"Job %i does not exist (JOB_STATE_WRAPPER_DONE_CB).\n",job_id);
			
			free(_job_id);
			return;
		}
	}
	else
		return;
    
    /* ----------------------------------------------------------- */  
    /* 1.- Set execution times & state transition                  */
    /* ----------------------------------------------------------- */  
    
    job->em_state = GW_EM_STATE_INIT;
    
    switch (job->job_state)
    {
    	case GW_JOB_STATE_PRE_WRAPPER:
    		
    		/* --------------- Update pre-wrapper stats -------------------- */
    	
		    job->history->stats[PRE_WRAPPER_EXIT_TIME] = time(NULL);
		    
		    total      = gw_job_history_get_pre_wrapper_time(job->history);
		    active     = job->history->stats[ACTIVE_TIME];
    		suspension = job->history->stats[SUSPENSION_TIME];

		    gw_job_print(job,"DM",'I',"Pre-Wrapper DONE:\n");
		    gw_job_print(job,"DM",'I',"\tActive time     : %i\n", active);
		    gw_job_print(job,"DM",'I',"\tSuspension time : %i\n", suspension);
		    gw_job_print(job,"DM",'I',"\tTotal time      : %i\n", total);

    		/* -------------- Transition to Wrapper state ------------------ */
    				    
		    gw_am_trigger(&(gw_dm.am), "GW_DM_STATE_WRAPPER", _job_id);		    
    		break;
    		
    	case GW_JOB_STATE_WRAPPER:
    	
    		/* ----------------- Update wrapper stats ---------------------- */
    		
		    job->history->stats[WRAPPER_EXIT_TIME] = time(NULL);
		    
		    total      = gw_job_history_get_wrapper_time(job->history);
		    active     = job->history->stats[ACTIVE_TIME];
    		suspension = job->history->stats[SUSPENSION_TIME];

		    gw_job_print(job,"DM",'I',"Wrapper DONE:\n");
		    gw_job_print(job,"DM",'I',"\tActive time     : %i\n", active);
		    gw_job_print(job,"DM",'I',"\tSuspension time : %i\n", suspension);
		    gw_job_print(job,"DM",'I',"\tTotal time      : %i\n", total);

    		/* -------------- Free used slot from this host -------------- */

            gw_host_dec_uslots(job->history->host);
            
    		/* ---------- We do not need to re-schedule this job --------- */
    				    
			if ( job->reschedule == GW_TRUE )
			{
			    job->reschedule = GW_FALSE;
			    gw_dm_mad_job_del(&gw_dm.dm_mad[0],job->id);				
			}
			            		                
    		/* -------------- Transition to Epilog state ------------------ */
    				    
			gw_am_trigger(&(gw_dm.am), "GW_DM_STATE_EPILOG_STD", _job_id);			
    		break;
    	
    	case GW_JOB_STATE_STOP_CANCEL:
    	
    		/* ----------------- Update wrapper stats ---------------------- */    	
    		
		    job->history->stats[WRAPPER_EXIT_TIME] = time(NULL);
		    
		    total      = gw_job_history_get_wrapper_time(job->history);
		    active     = job->history->stats[ACTIVE_TIME];
    		suspension = job->history->stats[SUSPENSION_TIME];

		    gw_job_print(job,"DM",'I',"Wrapper CANCELED:\n");
		    gw_job_print(job,"DM",'I',"\tActive time     : %i\n", active);
		    gw_job_print(job,"DM",'I',"\tSuspension time : %i\n", suspension);
		    gw_job_print(job,"DM",'I',"\tTotal time      : %i\n", total);

    		/* -------------- Free used slot from this host -------------- */
    		
    		gw_host_dec_uslots(job->history->host);
			    		
    		/* ------------ Transition to Stop Epilog state --------------- */
    		
			gw_am_trigger(&(gw_dm.am), "GW_DM_STATE_STOP_EPILOG", _job_id);		    
    		break;
    	
    	case GW_JOB_STATE_KILL_CANCEL:

    		/* ----------------- Update wrapper stats ---------------------- */
    		    	    	    	
		    job->history->stats[WRAPPER_EXIT_TIME] = time(NULL);
		    
		    total      = gw_job_history_get_wrapper_time(job->history);
		    active     = job->history->stats[ACTIVE_TIME];
    		suspension = job->history->stats[SUSPENSION_TIME];

		    gw_job_print(job,"DM",'I',"Wrapper CANCELED:\n");
		    gw_job_print(job,"DM",'I',"\tActive time     : %i\n", active);
		    gw_job_print(job,"DM",'I',"\tSuspension time : %i\n", suspension);
		    gw_job_print(job,"DM",'I',"\tTotal time      : %i\n", total);    	

    		/* -------------- Free used slot from this host -------------- */

            gw_host_dec_uslots(job->history->host);
			            
    		/* ------------ Transition to Kill Epilog state ---------------- */
    		
			gw_am_trigger(&(gw_dm.am), "GW_DM_STATE_KILL_EPILOG", _job_id);		    
    		break;
    	
    	case GW_JOB_STATE_MIGR_CANCEL:

    		/* ----------- Update previous wrapper stats ------------------- */
    		    	
            job->history->next->stats[WRAPPER_EXIT_TIME] = time(NULL);
            
            active     = job->history->next->stats[ACTIVE_TIME];
            suspension = job->history->next->stats[SUSPENSION_TIME];

		    gw_job_print(job,"DM",'I',"Wrapper CANCELED:\n");
		    gw_job_print(job,"DM",'I',"\tActive time     : %i\n", active);
		    gw_job_print(job,"DM",'I',"\tSuspension time : %i\n", suspension);

    		/* -------------- Free used slot from previous host ------------ */
    		
            gw_host_dec_uslots(job->history->next->host);
	    			    
    		/* ---------- Transition to Migration Prolog state ------------ */
    		
			gw_am_trigger(&(gw_dm.am), "GW_DM_STATE_MIGR_PROLOG", _job_id);		    
    		break;

    	default:
			gw_log_print("DM",'E',"Wrapper done callback in wrong job (%i) state.\n", job_id);
			
			free(_job_id);    	    	
    		break;    	
    }
    
    pthread_mutex_unlock(&(job->mutex));    
}
void gw_dm_pending( void *_job_id )
{
    gw_job_t *job;
    int job_id;
    gw_boolean_t failed;
    gw_migration_reason_t reason;
    
    /* ----------------------------------------------------------- */  
    /* 0.- Get job pointer                                         */
    /* ----------------------------------------------------------- */  
    
    if ( _job_id != NULL )
    {
        job_id = *( (int *) _job_id );

        job = gw_job_pool_get(job_id, GW_TRUE);

        if ( job == NULL )
        {
            gw_log_print("DM",'E',"Job %i does not exist (JOB_STATE_PENDING).\n",job_id);

            free(_job_id);
            return;
        }
    }
    else
        return;

    /* ----------------------------------------------------------- */  
    /* 1.- Set state                                               */
    /* ----------------------------------------------------------- */  

    gw_job_print(job,"DM",'I',"Rescheduling job.\n");
    gw_log_print("DM",'I',"Rescheduling job %d.\n", job->id);

    gw_job_set_state (job, GW_JOB_STATE_PENDING, GW_FALSE);
    
    /* -------- Update Host & User running jobs -------- */
			            
    gw_user_pool_dec_running_jobs(job->user_id);
    
    gw_host_dec_rjobs(job->history->host);
    
    /* ------------- Restart counter --------------- */
    
    job->restarted++;

    /* ------------- Notify the Scheduler --------------- */
   
    if (job->history != NULL)
    {	
        reason = GW_REASON_NONE;
		    
        failed = (job->history->reason == GW_REASON_EXECUTION_ERROR)
                || (job->history->reason == GW_REASON_PERFORMANCE);
 
        if (failed)
        {
            gw_dm_mad_job_failed(&gw_dm.dm_mad[0], job->history->host->host_id,
	            job->user_id, job->history->reason);
        }
                              	                              
        gw_dm_mad_job_schedule(&gw_dm.dm_mad[0], job_id, job->array_id,
                job->user_id, reason);
    }
    else
        gw_log_print("DM",'E',"Rescheduling job %d, but no history records found.\n", 
                job->id);
                           
    /* ------------------------------------------------- */
				        
    free(_job_id);
		    
    pthread_mutex_unlock(&(job->mutex));
}
Exemple #9
0
void gw_dm_zombie ( void *_job_id )
{
    gw_job_t *   job;
    gw_array_t * array;
    int          job_id;
    int          task_id;
    int          array_id;
    int          rt;
    char         conf_filename[2048];
    time_t       prolog, epilog;

    /* ----------------------------------------------------------- */  
    /* 0.- Get job pointer                                         */
    /* ----------------------------------------------------------- */  
    
    if ( _job_id == NULL )
        return;

    job_id = *( (int *) _job_id );

    job = gw_job_pool_get(job_id, GW_TRUE);

    if ( job == NULL )
    {
        gw_log_print("DM",'E',"Job %i does not exist (JOB_STATE_ZOMBIE).\n",job_id);

        free(_job_id);
        return;
    }

    /* ----------------------------------------------------------- */  
    /* 0.- Update Job state                                        */
    /* ----------------------------------------------------------- */  
    
    switch (job->job_state)
    {
        case GW_JOB_STATE_EPILOG:
        
            gw_job_set_state(job, GW_JOB_STATE_ZOMBIE, GW_FALSE);
       
            gw_log_print("DM",'I',"Job %i done, with exit code %i.\n",job->id, job->exit_code);

            job->history->reason = GW_REASON_NONE;
            job->exit_time       = time(NULL);

            /* ------------- Print job history and send usage ------------ */

            gw_job_print(job,"DM",'I',"Job done, history:\n");            
            gw_job_print_history(job);
            gw_job_send_usage(job);
                                
            if ( job->client_waiting > 0 )
                gw_am_trigger(gw_dm.rm_am,"GW_RM_WAIT_SUCCESS", _job_id);
            else
            {
                if (gw_conf.dispose == GW_TRUE)
                    gw_am_trigger(&(gw_dm.am), "GW_DM_KILL", _job_id);
                else
                    free(_job_id);
            }

            /* -------- Update User & Host running jobs -------- */
            
            gw_user_pool_dec_running_jobs(job->user_id);
            
            gw_host_dec_rjobs(job->history->host);
            
            /* --------       Notify the scheduler      -------- */
                                  
            prolog = gw_job_history_get_prolog_time(job->history);
            epilog = gw_job_history_get_epilog_time(job->history);

            gw_dm_mad_job_success(&gw_dm.dm_mad[0],
                    job->history->host->host_id,
                    job->user_id,
                    (prolog + epilog),
                    job->history->stats[SUSPENSION_TIME],
                    job->history->stats[ACTIVE_TIME]);
                           
            pthread_mutex_unlock(&(job->mutex));

            /* -------- Update other jobs dependencies -------- */
            gw_job_pool_dep_check(job_id);

            break;
                    
        case GW_JOB_STATE_KILL_EPILOG:
            
            gw_job_set_state(job, GW_JOB_STATE_ZOMBIE, GW_FALSE);            

            job->exit_time = time(NULL);

            /* ------------- Print job history and send usage ------------ */
            
            gw_job_print(job,"DM",'I',"Job killed, history:\n");
            gw_job_print_history(job);
            gw_job_send_usage(job);

            /* ---------------- Free job & Notify RM ---------------- */
            
            array_id = job->array_id;
            task_id  = job->task_id;            

            /* -------- Update User & Host running jobs -------- */
           
            gw_user_pool_dec_running_jobs(job->user_id);

            gw_host_dec_rjobs(job->history->host);
            
            sprintf(conf_filename, "%s/job.conf", job->directory);
            unlink(conf_filename);    

            pthread_mutex_unlock(&(job->mutex));

            /* ------------------------------------------------- */            
            
            gw_job_pool_free(job_id);
            
            gw_log_print("DM",'I',"Job %i killed and freed.\n", job_id);        

            if (array_id != -1)
            {
                array = gw_array_pool_get_array(array_id,GW_TRUE);
            
                if ( array != NULL )
                {                        
                    rt = gw_array_del_task(array,task_id);
                    pthread_mutex_unlock(&(array->mutex));
                    if (rt == 0)
                    {
                        gw_array_pool_array_free(array_id);
                        gw_log_print("DM",'I',"Array %i freed\n",array_id);
                    }
                }
                else
                    gw_log_print("DM",'E',"Could not delete task %i from array %i.\n",
                            task_id, array_id);
            }
            
            gw_am_trigger(gw_dm.rm_am,"GW_RM_KILL_SUCCESS", _job_id);
            break;

        default:
            gw_log_print("DM",'E',"Zombie callback in wrong job (%i) state.\n", job_id);

            free(_job_id);
            
            pthread_mutex_unlock(&(job->mutex));
            break;
    }
}
Exemple #10
0
void gw_em_submit(void *_job_id)
{
    int           job_id;
    gw_job_t      *job;
    char          *rsl=NULL;
    char          *contact;
    gw_job_state_t state;
    char          rsl_filename[2048];
    FILE          *fd;
    time_t        now;
	
    /* ----------------------------------------------------------- */  
    /* 0.- Get job pointer, check if it exits and lock mutex       */
    /* ----------------------------------------------------------- */  

    if ( _job_id != NULL )
    {
        job_id = *( (int *) _job_id );	
        job = gw_job_pool_get(job_id, GW_TRUE);
	
        if ( job == NULL )
        {
            gw_log_print("EM",'E',"Job %s no longer exists (PENDING).\n",
                    job_id);
            return;
        }
    }
    else
        return;

    if (job->history == NULL) 
    {
        gw_log_print("EM",'E',"History of job %s doesn't exists\n",
                job_id);
        free(_job_id);
        pthread_mutex_unlock(&(job->mutex));				
        return;
    }

    state = job->job_state;
	
    /* ----------------------------------------------------------- */  
    /* 1.- Get execution MAD for this host                         */
    /* ----------------------------------------------------------- */  

    job->em_state = GW_EM_STATE_INIT;
    job->history->counter = -1;    

    if ( job->job_state == GW_JOB_STATE_PRE_WRAPPER )
    {
        contact = job->history->em_fork_rc;
        rsl     = (char *) job->history->em_mad->pre_wrapper_rsl((void *) job);    
    }
    else
    {
        contact = job->history->em_rc;
        rsl     = (char *) job->history->em_mad->wrapper_rsl((void *) job);    		
    }
	
    if ( rsl == NULL )
    {
        job->em_state = GW_EM_STATE_FAILED;
        
        gw_log_print("EM",'E',"Job %i failed, could not generate RSL.\n", job_id);        
		gw_am_trigger(gw_em.dm_am, "GW_DM_WRAPPER_FAILED", _job_id);
        
        pthread_mutex_unlock(&(job->mutex));
        
        return;
    }
    
    sprintf(rsl_filename, "%s/job.rsl.%i", job->directory,job->restarted);
    
    fd = fopen(rsl_filename,"w");
    if (fd != NULL )
    {
        gw_job_print(job,"EM",'I',"Submitting wrapper to %s, RSL used is in %s.\n",contact,rsl_filename);
    	fprintf(fd,"%s",rsl);
    	fclose(fd);
    }
    else
    {
        job->em_state = GW_EM_STATE_FAILED;
        
        gw_log_print("EM",'E',"Job %i failed, could not open RSL file.\n", job_id);
        gw_job_print(job,"EM",'E',"Job failed, could not open RSL file %s.\n",rsl_filename);
		
        gw_am_trigger(gw_em.dm_am, "GW_DM_WRAPPER_FAILED", _job_id);
        
        pthread_mutex_unlock(&(job->mutex));
        
        return;    	
    }

    /* -------------------------------------------------------------------- */

    now = time(NULL);

    job->next_poll_time = now + gw_conf.poll_interval/2
            + gw_rand(gw_conf.poll_interval);            /* randomize polls */

    gw_job_print(job,"EM",'I',"Job will be polled in %d seconds.\n",
            job->next_poll_time-now);

    job->last_checkpoint_time = 0;
	
    job->history->stats[LAST_SUSPENSION_TIME] = now;
    job->history->stats[SUSPENSION_TIME]      = 0;
    job->history->stats[ACTIVE_TIME]          = 0;

    job->history->tries++;
        
    /* -------------------------------------------------------------------- */
    
    pthread_mutex_unlock(&(job->mutex));
    
    gw_em_mad_submit(job->history->em_mad, job_id, contact, rsl_filename);

    /* -------------------------------------------------------------------- */
    
    free(_job_id);
    
    free(rsl);
}