/*
 * Action handler for a HOLD request.
 *
 * _job_id points to an int with the id of the job to hold; a NULL pointer
 * is silently ignored. Only jobs in the INIT or PENDING state are moved to
 * HOLD. The outcome is reported to the request manager queue through
 * GW_RM_HOLD_SUCCESS or GW_RM_HOLD_FAILED, forwarding _job_id unchanged.
 */
void gw_dm_hold (void *_job_id)
{
    gw_job_t *target;
    int       id;
    int       can_hold;

    /* ---- 0.- Resolve the job ---- */

    if ( _job_id == NULL )
        return;

    id     = *( (int *) _job_id );
    target = gw_job_pool_get(id, GW_TRUE);

    if ( target == NULL )
    {
        gw_log_print("DM",'E',"Job %i does not exist (HOLD).\n",id);
        gw_am_trigger(gw_dm.rm_am,"GW_RM_HOLD_FAILED", _job_id);
        return;
    }

    /* ---- 1.- Hold it when its state allows ---- */

    can_hold = (target->job_state == GW_JOB_STATE_INIT)
            || (target->job_state == GW_JOB_STATE_PENDING);

    if ( can_hold )
    {
        gw_job_set_state(target, GW_JOB_STATE_HOLD, GW_FALSE);

        gw_log_print("DM",'I',"Job %i held.\n", id);

        gw_am_trigger(gw_dm.rm_am,"GW_RM_HOLD_SUCCESS", _job_id);
    }
    else
    {
        gw_log_print("DM",'W',"Job %i can not be held in current state.\n",
                id);

        gw_am_trigger(gw_dm.rm_am,"GW_RM_HOLD_FAILED", _job_id);
    }

    /* The job mutex is released here (presumably taken by
       gw_job_pool_get when called with GW_TRUE). */
    pthread_mutex_unlock(&(target->mutex));
}
Exemple #2
0
/*
 * Action handler that moves a job to the FAILED state.
 *
 * _job_id points to an int with the job id; a NULL pointer is ignored.
 * Ownership of _job_id: it is forwarded to the request manager with
 * GW_RM_WAIT_SUCCESS when a client is waiting on the job, otherwise it is
 * freed here (also when the job cannot be found).
 *
 * The user and host running-job counters are decremented. The host counter
 * is only touched when the job has a history record, since job->history may
 * be NULL (it is checked before setting the failure reason as well).
 */
void gw_dm_failed ( void *_job_id )
{
    gw_job_t * job;
    int        job_id;

    /* ----------------------------------------------------------- */
    /* 0.- Get job pointer                                         */
    /* ----------------------------------------------------------- */

    if ( _job_id == NULL )
        return;

    job_id = *( (int *) _job_id );

    job = gw_job_pool_get(job_id, GW_TRUE);

    if ( job == NULL )
    {
        gw_log_print("DM",'E',"Job %i does not exist (JOB_STATE_FAILED).\n",job_id);

        free(_job_id);
        return;
    }

    /* ----------------------------------------------------------- */
    /* 1.- Set state                                               */
    /* ----------------------------------------------------------- */

    gw_log_print("DM",'I',"Job %i failed.\n",job->id);

    gw_job_set_state(job, GW_JOB_STATE_FAILED, GW_FALSE);
    gw_job_print(job,"DM",'I',"Job failed, history:\n");

    gw_job_print_history(job);

    job->exit_time = time(NULL);

    if (job->history != NULL)
        job->history->reason = GW_REASON_EXECUTION_ERROR;

    /* ----------------------------------------------------------- */
    /* 2.- Notify waiting clients, or release the id               */
    /* ----------------------------------------------------------- */

    if ( job->client_waiting > 0 )
        gw_am_trigger(gw_dm.rm_am,"GW_RM_WAIT_SUCCESS", _job_id);
    else
        free(_job_id);

    /* -------- Update Host & User running jobs -------- */

    gw_user_pool_dec_running_jobs(job->user_id);

    /* A job without a history record has no host to update;
       dereferencing job->history here would crash. */
    if (job->history != NULL)
    {
        pthread_mutex_lock(&(job->history->host->mutex));

        job->history->host->running_jobs--;

        pthread_mutex_unlock(&(job->history->host->mutex));
    }

    pthread_mutex_unlock(&(job->mutex));
}
Exemple #3
0
/*
 * Action handler for a WAIT request on a job.
 *
 * _job_id points to an int with the job id; a NULL pointer is ignored.
 * The job's client_waiting counter is incremented. If the job is already
 * in the ZOMBIE or FAILED state, GW_RM_WAIT_SUCCESS is triggered right away
 * with _job_id; otherwise _job_id is freed here. An unknown job id is
 * reported with GW_RM_WAIT_FAILED.
 */
void gw_dm_wait (void *_job_id)
{
    gw_job_t *waited;
    int       id;
    int       already_finished;

    /* ---- 0.- Resolve the job ---- */

    if ( _job_id == NULL )
        return;

    id     = *( (int *) _job_id );
    waited = gw_job_pool_get(id, GW_TRUE);

    if ( waited == NULL )
    {
        gw_log_print("DM",'E',"Job %i does not exist (WAIT).\n",id);

        gw_am_trigger(gw_dm.rm_am,"GW_RM_WAIT_FAILED", _job_id);
        return;
    }

    /* ---- 1.- Register the waiting client ---- */

    waited->client_waiting++;

    already_finished = (waited->job_state == GW_JOB_STATE_ZOMBIE)
                    || (waited->job_state == GW_JOB_STATE_FAILED);

    if ( already_finished )
        gw_am_trigger(gw_dm.rm_am,"GW_RM_WAIT_SUCCESS",  _job_id);
    else
        free(_job_id);

    pthread_mutex_unlock(&(waited->mutex));
}
Exemple #4
0
/*
 * Action handler for a hard KILL request on a job.
 *
 * _job_id points to an int with the job id; a NULL pointer is ignored and
 * an unknown job is reported with GW_RM_KILL_FAILED.
 *
 * NOTE(review): this definition is cut off after the first statement of the
 * PROLOG case; the rest of the switch and the closing braces are missing
 * here. A complete gw_dm_kill_hard appears later in this file.
 */
void gw_dm_kill_hard (void *_job_id)
{
    gw_job_t *   job;
    int          job_id;
    int          rt;
    int          array_id;
    int          task_id;
    gw_array_t * array;
	char   		 conf_filename[GW_MSG_STRING_LONG];
	    
	/* ----------------------------------------------------------- */  
    /* 0.- Get job pointer                                         */
    /* ----------------------------------------------------------- */  
    
	if ( _job_id != NULL )
	{
		job_id = *( (int *) _job_id );

		job = gw_job_pool_get(job_id, GW_TRUE);

		if ( job == NULL )
		{
			gw_log_print("DM",'E',"Job %i does not exist (KILL_HARD).\n",job_id);

            gw_am_trigger(gw_dm.rm_am,"GW_RM_KILL_FAILED",_job_id);
			return;
		}
	}
	else
		return;
   
	/* ----------------------------------------------------------- */  
    /* 1.- Hard Kill the job                                       */
    /* ----------------------------------------------------------- */  
	
    switch (job->job_state)
    {
    	
		case GW_JOB_STATE_MIGR_PROLOG:
		case GW_JOB_STATE_MIGR_EPILOG:				
                        
            /* Uses job->history->next as the second host involved in the
               migration: its running-job count is decremented and its
               exit time recorded. */
            gw_host_dec_rjobs(job->history->next->host);
                        			
			job->history->next->stats[EXIT_TIME] = time(NULL);
						    	
    	/* No break above: the migration cases continue into PROLOG. */
    	case GW_JOB_STATE_PROLOG:
            
            gw_host_dec_uslots(job->history->host, job->template.np);
/*
 * Action handler that moves a job to the STOPPED state.
 *
 * _job_id points to an int with the job id; a NULL pointer is ignored and
 * an unknown job causes _job_id to be freed. On success the user and host
 * running-job counters are decremented and GW_RM_STOP_SUCCESS is triggered
 * on the request manager queue with _job_id.
 */
void gw_dm_stopped ( void *_job_id )
{
    gw_job_t *stopped_job;
    int       id;

    /* ---- 0.- Resolve the job ---- */

    if ( _job_id == NULL )
        return;

    id          = *( (int *) _job_id );
    stopped_job = gw_job_pool_get(id, GW_TRUE);

    if ( stopped_job == NULL )
    {
        gw_log_print("DM",'E',"Job %i does not exist (JOB_STATE_STOPPED).\n",id);

        free(_job_id);
        return;
    }

    /* ---- 1.- Update the job state ---- */

    gw_job_set_state(stopped_job, GW_JOB_STATE_STOPPED, GW_FALSE);

    /* ---- 2.- The job no longer counts as running for user and host ---- */

    gw_user_pool_dec_running_jobs(stopped_job->user_id);
    gw_host_dec_rjobs(stopped_job->history->host);

    /* ---- 3.- Notify the request manager ---- */

    gw_am_trigger(gw_dm.rm_am,"GW_RM_STOP_SUCCESS", _job_id);

    pthread_mutex_unlock(&(stopped_job->mutex));
}
Exemple #6
0
/*
 * Action handler that starts the cancel phase of a job migration.
 *
 * _job_id points to an int with the job id; a NULL pointer is ignored and
 * an unknown job causes _job_id to be freed. The migration proceeds only
 * when the job is in the WRAPPER state and its execution state is not DONE:
 * the migration start time is recorded, the job moves to MIGR_CANCEL and
 * GW_EM_CANCEL is triggered on the execution manager queue with _job_id.
 * In any other situation a warning is logged and _job_id is freed.
 */
void gw_dm_migr_cancel ( void *_job_id )
{
    gw_job_t *migrating;
    int       id;

    /* ---- 0.- Resolve the job ---- */

    if ( _job_id == NULL )
        return;

    id        = *( (int *) _job_id );
    migrating = gw_job_pool_get(id, GW_TRUE);

    if ( migrating == NULL )
    {
        gw_log_print("DM",'E',"Job %i does not exist (JOB_STATE_MGR_CANCEL).\n",id);

        free(_job_id);
        return;
    }

    /* ---- 1.- Check the job still needs to be migrated ---- */

    if ( (migrating->job_state != GW_JOB_STATE_WRAPPER)
            || (migrating->em_state == GW_EM_STATE_DONE) )
    {
        gw_log_print("DM",'W',"Can't migrate %i to in current state.\n",migrating->id);
        free(_job_id);
    }
    else
    {
        migrating->history->stats[MIGRATION_START_TIME] = time(NULL);
        gw_job_set_state(migrating, GW_JOB_STATE_MIGR_CANCEL, GW_FALSE);

        gw_am_trigger(gw_dm.em_am, "GW_EM_CANCEL", _job_id);
    }

    pthread_mutex_unlock(&(migrating->mutex));
}
/*
 * Action handler that moves a job to the WRAPPER state and asks the
 * execution manager to submit it.
 *
 * _job_id points to an int with the job id; a NULL pointer is ignored and
 * an unknown job causes _job_id to be freed. Otherwise the wrapper start
 * time is recorded in the current history record, the execution state is
 * reset to INIT and GW_EM_SUBMIT is triggered with _job_id.
 */
void gw_dm_wrapper ( void *_job_id )
{
    gw_job_t *wrapped;
    int       id;

    /* ---- 0.- Resolve the job ---- */

    if ( _job_id == NULL )
        return;

    id      = *( (int *) _job_id );
    wrapped = gw_job_pool_get(id, GW_TRUE);

    if ( wrapped == NULL )
    {
        gw_log_print("DM",'E',"Job %i does not exist (JOB_STATE_WRAPPER).\n",id);

        free(_job_id);
        return;
    }

    /* ---- 1.- Record the start time and switch state ---- */

    wrapped->history->stats[WRAPPER_START_TIME] = time(NULL);

    gw_job_set_state(wrapped, GW_JOB_STATE_WRAPPER, GW_FALSE);

    /* ---- 2.- Hand the job over to the execution manager ---- */

    wrapped->em_state = GW_EM_STATE_INIT;

    gw_am_trigger(gw_dm.em_am, "GW_EM_SUBMIT", _job_id);

    pthread_mutex_unlock(&(wrapped->mutex));
}
/*
 * Callback invoked when the (pre-)wrapper execution of a job failed.
 *
 * _job_id points to an int with the job id; a NULL pointer is ignored and
 * an unknown job causes _job_id to be freed. The execution state is reset
 * to INIT, the exit time of the failed stage is recorded and logged, a
 * pending re-schedule request is withdrawn (WRAPPER state only), the used
 * slot of the host is released and GW_DM_STATE_EPILOG_FAIL is triggered on
 * the dispatch manager queue with _job_id.
 *
 * The elapsed time is a time_t but is logged with a "%i" conversion, so it
 * is converted to int explicitly: passing a wider time_t for "%i" is a type
 * mismatch (assuming gw_job_print follows printf conventions).
 */
void gw_dm_wrapper_failed_cb ( void *_job_id )
{
    gw_job_t * job;
    int        job_id;
    time_t     total;

    /* ----------------------------------------------------------- */
    /* 0.- Get job pointer                                         */
    /* ----------------------------------------------------------- */

    if ( _job_id == NULL )
        return;

    job_id = *( (int *) _job_id );

    job = gw_job_pool_get(job_id, GW_TRUE);

    if ( job == NULL )
    {
        gw_log_print("DM",'E',"Job %i does not exist (JOB_STATE_WRAPPER_FAILED_CB).\n",job_id);

        free(_job_id);
        return;
    }

    /* ----------------------------------------------------------- */
    /* 1.- Set execution times                                     */
    /* ----------------------------------------------------------- */

    job->em_state = GW_EM_STATE_INIT;

    switch (job->job_state)
    {
        case GW_JOB_STATE_PRE_WRAPPER:

            /* --------------- Update pre-wrapper stats -------------------- */

            job->history->stats[PRE_WRAPPER_EXIT_TIME] = time(NULL);
            total = gw_job_history_get_pre_wrapper_time(job->history);

            gw_job_print(job,"DM",'E',"Pre-Wrapper failed:\n");
            gw_job_print(job,"DM",'E',"\tTotal time      : %i\n", (int) total);
            break;

        case GW_JOB_STATE_WRAPPER:

            /* ----------------- Update wrapper stats ---------------------- */

            job->history->stats[WRAPPER_EXIT_TIME] = time(NULL);
            total = gw_job_history_get_wrapper_time(job->history);

            gw_job_print(job,"DM",'E',"Wrapper failed:\n");
            gw_job_print(job,"DM",'E',"\tTotal time      : %i\n", (int) total);

            /* ---------- We do not need to re-schedule this job --------- */

            if ( job->reschedule == GW_TRUE )
            {
                job->reschedule = GW_FALSE;
                gw_dm_mad_job_del(&gw_dm.dm_mad[0],job->id);
            }

            break;

        default:
            /* Unexpected state: only logged, the failure transition below
               is still triggered. */
            gw_log_print("DM",'E',"Wrapper failed callback in wrong job (%i) state.\n", job_id);
            break;
    }

    /* ----------------------------------------------------------- */
    /* 2.- State transition                                        */
    /* ----------------------------------------------------------- */

    /* -------------- Free used slot from this host -------------- */

    if (job->history != NULL)
    {
        job->history->reason = GW_REASON_EXECUTION_ERROR;
        gw_host_dec_uslots(job->history->host);
    }

    /* ----------------------------------------------------------- */

    gw_am_trigger(&(gw_dm.am), "GW_DM_STATE_EPILOG_FAIL", _job_id);

    pthread_mutex_unlock(&(job->mutex));
}
Exemple #9
0
/*
 * Action handler for a (soft) KILL request on a job.
 *
 * _job_id points to an int with the job id; a NULL pointer is ignored and
 * an unknown job is reported with GW_RM_KILL_FAILED.
 *
 *  - INIT / PENDING / HOLD / STOPPED: the exit time is recorded, then the
 *    job is handled like a FAILED / ZOMBIE one.
 *  - FAILED / ZOMBIE: the job.conf file in the job directory is removed,
 *    the job is freed from the pool, its task is removed from the owning
 *    array (which is freed when gw_array_del_task returns 0) and
 *    GW_RM_KILL_SUCCESS is triggered.
 *  - WRAPPER: the job moves to KILL_CANCEL and GW_EM_CANCEL is triggered on
 *    the execution manager queue.
 *  - any other state: GW_RM_KILL_FAILED is triggered.
 *
 * The job.conf path is built with snprintf so that an over-long job
 * directory cannot overflow conf_filename; a truncated path is never
 * passed to unlink.
 */
void gw_dm_kill (void *_job_id)
{
    gw_job_t *   job;
    int          job_id;
    int          rt;
    int          array_id;
    int          task_id;
    int          path_len;
    gw_array_t * array;
    char         conf_filename[2048];

    /* ----------------------------------------------------------- */
    /* 0.- Get job pointer                                         */
    /* ----------------------------------------------------------- */

    if ( _job_id == NULL )
        return;

    job_id = *( (int *) _job_id );

    job = gw_job_pool_get(job_id, GW_TRUE);

    if ( job == NULL )
    {
        gw_log_print("DM",'E',"Job %i does not exist (KILL).\n",job_id);

        gw_am_trigger(gw_dm.rm_am,"GW_RM_KILL_FAILED",  _job_id);
        return;
    }

    /* ----------------------------------------------------------- */
    /* 1.- Kill the job                                            */
    /* ----------------------------------------------------------- */

    switch (job->job_state)
    {
        case GW_JOB_STATE_INIT:
        case GW_JOB_STATE_PENDING:
        case GW_JOB_STATE_HOLD:
        case GW_JOB_STATE_STOPPED:

            job->exit_time = time(NULL);

            /* fallthrough: the job is freed exactly like a finished one */

        case GW_JOB_STATE_FAILED:
        case GW_JOB_STATE_ZOMBIE:

            /* Saved before the job is freed; needed for array cleanup. */
            array_id = job->array_id;
            task_id  = job->task_id;

            path_len = snprintf(conf_filename, sizeof(conf_filename),
                                "%s/job.conf", job->directory);

            if ( (path_len >= 0) && ((size_t) path_len < sizeof(conf_filename)) )
                unlink(conf_filename);
            else
                gw_log_print("DM",'W',"Path of job.conf for job %i is too long, file not removed.\n", job_id);

            pthread_mutex_unlock(&(job->mutex));
            gw_job_pool_free(job_id);

            gw_log_print("DM",'I',"Job %i killed and freed.\n", job_id);

            if (array_id != -1)
            {
                array = gw_array_pool_get_array(array_id, GW_TRUE);

                if ( array != NULL )
                {
                    rt = gw_array_del_task(array,task_id);
                    pthread_mutex_unlock(&(array->mutex));

                    if (rt == 0)
                    {
                        gw_array_pool_array_free(array_id);
                        gw_log_print("DM",'I',"Array %i freed.\n",array_id);
                    }
                }
                else
                    gw_log_print("DM",'E',"Array %i does not exisit (KILL - task %i).\n",
                                 array_id, task_id);
            }

            gw_am_trigger(gw_dm.rm_am,"GW_RM_KILL_SUCCESS",  _job_id);
            break;

        case GW_JOB_STATE_WRAPPER:

            if (job->history != NULL )
                job->history->reason = GW_REASON_KILL;

            gw_log_print("DM",'I',"Killing job %i.\n", job_id);

            gw_job_set_state(job, GW_JOB_STATE_KILL_CANCEL, GW_FALSE);

            gw_am_trigger(gw_dm.em_am, "GW_EM_CANCEL", _job_id);

            pthread_mutex_unlock(&(job->mutex));
            break;

        default:

            gw_log_print("DM",'W',"Job %i can not be killed in current state.\n", job_id);

            gw_am_trigger(gw_dm.rm_am,"GW_RM_KILL_FAILED",  _job_id);

            pthread_mutex_unlock(&(job->mutex));
            break;
    }
}
Exemple #10
0
void gw_dm_kill_hard (void *_job_id)
{
    gw_job_t *   job;
    int          job_id;
    int          rt;
    int          array_id;
    int          task_id;
    gw_array_t * array;
	char   		 conf_filename[2048];
	    
	/* ----------------------------------------------------------- */  
    /* 0.- Get job pointer                                         */
    /* ----------------------------------------------------------- */  
    
	if ( _job_id != NULL )
	{
		job_id = *( (int *) _job_id );

		job = gw_job_pool_get(job_id, GW_TRUE);

		if ( job == NULL )
		{
			gw_log_print("DM",'E',"Job %i does not exist (KILL_HARD).\n",job_id);

            gw_am_trigger(gw_dm.rm_am,"GW_RM_KILL_FAILED",_job_id);
			return;
		}
	}
	else
		return;
   
	/* ----------------------------------------------------------- */  
    /* 1.- Hard Kill the job                                       */
    /* ----------------------------------------------------------- */  
	
    switch (job->job_state)
    {
    	
		case GW_JOB_STATE_MIGR_PROLOG:
		case GW_JOB_STATE_MIGR_EPILOG:		

			pthread_mutex_lock(&(job->history->next->host->mutex));		
			job->history->next->host->running_jobs--;    		
			pthread_mutex_unlock(&(job->history->next->host->mutex));
			
			job->history->next->stats[EXIT_TIME] = time(NULL);
						    	
    	case GW_JOB_STATE_PROLOG:
			
			pthread_mutex_lock(&(job->history->host->mutex));    	
    		job->history->host->used_slots--;    		    			
			pthread_mutex_unlock(&(job->history->host->mutex));
			      
		case GW_JOB_STATE_EPILOG:
		case GW_JOB_STATE_EPILOG_STD:
		case GW_JOB_STATE_EPILOG_RESTART:
		case GW_JOB_STATE_EPILOG_FAIL:
		
			job->history->reason = GW_REASON_KILL;
			
		case GW_JOB_STATE_STOP_EPILOG:
		case GW_JOB_STATE_KILL_EPILOG:
			
			pthread_mutex_lock(&(job->history->host->mutex));    	
			job->history->host->running_jobs--;									
			pthread_mutex_unlock(&(job->history->host->mutex));
						
			job->exit_time = time(NULL);
			job->history->stats[EXIT_TIME] = time(NULL);
						
	    	job->tm_state = GW_TM_STATE_HARD_KILL;

			if (job->history != NULL) 
			{
            	gw_log_print("DM",'I',"Cancelling prolog/epilog transfers of job %i.\n", job_id);
            	
				gw_tm_mad_end(job->history->tm_mad, job->id);
			}				    	
	    break;
    	
		case GW_JOB_STATE_PRE_WRAPPER:
		case GW_JOB_STATE_WRAPPER:

			job->history->reason = GW_REASON_KILL;
			
			pthread_mutex_lock(&(job->history->host->mutex));
			job->history->host->used_slots--;
			job->history->host->running_jobs--;
			pthread_mutex_unlock(&(job->history->host->mutex));
						
			job->exit_time = time(NULL);		
			job->history->stats[EXIT_TIME] = time(NULL);
						
			job->em_state = GW_EM_STATE_HARD_KILL;
		
			if (job->history != NULL) 
			{
            	gw_log_print("DM",'I',"Cancelling execution of job %i.\n", job_id);
            	
				gw_em_mad_cancel(job->history->em_mad, job_id);
			}			
		break;

		case GW_JOB_STATE_MIGR_CANCEL:
		
			pthread_mutex_lock(&(job->history->next->host->mutex));		
			job->history->next->host->used_slots--;
			job->history->next->host->running_jobs--;
			pthread_mutex_unlock(&(job->history->next->host->mutex));		

			job->history->next->stats[EXIT_TIME] = time(NULL);
			
			job->history->reason = GW_REASON_KILL;
			
   		case GW_JOB_STATE_STOP_CANCEL:
		case GW_JOB_STATE_KILL_CANCEL:
						
			pthread_mutex_lock(&(job->history->host->mutex));
			job->history->host->used_slots--;
			job->history->host->running_jobs--;
			pthread_mutex_unlock(&(job->history->host->mutex));
		
			job->exit_time = time(NULL);		
			job->history->stats[EXIT_TIME] = time(NULL);
						
			job->em_state = GW_EM_STATE_HARD_KILL;
		break;
		
		case GW_JOB_STATE_INIT:
		case GW_JOB_STATE_PENDING:
		case GW_JOB_STATE_HOLD:
		case GW_JOB_STATE_STOPPED:
		
	        job->exit_time = time(NULL);
	     break;
            
		
		case GW_JOB_STATE_FAILED:
		case GW_JOB_STATE_ZOMBIE:
		
		break;
			
        default:
            
            gw_log_print("DM",'W',"Job %i can not be killed in current state.\n", job_id);
            
            gw_am_trigger(gw_dm.rm_am,"GW_RM_KILL_FAILED",  _job_id);
            
            pthread_mutex_unlock(&(job->mutex));            
            
        return;
    }
    
	array_id = job->array_id;
	task_id  = job->task_id;

	sprintf(conf_filename, "%s/job.conf", job->directory);	
	unlink(conf_filename);
							
	pthread_mutex_unlock(&(job->mutex));
    gw_job_pool_free(job_id);
            
    gw_log_print("DM",'I',"Job %i killed (hard) and freed.\n", job_id);		

    if (array_id != -1)
    {
    	array = gw_array_pool_get_array(array_id, GW_TRUE);
            
        if ( array != NULL )
        {                        
        	rt = gw_array_del_task(array,task_id);
            pthread_mutex_unlock(&(array->mutex));
                
            if (rt == 0)
            {
                gw_array_pool_array_free(array_id);
                gw_log_print("DM",'I',"Array %i freed.\n",array_id);
            }
         }
         else
             gw_log_print("DM",'E',"Array %i does not exisit (KILL - task %i).\n",
                          array_id, task_id);
     }
            
     gw_am_trigger(gw_dm.rm_am,"GW_RM_KILL_SUCCESS", _job_id);
}
void gw_dm_jalloc(void *_msg)
{
    gw_msg_submit_t * msg_submit;
    gw_job_t *   job;
    int          jid;
    int          uid;
    int          rc;
    
    gw_boolean_t   useradd;    
    gw_job_state_t init_state;

    msg_submit  = (gw_msg_submit_t *) _msg;

    /* ------------- Check if user is already registered ------------ */
	    
    useradd = gw_user_pool_exists (msg_submit->msg.owner, msg_submit->msg.proxy_path, &uid) == GW_FALSE;
	
    if (useradd)
    {
    	rc = gw_user_pool_user_allocate (msg_submit->msg.owner, msg_submit->msg.proxy_path, &uid);
 
#ifdef GWDMDEBUG
        gw_log_print("DM",'D',"User %s registered with UID %i.\n", msg_submit->msg.owner, uid);
#endif

    	if ( rc != 0 )
    	{
            gw_log_print("DM",'E',"Could not register user %s.\n", msg_submit->msg.owner);
        
            msg_submit->msg.rc = GW_RC_FAILED_USER;        
            gw_am_trigger(gw_dm.rm_am,"GW_RM_SUBMIT",&(msg_submit->msg));        
            return;
    	}
    }

    /* ------------- Allocate job structure ------------ */

    jid = gw_job_pool_allocate();
    
    if ( jid == -1 )
    {
        gw_log_print("DM",'E',"Could not allocate job.\n");
        
        msg_submit->msg.rc = GW_RC_FAILED_NO_MEMORY;
        gw_am_trigger(gw_dm.rm_am,"GW_RM_SUBMIT",&(msg_submit->msg));        
        return;
    }
    
    /* ------------------------------------------ */
    /*               Update job data              */
    /* ------------------------------------------ */
		
    job = gw_job_pool_get(jid, GW_TRUE);
    
    if ( job == NULL )
    {
       	msg_submit->msg.rc = GW_RC_FAILED_BAD_JOB_ID;
        gw_am_trigger(gw_dm.rm_am,"GW_RM_SUBMIT",&(msg_submit->msg));        
    	
      	return;
    }  
    
    /* ------ Fill data using the template ------- */
    
    rc = gw_job_fill(job, msg_submit);

    if ( rc == -1 )
    {
        gw_log_print("DM",'E',"Could not initialize job.\n");

        pthread_mutex_unlock(&(job->mutex));
        gw_job_pool_free(jid);
        
        msg_submit->msg.rc = GW_RC_FAILED;
        gw_am_trigger(gw_dm.rm_am,"GW_RM_SUBMIT",&(msg_submit->msg));
        return;
    }
    
    /* --------- Set the initial state ---------- */
    
    init_state = msg_submit->msg.init_state;
 		 
    if ( init_state == GW_JOB_STATE_PENDING )
        gw_job_set_state(job, GW_JOB_STATE_PENDING, GW_FALSE);
    else
        gw_job_set_state(job, GW_JOB_STATE_HOLD, GW_FALSE);
	    
    job->tm_state = GW_TM_STATE_INIT;
    job->em_state = GW_EM_STATE_INIT;
    job->user_id  = uid;
    
    /* --------- Set the initial priority ---------- */
	
    job->fixed_priority = msg_submit->msg.fixed_priority;
	
    pthread_mutex_unlock(&(job->mutex));

    /* ------------- Set job dependencies ------------ */ 	   
    
    if ( msg_submit->jt.job_deps[0] != -1 )
    	gw_job_pool_dep_set(jid, msg_submit->jt.job_deps); 	   

    if (!useradd)
        gw_user_pool_inc_jobs(uid,1);
	
    /* ------------- Callback msg ------------ */
	
    msg_submit->msg.rc       = GW_RC_SUCCESS;
    msg_submit->msg.array_id = -1;    
    msg_submit->msg.job_id   = jid;
    
    gw_am_trigger(gw_dm.rm_am,"GW_RM_SUBMIT",&(msg_submit->msg));
    
    /* ------------- Notify the scheduler ------------ */
    
    if ( init_state == GW_JOB_STATE_PENDING )
        gw_dm_mad_job_schedule(&gw_dm.dm_mad[0], jid, -1, uid,
                GW_REASON_NONE);

    gw_log_print("DM",'I',"New job %i allocated and initialized.\n", jid);
}
/*
 * Action handler for a user RE-SCHEDULE request on a job.
 *
 * _job_id points to an int with the job id; a NULL pointer is ignored and
 * an unknown job is reported with GW_RM_RESCHEDULE_FAILED.
 *
 *  - WRAPPER: the job is flagged for re-scheduling and the scheduler is
 *    notified with reason GW_REASON_USER_REQUESTED.
 *  - FAILED: the job goes back to PENDING with its transfer and execution
 *    states reset, its restart counter is incremented and the scheduler is
 *    notified with reason GW_REASON_NONE.
 *  - any other state: GW_RM_RESCHEDULE_FAILED is triggered.
 *
 * The reason is stored in the current history record only when the job has
 * one: other handlers in this file treat job->history as possibly NULL in
 * these states, so it is not dereferenced blindly here either.
 */
void gw_dm_reschedule (void *_job_id)
{
    gw_job_t *   job;
    int          job_id;

    /* ----------------------------------------------------------- */
    /* 0.- Get job pointer                                         */
    /* ----------------------------------------------------------- */

    if ( _job_id == NULL )
        return;

    job_id = *( (int *) _job_id );

    job = gw_job_pool_get(job_id, GW_TRUE);

    if ( job == NULL )
    {
        gw_log_print("DM",'E',"Job %i does not exist (RE-SCHEDULE).\n",job_id);

        gw_am_trigger(gw_dm.rm_am,"GW_RM_RESCHEDULE_FAILED", _job_id);
        return;
    }

    /* ----------------------------------------------------------- */
    /* 1.- re-schedule job                                         */
    /* ----------------------------------------------------------- */

    switch (job->job_state)
    {
        case GW_JOB_STATE_WRAPPER:

            job->reschedule = GW_TRUE;

            if (job->history != NULL)
                job->history->reason = GW_REASON_USER_REQUESTED;

            gw_log_print("DM",'I',"Job %i will be re-scheduled.\n", job_id);

            gw_am_trigger(gw_dm.rm_am,"GW_RM_RESCHEDULE_SUCCESS",_job_id);

            gw_dm_mad_job_schedule(&gw_dm.dm_mad[0],
                                   job_id,
                                   job->array_id,
                                   job->user_id,
                                   GW_REASON_USER_REQUESTED);
            break;

        case GW_JOB_STATE_FAILED:

            gw_job_set_state(job, GW_JOB_STATE_PENDING, GW_FALSE);

            job->tm_state = GW_TM_STATE_INIT;
            job->em_state = GW_EM_STATE_INIT;

            if (job->history != NULL)
                job->history->reason = GW_REASON_USER_REQUESTED;

            job->restarted++;

            gw_log_print("DM",'I',"Job %i will be re-scheduled.\n", job_id);

            gw_am_trigger(gw_dm.rm_am,"GW_RM_RESCHEDULE_SUCCESS",  _job_id);

            gw_dm_mad_job_schedule(&gw_dm.dm_mad[0],
                                   job_id,
                                   job->array_id,
                                   job->user_id,
                                   GW_REASON_NONE);
            break;

        default:

            gw_log_print("DM",'I',"Job %i can not be re-scheduled in current state.\n", job_id);

            gw_am_trigger(gw_dm.rm_am,"GW_RM_RESCHEDULE_FAILED", _job_id);
            break;
    }

    pthread_mutex_unlock(&(job->mutex));
}
Exemple #13
0
/*
 * Callback invoked when the (pre-)wrapper execution of a job finished.
 *
 * _job_id points to an int with the job id; a NULL pointer is ignored and
 * an unknown job causes _job_id to be freed. The execution state is reset
 * to INIT and the exit time and timing statistics of the finished stage
 * are recorded and logged.
 *
 * NOTE(review): this definition is cut off inside the WRAPPER case; the
 * rest of the switch and the closing braces are missing here.
 */
void gw_dm_wrapper_done_cb ( void *_job_id )
{
	gw_job_t *     job;
    int            job_id;
    time_t         total;
    time_t         active;
    time_t         suspension;

	/* ----------------------------------------------------------- */  
    /* 0.- Get job pointer                                         */
    /* ----------------------------------------------------------- */  
    
	if ( _job_id != NULL )
	{
		job_id = *( (int *) _job_id );

		job = gw_job_pool_get(job_id, GW_TRUE);

		if ( job == NULL )
		{
			gw_log_print("DM",'E',"Job %i does not exist (JOB_STATE_WRAPPER_DONE_CB).\n",job_id);
			
			free(_job_id);
			return;
		}
	}
	else
		return;
    
    /* ----------------------------------------------------------- */  
    /* 1.- Set execution times & state transition                  */
    /* ----------------------------------------------------------- */  
    
    job->em_state = GW_EM_STATE_INIT;
    
    switch (job->job_state)
    {
    	case GW_JOB_STATE_PRE_WRAPPER:
    		
    		/* --------------- Update pre-wrapper stats -------------------- */
    	
		    job->history->stats[PRE_WRAPPER_EXIT_TIME] = time(NULL);
		    
		    total      = gw_job_history_get_pre_wrapper_time(job->history);
		    active     = job->history->stats[ACTIVE_TIME];
    		suspension = job->history->stats[SUSPENSION_TIME];

		    /* NOTE(review): the time_t values are printed with "%i";
		       confirm that gw_job_print handles this where time_t is
		       wider than int. */
		    gw_job_print(job,"DM",'I',"Pre-Wrapper DONE:\n");
		    gw_job_print(job,"DM",'I',"\tActive time     : %i\n", active);
		    gw_job_print(job,"DM",'I',"\tSuspension time : %i\n", suspension);
		    gw_job_print(job,"DM",'I',"\tTotal time      : %i\n", total);

    		/* -------------- Transition to Wrapper state ------------------ */
    				    
		    gw_am_trigger(&(gw_dm.am), "GW_DM_STATE_WRAPPER", _job_id);		    
    		break;
    		
    	case GW_JOB_STATE_WRAPPER:
    	
    		/* ----------------- Update wrapper stats ---------------------- */
    		
		    job->history->stats[WRAPPER_EXIT_TIME] = time(NULL);
		    
		    total      = gw_job_history_get_wrapper_time(job->history);
		    active     = job->history->stats[ACTIVE_TIME];
    		suspension = job->history->stats[SUSPENSION_TIME];

		    gw_job_print(job,"DM",'I',"Wrapper DONE:\n");
		    gw_job_print(job,"DM",'I',"\tActive time     : %i\n", active);
		    gw_job_print(job,"DM",'I',"\tSuspension time : %i\n", suspension);
		    gw_job_print(job,"DM",'I',"\tTotal time      : %i\n", total);

    		/* -------------- Free used slot from this host -------------- */

            gw_host_dec_uslots(job->history->host, job->template.np);
/*
 * Action handler for a STOP request on a job.
 *
 * _job_id points to an int with the job id; a NULL pointer is ignored and
 * an unknown job is reported with GW_RM_STOP_FAILED.
 *
 * Only a job in the WRAPPER state can be stopped: it moves to STOP_CANCEL,
 * a pending re-schedule request is withdrawn from the scheduler and
 * GW_EM_CANCEL is triggered on the execution manager queue. A job that is
 * already STOPPED, or in any other state, yields GW_RM_STOP_FAILED.
 */
void gw_dm_stop (void *_job_id)
{
    gw_job_t *target;
    int       id;

    /* ---- 0.- Resolve the job ---- */

    if ( _job_id == NULL )
        return;

    id     = *( (int *) _job_id );
    target = gw_job_pool_get(id, GW_TRUE);

    if ( target == NULL )
    {
        gw_log_print("DM",'E',"Job %i does not exist (STOP).\n",id);

        gw_am_trigger(gw_dm.rm_am,"GW_RM_STOP_FAILED",  _job_id);
        return;
    }

    /* ---- 1.- Stop the job according to its state ---- */

    if ( target->job_state == GW_JOB_STATE_STOPPED )
    {
        gw_log_print("DM",'W',"Job %i already stopped.\n", id);

        gw_am_trigger(gw_dm.rm_am,"GW_RM_STOP_FAILED",  _job_id);
    }
    else if ( target->job_state == GW_JOB_STATE_WRAPPER )
    {
        if ( target->history != NULL )
            target->history->reason = GW_REASON_STOP_RESUME;

        gw_log_print("DM",'I',"Stopping job %i.\n", id);

        gw_job_set_state(target, GW_JOB_STATE_STOP_CANCEL, GW_FALSE);

        /* A stopped job must not stay queued for re-scheduling. */
        if ( target->reschedule == GW_TRUE )
        {
            target->reschedule = GW_FALSE;
            gw_dm_mad_job_del(&gw_dm.dm_mad[0],target->id);
        }

        gw_am_trigger(gw_dm.em_am, "GW_EM_CANCEL", _job_id);
    }
    else
    {
        gw_log_print("DM",'W',"Job %i can not be stopped in current state.\n", id);

        gw_am_trigger(gw_dm.rm_am,"GW_RM_STOP_FAILED",  _job_id);
    }

    pthread_mutex_unlock(&(target->mutex));
}
Exemple #15
0
/*
 * gw_em_timer - periodic Execution Manager tick.
 *
 * Logs a "-- MARK --" line each time 300 seconds of GW_EM_TIMER_PERIOD have
 * accumulated, then walks the whole job pool. For every job that has a
 * history record and is in an execution-related state it either:
 *   - polls the execution MAD once next_poll_time has been reached (jobs
 *     for which issubmitted(em_state) holds), or
 *   - counts down history->counter for jobs whose execution state is FAILED
 *     and triggers GW_EM_SUBMIT when the countdown reaches zero.
 *
 * The job mutex is released on every path before moving to the next job.
 */
void gw_em_timer()
{
    int i;
    gw_job_t *job;
    time_t now;
    static int mark = 0;
    int *_job_id;
    gw_em_mad_t *mad;

    mark = mark + GW_EM_TIMER_PERIOD;
    if ( mark >= 300 )
    {
        gw_log_print("EM",'I',"-- MARK --\n");
        mark = 0;
    }

    now = time(NULL);

    for (i= 0; i< gw_conf.number_of_jobs ; i++)
    {
        job = gw_job_pool_get(i, GW_TRUE);

        if ( job == NULL )
            continue;

        /* Jobs without a history record have nothing to poll or resubmit */
        if ( job->history == NULL )
        {
            pthread_mutex_unlock(&(job->mutex));
            continue;
        }

        if ( (job->job_state == GW_JOB_STATE_PRE_WRAPPER)
                || (job->job_state == GW_JOB_STATE_WRAPPER)
                || (job->job_state == GW_JOB_STATE_MIGR_CANCEL)
                || (job->job_state == GW_JOB_STATE_STOP_CANCEL)
                || (job->job_state == GW_JOB_STATE_KILL_CANCEL))
        {
            if (issubmitted(job->em_state))
            {
                if ( now >= job->next_poll_time )
                {
                    gw_log_print("EM",'I',"Checking execution state of job %i.\n", i);

                    mad = job->history->em_mad;

                    /* Warning! When in Migration Cancel, the previous MAD should be used */
                    if (job->job_state == GW_JOB_STATE_MIGR_CANCEL)
                    {
                        if (job->history->next == NULL)
                        {
                            gw_log_print("EM",'E',"Previous history record of job %i no longer exists\n", i);
                            pthread_mutex_unlock(&(job->mutex));
                            continue;
                        }
                        else
                            mad = job->history->next->em_mad;
                    }

                    gw_em_mad_poll(mad, i);

                    job->next_poll_time += gw_conf.poll_interval; /* Wait for next poll */
                }
            }
            else if ((job->em_state == GW_EM_STATE_FAILED)
                         && (job->history->counter != -1))
            {
                job->history->counter--;

                if (job->history->counter == 0)
                {
                    /* The id is heap-allocated because it is handed over to
                     * the triggered action. */
                    _job_id = malloc(sizeof *_job_id);

                    if ( _job_id == NULL )
                    {
                        gw_log_print("EM",'E',"Could not allocate memory to resubmit job %i.\n", i);

                        /* Re-arm the countdown so the resubmission is
                         * attempted again on the next timer tick. */
                        job->history->counter = 1;
                    }
                    else
                    {
                        /* -1 disables the countdown (see the test above) */
                        job->history->counter = -1;

                        *(_job_id) = i;

                        gw_am_trigger(&(gw_em.am),"GW_EM_SUBMIT", _job_id);
                    }
                }
            }
        }

        pthread_mutex_unlock(&(job->mutex));
    }
}
void gw_dm_aalloc     (void *_msg)
{
    gw_msg_submit_t  *msg_submit;
    gw_array_t       *array;
    gw_job_t         *job;
    gw_boolean_t     useradd;
    gw_job_state_t   init_state;
    int              fixed;
    int              tasks;
    
    int  rc;
    int  array_id, i, jid, uid;
    
    msg_submit  = (gw_msg_submit_t *) _msg;

    useradd = gw_user_pool_exists (msg_submit->msg.owner, msg_submit->msg.proxy_path, &uid) == GW_FALSE;
	
    if (useradd)
    {
    	rc = gw_user_pool_user_allocate (msg_submit->msg.owner, msg_submit->msg.proxy_path, &uid);
    	
    	if ( rc != 0 )
    	{
	        gw_log_print("DM",'E',"Could not register user %s.\n", msg_submit->msg.owner);
        
        	msg_submit->msg.rc = GW_RC_FAILED_USER;        
        	gw_am_trigger(gw_dm.rm_am,"GW_RM_SUBMIT",&(msg_submit->msg));        
        	return;
    	}	
    }
    
    rc = gw_array_pool_array_allocate(&(msg_submit->msg), msg_submit->msg.number_of_tasks, &array_id);
        
    if ( rc != 0 )
    {
      gw_log_print("DM",'E',"Could not allocate array.\n");
                
      msg_submit->msg.rc = GW_RC_FAILED_NO_MEMORY;
      gw_am_trigger(gw_dm.rm_am,"GW_RM_SUBMIT",&(msg_submit->msg));
                
      return;
    }

    array = gw_array_pool_get_array(array_id, GW_TRUE);

    if ( array == NULL )
    {
        msg_submit->msg.rc = GW_RC_FAILED_BAD_ARRAY_ID;
        gw_am_trigger(gw_dm.rm_am,"GW_RM_SUBMIT",&(msg_submit->msg));
        
        return;
    }
    
    init_state = msg_submit->msg.init_state;
    tasks      = msg_submit->msg.number_of_tasks;
    
    for (i=0; i<msg_submit->msg.number_of_tasks; i++)
    {
        jid   = array->job_ids[i];
        job   = gw_job_pool_get(jid, GW_TRUE);
        
       	fixed = msg_submit->msg.fixed_priority;
       	
        gw_job_fill(job, msg_submit);
        
        if ( job == NULL )
        {
            pthread_mutex_unlock(&(array->mutex));        
			
            msg_submit->msg.rc = GW_RC_FAILED;
            gw_am_trigger(gw_dm.rm_am,"GW_RM_SUBMIT",&(msg_submit->msg));
		    
            return;
        }        
    
    	/* --------- Set the initial state ---------- */
    
        if ( init_state == GW_JOB_STATE_PENDING )
            gw_job_set_state(job, GW_JOB_STATE_PENDING, GW_FALSE);
        else
            gw_job_set_state(job, GW_JOB_STATE_HOLD, GW_FALSE);
	    
        job->tm_state = GW_TM_STATE_INIT;
        job->em_state = GW_EM_STATE_INIT;        
        job->user_id  = uid;

        /* --------- Set the parameter values ---------- */
		
        job->pstart   = msg_submit->msg.pstart;
        job->pinc     = msg_submit->msg.pinc;
       
        /* --------- Set the initial priority ---------- */
    
        job->fixed_priority = msg_submit->msg.fixed_priority;
        
        pthread_mutex_unlock (&(job->mutex));    	
  
        /* ------------- Notify the scheduler ------------ */
    
        if ( init_state == GW_JOB_STATE_PENDING )
            gw_dm_mad_job_schedule(&gw_dm.dm_mad[0], jid, array_id,
                    uid, GW_REASON_NONE);
                      
        /* ------------- Set job dependencies ------------ */
        
        if ( msg_submit->jt.job_deps[0] != -1 )
            gw_job_pool_dep_set(jid, msg_submit->jt.job_deps);        
    }

    gw_user_pool_inc_jobs(uid,msg_submit->msg.number_of_tasks - useradd);
        
    pthread_mutex_unlock(&(array->mutex));
    
    msg_submit->msg.rc       = GW_RC_SUCCESS;    
    msg_submit->msg.array_id = array_id;  

    gw_am_trigger(gw_dm.rm_am,"GW_RM_SUBMIT",&(msg_submit->msg));
    
    gw_log_print("DM",'I',"New array %i allocated and initialized.\n",array_id);
}
Exemple #17
0
/*
 * gw_em_listener - Execution Manager listener loop.
 *
 * Enables asynchronous cancellation for the calling thread, then loops
 * forever: it waits with select() on the descriptor set built by
 * gw_user_pool_set_em_pipes(), reads one newline-terminated message from each
 * ready MAD pipe, parses it as "ACTION JOB_ID RESULT INFO" and reacts to the
 * SUBMIT, CANCEL, POLL and RECOVER actions, mostly by triggering
 * GW_EM_STATE_* actions on gw_em.am.
 *
 * arg is not referenced in the visible part of the function.
 *
 * NOTE(review): this listing ends inside the per-descriptor loop (the closing
 * braces of the function are missing), so how the job mutex is released and
 * how the loops end cannot be seen here.
 */
void gw_em_listener(void *arg)
{
    fd_set in_pipes;
    int i,j;
    int *job_id;
    int greater, rc, rcm;
    char c;

    char info[GW_EM_MAX_INFO];
    char s_job_id[GW_EM_MAX_JOB_ID];
    char result[GW_EM_MAX_RESULT];
    char action[GW_EM_MAX_ACTION];
    char str[GW_EM_MAX_STRING];

    int fd;
    gw_job_t *job;
    time_t now;

    char contact_file[PATH_MAX];
    FILE *file;

    gw_em_mad_t *em_mad;

    char *ptmp;

    int *fds;
    int num_fds;
    gw_em_mad_t **em_mads;

    /* Allow this thread to be cancelled at any point */
    pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL);
    pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL);

    /* Descriptor and MAD tables sized number_of_users * GW_MAX_MADS.
     * NOTE(review): neither allocation is checked for failure. */
    fds = (int *) malloc(sizeof(int)*gw_conf.number_of_users*GW_MAX_MADS);

    em_mads = (gw_em_mad_t **) malloc(sizeof(gw_em_mad_t *) *
                                      gw_conf.number_of_users * GW_MAX_MADS);
    while (1)
    {
        /* Rebuild the descriptor set; the return value is used as the highest
         * descriptor number for select() */
        greater = gw_user_pool_set_em_pipes(&in_pipes, fds, &num_fds, em_mads, gw_em.um_em_pipe_r);

        /* No timeout: blocks until at least one descriptor is readable */
        rc = select( greater+1, &in_pipes, NULL, NULL, NULL);

        if ( rc <= 0 )
            continue;

        for (i=0; i<num_fds; i++)
        {
            fd = fds[i];

            if ( FD_ISSET(fd, &in_pipes) )
            {
                /* Control pipe: consume one byte and go on to the next
                 * descriptor */
                if ( fd == gw_em.um_em_pipe_r )
                {
                    rc = read(fd, (void *) &c, sizeof(char));
#ifdef GWEMDEBUG
                    gw_log_print("EM",'D',"Updating MAD pipes (action is %c)\n",c);
#endif
                    continue;
                }

#ifdef GWEMDEBUG
                gw_log_print("EM",'D',"Reading from MAD pipe %i.\n",i);
#endif

                /* Read the message one byte at a time, up to a newline or a
                 * full buffer.
                 * NOTE(review): c is stored even when read() did not succeed. */
                j = 0;

                do
                {
                    rc = read(fd, (void *) &c, sizeof(char));
                    str[j++] = c;
                }
                while ((rc > 0) && (c != '\n') && (j < (GW_EM_MAX_STRING-1)));

                str[j] = '\0';

                /* Read error or end of file on the MAD pipe: try to reload
                 * the MAD and, if that works, recover its jobs */
                if (rc <= 0)
                {
                    gw_log_print("EM",'W',"Error reading MAD (%s) message\n",
                                 em_mads[i]->name);

                    rcm = gw_em_mad_reload(em_mads[i]);

                    if ( rcm == 0 )
                    {
                        gw_log_print("EM",'I',"MAD (%s) successfully reloaded\n",
                                     em_mads[i]->name);

                        gw_job_pool_em_recover(em_mads[i], &(gw_em.am));
                    }
                    else
                    {
                        gw_log_print("EM",'E',"Error reloading MAD (%s)\n",
                                     em_mads[i]->name);

                        em_mads[i]->mad_em_pipe = -1;
                    }
                    continue;
                }

                /* Split the message into its four fields.
                 * NOTE(review): the sscanf() result is not checked, so fields
                 * missing from a short message keep their previous contents. */
                sscanf(str,"%" GW2STR(GW_EM_MAX_ACTION) "s %"
                       GW2STR(GW_EM_MAX_JOB_ID) "s %"
                       GW2STR(GW_EM_MAX_RESULT) "s %"
                       GW2STR(GW_EM_MAX_INFO) "[^\n]",
                       action, s_job_id, result, info);

#ifdef GWEMDEBUG
                gw_log_print("EM",'D',"MAD message received:\"%s %s %s %s\".\n",
                             action, s_job_id, result, info);
#endif
                /* Messages whose job id starts with '-' are ignored */
                if (s_job_id[0] == '-')
                    continue;

                /* Heap copy of the id: it is passed as the argument of the
                 * triggered action, or freed here when nothing is triggered.
                 * NOTE(review): the allocation is not checked for failure. */
                job_id = (int *) malloc (sizeof(int));

                *job_id = atoi(s_job_id);

                job = gw_job_pool_get(*job_id, GW_TRUE);

                if (job == NULL)
                {
                    gw_log_print("EM",'W',"MAD message for job %s, but it does not exist: \"%s %s %s %s\".\n",
                                 s_job_id,action, s_job_id, result, info);
                    free(job_id);

                    continue;
                }
                /* Messages are only processed for jobs in one of the
                 * execution-related states, and not while a hard kill is in
                 * progress */
                if (job->job_state != GW_JOB_STATE_PRE_WRAPPER
                        && job->job_state != GW_JOB_STATE_WRAPPER
                        && job->job_state != GW_JOB_STATE_MIGR_CANCEL
                        && job->job_state != GW_JOB_STATE_STOP_CANCEL
                        && job->job_state != GW_JOB_STATE_KILL_CANCEL)
                {
                    gw_log_print("EM",'W',"MAD message for job %i but not in an execution state.\n",
                                 *job_id);

                    free(job_id);
                    pthread_mutex_unlock(&(job->mutex));
                    continue;
                }
                else if ( job->em_state == GW_EM_STATE_HARD_KILL )
                {
                    gw_log_print("EM",'W',"MAD message for job %i but it is being killed (hard).\n",
                                 *job_id);

                    free(job_id);
                    pthread_mutex_unlock(&(job->mutex));
                    continue;
                }

                /* ---------------------- SUBMIT ---------------------- */
                if (strcmp(action, "SUBMIT") == 0)
                {
                    if (strcmp(result, "FAILURE") == 0)
                    {
                        gw_job_print(job, "EM",'E',"Job submission failed: %s\n",
                                     info);
                        gw_log_print("EM",'E',"Submission of job %d failed: %s.\n",
                                     job->id, info);

                        gw_am_trigger(&(gw_em.am), "GW_EM_STATE_FAILED",
                                      (void *) job_id);
                    }
                    else /* Save persistent job contact */
                    {
                        snprintf(contact_file, PATH_MAX-1,
                                 GW_VAR_DIR "/%i/job.contact",
                                 job->id);

                        /* Failure to open the contact file is silently
                         * tolerated; the state change is triggered anyway */
                        file = fopen(contact_file, "w");

                        if ( file != NULL )
                        {
                            fprintf(file, "%s\n", info);
                            fclose(file);
                        }

                        gw_am_trigger(&(gw_em.am), "GW_EM_STATE_PENDING",
                                      (void *) job_id);
                    }
                }
                /* ---------------------- CANCEL ---------------------- */
                /* NOTE(review): job_id is neither freed nor handed to an
                 * action in this branch of the visible code; possible leak,
                 * confirm against the rest of the function. */
                else if (strcmp(action, "CANCEL") == 0)
                {
                    if (strcmp(result, "SUCCESS") == 0)
                    {
                        gw_job_print(job, "EM",'I',"Job cancel succeeded.\n");
                        gw_log_print("EM",'I',"Cancel of job %i succeeded.\n", *job_id);
                    }
                    else
                    {
                        gw_job_print(job, "EM",'E',"Job cancel failed (%s).\n",info);
                        gw_log_print("EM",'E',"Cancel of job %d failed: %s.\n",job->id, info);
                    }
                }
                /* ----------------------- POLL ----------------------- */
                else if (strcmp(action, "POLL") == 0)
                {
                    if (strcmp(result, "SUCCESS") == 0)
                    {
                        /* Next poll: half the poll interval plus a
                         * gw_rand()-based offset */
                        now = time(NULL);
                        job->next_poll_time = now + gw_conf.poll_interval/2
                                              + gw_rand(gw_conf.poll_interval);

                        /* NOTE(review): a successful poll is reported with
                         * severity 'E', and a time_t difference is printed
                         * with %d. */
                        gw_job_print(job, "EM",'E',"Job poll OK (%s), will poll again in %d seconds.\n",
                                     info, job->next_poll_time - now);

                        /* Map the reported state to the matching action.
                         * NOTE(review): an unrecognised info string triggers
                         * nothing and job_id is not freed in that case. */
                        if (strcmp(info, "PENDING") == 0)
                            gw_am_trigger(&(gw_em.am), "GW_EM_STATE_PENDING",
                                          (void *) job_id);

                        else if (strcmp(info, "SUSPENDED") == 0)
                            gw_am_trigger(&(gw_em.am), "GW_EM_STATE_SUSPENDED",
                                          (void *) job_id);

                        else if (strcmp(info, "ACTIVE") == 0)
                            gw_am_trigger(&(gw_em.am),"GW_EM_STATE_ACTIVE",
                                          (void *) job_id);

                        else if (strstr(info, "DONE") != NULL)
                        {
                            /* "DONE:<n>" carries the exit code */
                            ptmp = strstr(info,"DONE:");

                            if ((ptmp != NULL) && (strlen(ptmp+5) > 0))/*No-wrapper mode*/
                                job->exit_code=atoi(ptmp+5);

                            gw_am_trigger(&(gw_em.am), "GW_EM_STATE_DONE",
                                          (void *) job_id);
                        }
                        else if (strcmp(info, "FAILED") == 0)
                            gw_am_trigger(&(gw_em.am), "GW_EM_STATE_FAILED",
                                          (void *) job_id);
                    }
                    else
                    {
                        job->history->failed_polls++;

                        /* NOTE(review): em_mad is assigned but not used in the
                         * visible part of the function. */
                        em_mad = job->history->em_mad;

                        /* When the failed poll count reaches 3 the job is
                         * treated as done */
                        if ( job->history->failed_polls == 3 )
                        {
                            gw_job_print(job, "EM",'E',"Job poll failed (%s), assuming the job is done.\n",info);

                            gw_am_trigger(&(gw_em.am), "GW_EM_STATE_DONE", (void *) job_id);
                        }
                        else
                        {
                            /* Back off: the delay grows with the number of
                             * failed polls */
                            now = time(NULL);
                            job->next_poll_time = now + gw_conf.poll_interval*job->history->failed_polls
                                                  + gw_rand(gw_conf.poll_interval*job->history->failed_polls);

                            gw_job_print(job, "EM",'E',"Job poll failed (%s), will poll again in %d seconds.\n",
                                         info, job->next_poll_time - now);

                            free(job_id);
                        }
                    }
                }
                /* ---------------------- RECOVER --------------------- */
                else if (strcmp(action, "RECOVER") == 0)
                {
                    if (strcmp(result, "SUCCESS") == 0)
                    {
                        if (strcmp(info, "PENDING") == 0)
                            gw_am_trigger(&(gw_em.am), "GW_EM_STATE_PENDING",
                                          (void *) job_id);
                        else if (strcmp(info, "SUSPENDED") == 0)
                            gw_am_trigger(&(gw_em.am), "GW_EM_STATE_SUSPENDED",
                                          (void *) job_id);
                        else if (strcmp(info, "ACTIVE") == 0)
                            gw_am_trigger(&(gw_em.am),"GW_EM_STATE_ACTIVE",
                                          (void *) job_id);
                        else if (strstr(info, "DONE") != NULL)
                        {
                            ptmp = strstr(info,"DONE:");

                            if ((ptmp != NULL) && (strlen(ptmp+5) > 0))/*No-wrapper mode*/
                                job->exit_code = atoi(ptmp+5);

                            gw_am_trigger(&(gw_em.am), "GW_EM_STATE_DONE",
                                          (void *) job_id);
                        }
                        else if (strcmp(info, "FAILED") == 0)
                        {
                            /* Do not retry */
                            job->history->tries= job->template.number_of_retries;

                            gw_am_trigger(&(gw_em.am), "GW_EM_STATE_FAILED",
                                          (void *) job_id);
                        }
                    }
                    else
                    {
                        gw_job_print(job,"EM",'E',"Job recover failed (%s), assuming the job is done.\n", info);
                        gw_log_print("EM",'E',"Recover of job %i failed.\n", *job_id);

                        gw_am_trigger(&(gw_em.am), "GW_EM_STATE_DONE", (void *) job_id);
                    }
                }
Exemple #18
0
/*
 * gw_dm_wrapper_done_cb - callback run when a (pre-)wrapper execution ends.
 *
 * _job_id points to a heap-allocated job id; a NULL pointer is ignored.
 * Depending on the current job state the exit time is recorded, the timing
 * statistics are written to the job log, the host slot is released and the
 * next dispatch-manager transition is triggered:
 *
 *   PRE_WRAPPER -> GW_DM_STATE_WRAPPER
 *   WRAPPER     -> GW_DM_STATE_EPILOG_STD
 *   STOP_CANCEL -> GW_DM_STATE_STOP_EPILOG
 *   KILL_CANCEL -> GW_DM_STATE_KILL_EPILOG
 *   MIGR_CANCEL -> GW_DM_STATE_MIGR_PROLOG (uses the previous history record)
 *
 * _job_id is handed to the triggered action; it is freed here on the paths
 * that trigger nothing (unknown job, unexpected state, missing previous
 * history record).
 */
void gw_dm_wrapper_done_cb ( void *_job_id )
{
    gw_job_t *     job;
    int            job_id;
    time_t         total;
    time_t         active;
    time_t         suspension;

    /* ----------------------------------------------------------- */
    /* 0.- Get job pointer                                         */
    /* ----------------------------------------------------------- */

    if ( _job_id == NULL )
        return;

    job_id = *( (int *) _job_id );

    job = gw_job_pool_get(job_id, GW_TRUE);

    if ( job == NULL )
    {
        gw_log_print("DM",'E',"Job %i does not exist (JOB_STATE_WRAPPER_DONE_CB).\n",job_id);

        free(_job_id);
        return;
    }

    /* ----------------------------------------------------------- */
    /* 1.- Set execution times & state transition                  */
    /* ----------------------------------------------------------- */

    job->em_state = GW_EM_STATE_INIT;

    /* The time_t statistics are cast to int when printed: the messages use
     * %i, and passing a wider time_t for %i is undefined behaviour. */

    switch (job->job_state)
    {
        case GW_JOB_STATE_PRE_WRAPPER:

            /* --------------- Update pre-wrapper stats -------------------- */

            job->history->stats[PRE_WRAPPER_EXIT_TIME] = time(NULL);

            total      = gw_job_history_get_pre_wrapper_time(job->history);
            active     = job->history->stats[ACTIVE_TIME];
            suspension = job->history->stats[SUSPENSION_TIME];

            gw_job_print(job,"DM",'I',"Pre-Wrapper DONE:\n");
            gw_job_print(job,"DM",'I',"\tActive time     : %i\n", (int) active);
            gw_job_print(job,"DM",'I',"\tSuspension time : %i\n", (int) suspension);
            gw_job_print(job,"DM",'I',"\tTotal time      : %i\n", (int) total);

            /* -------------- Transition to Wrapper state ------------------ */

            gw_am_trigger(&(gw_dm.am), "GW_DM_STATE_WRAPPER", _job_id);
            break;

        case GW_JOB_STATE_WRAPPER:

            /* ----------------- Update wrapper stats ---------------------- */

            job->history->stats[WRAPPER_EXIT_TIME] = time(NULL);

            total      = gw_job_history_get_wrapper_time(job->history);
            active     = job->history->stats[ACTIVE_TIME];
            suspension = job->history->stats[SUSPENSION_TIME];

            gw_job_print(job,"DM",'I',"Wrapper DONE:\n");
            gw_job_print(job,"DM",'I',"\tActive time     : %i\n", (int) active);
            gw_job_print(job,"DM",'I',"\tSuspension time : %i\n", (int) suspension);
            gw_job_print(job,"DM",'I',"\tTotal time      : %i\n", (int) total);

            /* -------------- Free used slot from this host -------------- */

            gw_host_dec_uslots(job->history->host);

            /* ---------- We do not need to re-schedule this job --------- */

            if ( job->reschedule == GW_TRUE )
            {
                job->reschedule = GW_FALSE;
                gw_dm_mad_job_del(&gw_dm.dm_mad[0],job->id);
            }

            /* -------------- Transition to Epilog state ------------------ */

            gw_am_trigger(&(gw_dm.am), "GW_DM_STATE_EPILOG_STD", _job_id);
            break;

        case GW_JOB_STATE_STOP_CANCEL:

            /* ----------------- Update wrapper stats ---------------------- */

            job->history->stats[WRAPPER_EXIT_TIME] = time(NULL);

            total      = gw_job_history_get_wrapper_time(job->history);
            active     = job->history->stats[ACTIVE_TIME];
            suspension = job->history->stats[SUSPENSION_TIME];

            gw_job_print(job,"DM",'I',"Wrapper CANCELED:\n");
            gw_job_print(job,"DM",'I',"\tActive time     : %i\n", (int) active);
            gw_job_print(job,"DM",'I',"\tSuspension time : %i\n", (int) suspension);
            gw_job_print(job,"DM",'I',"\tTotal time      : %i\n", (int) total);

            /* -------------- Free used slot from this host -------------- */

            gw_host_dec_uslots(job->history->host);

            /* ------------ Transition to Stop Epilog state --------------- */

            gw_am_trigger(&(gw_dm.am), "GW_DM_STATE_STOP_EPILOG", _job_id);
            break;

        case GW_JOB_STATE_KILL_CANCEL:

            /* ----------------- Update wrapper stats ---------------------- */

            job->history->stats[WRAPPER_EXIT_TIME] = time(NULL);

            total      = gw_job_history_get_wrapper_time(job->history);
            active     = job->history->stats[ACTIVE_TIME];
            suspension = job->history->stats[SUSPENSION_TIME];

            gw_job_print(job,"DM",'I',"Wrapper CANCELED:\n");
            gw_job_print(job,"DM",'I',"\tActive time     : %i\n", (int) active);
            gw_job_print(job,"DM",'I',"\tSuspension time : %i\n", (int) suspension);
            gw_job_print(job,"DM",'I',"\tTotal time      : %i\n", (int) total);

            /* -------------- Free used slot from this host -------------- */

            gw_host_dec_uslots(job->history->host);

            /* ------------ Transition to Kill Epilog state ---------------- */

            gw_am_trigger(&(gw_dm.am), "GW_DM_STATE_KILL_EPILOG", _job_id);
            break;

        case GW_JOB_STATE_MIGR_CANCEL:

            /* The cancelled wrapper belongs to the previous history record;
             * without it there is nothing to update and no slot to release. */
            if ( job->history->next == NULL )
            {
                gw_log_print("DM",'E',"Previous history record of job %i no longer exists\n", job_id);

                free(_job_id);
                break;
            }

            /* ----------- Update previous wrapper stats ------------------- */

            job->history->next->stats[WRAPPER_EXIT_TIME] = time(NULL);

            active     = job->history->next->stats[ACTIVE_TIME];
            suspension = job->history->next->stats[SUSPENSION_TIME];

            gw_job_print(job,"DM",'I',"Wrapper CANCELED:\n");
            gw_job_print(job,"DM",'I',"\tActive time     : %i\n", (int) active);
            gw_job_print(job,"DM",'I',"\tSuspension time : %i\n", (int) suspension);

            /* -------------- Free used slot from previous host ------------ */

            gw_host_dec_uslots(job->history->next->host);

            /* ---------- Transition to Migration Prolog state ------------ */

            gw_am_trigger(&(gw_dm.am), "GW_DM_STATE_MIGR_PROLOG", _job_id);
            break;

        default:
            gw_log_print("DM",'E',"Wrapper done callback in wrong job (%i) state.\n", job_id);

            free(_job_id);
            break;
    }

    pthread_mutex_unlock(&(job->mutex));
}
Exemple #19
0
void gw_em_submit(void *_job_id)
{
    int           job_id;
    gw_job_t      *job;
    char          *rsl=NULL;
    char          *contact;
    gw_job_state_t state;
    char          rsl_filename[2048];
    FILE          *fd;
    time_t        now;
	
    /* ----------------------------------------------------------- */  
    /* 0.- Get job pointer, check if it exits and lock mutex       */
    /* ----------------------------------------------------------- */  

    if ( _job_id != NULL )
    {
        job_id = *( (int *) _job_id );	
        job = gw_job_pool_get(job_id, GW_TRUE);
	
        if ( job == NULL )
        {
            gw_log_print("EM",'E',"Job %s no longer exists (PENDING).\n",
                    job_id);
            return;
        }
    }
    else
        return;

    if (job->history == NULL) 
    {
        gw_log_print("EM",'E',"History of job %s doesn't exists\n",
                job_id);
        free(_job_id);
        pthread_mutex_unlock(&(job->mutex));				
        return;
    }

    state = job->job_state;
	
    /* ----------------------------------------------------------- */  
    /* 1.- Get execution MAD for this host                         */
    /* ----------------------------------------------------------- */  

    job->em_state = GW_EM_STATE_INIT;
    job->history->counter = -1;    

    if ( job->job_state == GW_JOB_STATE_PRE_WRAPPER )
    {
        contact = job->history->em_fork_rc;
        rsl     = (char *) job->history->em_mad->pre_wrapper_rsl((void *) job);    
    }
    else
    {
        contact = job->history->em_rc;
        rsl     = (char *) job->history->em_mad->wrapper_rsl((void *) job);    		
    }
	
    if ( rsl == NULL )
    {
        job->em_state = GW_EM_STATE_FAILED;
        
        gw_log_print("EM",'E',"Job %i failed, could not generate RSL.\n", job_id);        
		gw_am_trigger(gw_em.dm_am, "GW_DM_WRAPPER_FAILED", _job_id);
        
        pthread_mutex_unlock(&(job->mutex));
        
        return;
    }
    
    sprintf(rsl_filename, "%s/job.rsl.%i", job->directory,job->restarted);
    
    fd = fopen(rsl_filename,"w");
    if (fd != NULL )
    {
        gw_job_print(job,"EM",'I',"Submitting wrapper to %s, RSL used is in %s.\n",contact,rsl_filename);
    	fprintf(fd,"%s",rsl);
    	fclose(fd);
    }
    else
    {
        job->em_state = GW_EM_STATE_FAILED;
        
        gw_log_print("EM",'E',"Job %i failed, could not open RSL file.\n", job_id);
        gw_job_print(job,"EM",'E',"Job failed, could not open RSL file %s.\n",rsl_filename);
		
        gw_am_trigger(gw_em.dm_am, "GW_DM_WRAPPER_FAILED", _job_id);
        
        pthread_mutex_unlock(&(job->mutex));
        
        return;    	
    }

    /* -------------------------------------------------------------------- */

    now = time(NULL);

    job->next_poll_time = now + gw_conf.poll_interval/2
            + gw_rand(gw_conf.poll_interval);            /* randomize polls */

    gw_job_print(job,"EM",'I',"Job will be polled in %d seconds.\n",
            job->next_poll_time-now);

    job->last_checkpoint_time = 0;
	
    job->history->stats[LAST_SUSPENSION_TIME] = now;
    job->history->stats[SUSPENSION_TIME]      = 0;
    job->history->stats[ACTIVE_TIME]          = 0;

    job->history->tries++;
        
    /* -------------------------------------------------------------------- */
    
    pthread_mutex_unlock(&(job->mutex));
    
    gw_em_mad_submit(job->history->em_mad, job_id, contact, rsl_filename);

    /* -------------------------------------------------------------------- */
    
    free(_job_id);
    
    free(rsl);
}
void gw_dm_jalloc     (void *_msg)
{
    gw_msg_t *   msg;
    gw_job_t *   job;
    int          jid;
    int          uid;
    int          rc;
    
	gw_boolean_t   useradd;    
    gw_job_state_t init_state;
	
    msg  = (gw_msg_t *) _msg;

	/* ------------- Check if user is already registered ------------ */
	    
	useradd = gw_user_pool_exists (msg->owner, &uid) == GW_FALSE;
	
    if (useradd)
    {
    	rc = gw_user_pool_user_allocate (msg->owner, &uid);
    	
    	if ( rc != 0 )
    	{
	        gw_log_print("DM",'E',"Could not register user %s.\n", msg->owner);
        
        	msg->rc = GW_RC_FAILED_USER;        
        	gw_am_trigger(gw_dm.rm_am,"GW_RM_SUBMIT",_msg);        
        	return;
    	}
    }

	/* ------------- Allocate job structure ------------ */
	    
    jid = gw_job_pool_allocate();
    
    if ( jid == -1 )
    {
        gw_log_print("DM",'E',"Could not allocate job.\n");
        
        msg->rc = GW_RC_FAILED;        
        gw_am_trigger(gw_dm.rm_am,"GW_RM_SUBMIT",_msg);        
        return;
    }
    
	/* ------------------------------------------ */
	/*               Update job data              */
	/* ------------------------------------------ */
		
    job = gw_job_pool_get(jid, GW_TRUE);
    
    if ( job == NULL )
    {
       	msg->rc = GW_RC_FAILED;
		gw_am_trigger(gw_dm.rm_am,"GW_RM_SUBMIT",_msg);        
    	
      	return;
    }  
    
    /* ------ Fill data using the template ------- */
    
    rc = gw_job_fill(job, msg);

    if ( rc == -1 )
    {
        gw_log_print("DM",'E',"Could not initialize job.\n");

        pthread_mutex_unlock(&(job->mutex));
        gw_job_pool_free(jid);
        
        msg->rc = GW_RC_FAILED;
        gw_am_trigger(gw_dm.rm_am,"GW_RM_SUBMIT",_msg);
        return;
    }
    
    /* --------- Set the initial state ---------- */
    
    init_state = msg->init_state;

    if ( init_state == GW_JOB_STATE_PENDING )
	    gw_job_set_state(job, GW_JOB_STATE_PENDING, GW_FALSE);
	else
	    gw_job_set_state(job, GW_JOB_STATE_HOLD, GW_FALSE);
	    
    job->tm_state  = GW_TM_STATE_INIT;
    job->em_state  = GW_EM_STATE_INIT;
    job->user_id   = uid;
    
    pthread_mutex_unlock(&(job->mutex));
    
    if (!useradd)
 	   gw_user_pool_inc_jobs(uid,1);
 	   
    if ( msg->jt.job_deps[0] != -1 )
    	gw_job_pool_dep_set(jid, msg->jt.job_deps); 	   
	
	/* ------------- Callback msg ------------ */
	
    msg->rc       = GW_RC_SUCCESS;
    msg->array_id = -1;    
    msg->job_id   = jid;
    
    gw_am_trigger(gw_dm.rm_am,"GW_RM_SUBMIT",_msg);
    
    /* ------------- Notify the scheduler ------------ */
    
    if ( init_state == GW_JOB_STATE_PENDING )
        gw_dm_mad_job_schedule(&gw_dm.dm_mad[0],
                               jid,
                               -1,
                               GW_REASON_NONE,
                               job->nice,
                               uid);

    gw_log_print("DM",'I',"New job %i allocated and initialized.\n", jid);
}
void gw_dm_aalloc     (void *_msg)
{
    gw_msg_t         *msg;
    gw_array_t       *array;
    gw_job_t         *job;
    gw_boolean_t     useradd;
    gw_job_state_t   init_state;
    int              tasks;
    
    int  rc;
    int  array_id, i, jid, uid;
    
    msg  = (gw_msg_t *) _msg;

	useradd = gw_user_pool_exists (msg->owner, &uid) == GW_FALSE;
	
    if (useradd)
    {
    	rc = gw_user_pool_user_allocate (msg->owner, &uid);
    	
    	if ( rc != 0 )
    	{
	        gw_log_print("DM",'E',"Could not register user %s.\n", msg->owner);
        
        	msg->rc = GW_RC_FAILED_USER;        
        	gw_am_trigger(gw_dm.rm_am,"GW_RM_SUBMIT",_msg);        
        	return;
    	}	
    }
    
    rc = gw_array_pool_array_allocate(msg, msg->number_of_tasks, &array_id);
        
    if ( rc != 0 )
    {
      gw_log_print("DM",'E',"Could not allocate array.\n");
                
      msg->rc = GW_RC_FAILED;        
      gw_am_trigger(gw_dm.rm_am,"GW_RM_SUBMIT",_msg);
                
      return;
    }

    array = gw_array_pool_get_array(array_id, GW_TRUE);

    if ( array == NULL )
    {
        msg->rc = GW_RC_FAILED;
		gw_am_trigger(gw_dm.rm_am,"GW_RM_SUBMIT",_msg);
        
        return;
    }
    
    init_state = msg->init_state;
    tasks      = msg->number_of_tasks;
    
    for (i=0; i<msg->number_of_tasks; i++)
    {
        jid = array->job_ids[i];
        job = gw_job_pool_get(jid, GW_TRUE);
        
        gw_job_fill(job, msg);
        
        if ( job == NULL )
        {
            pthread_mutex_unlock(&(array->mutex));        
			
			msg->rc = GW_RC_FAILED;
		    gw_am_trigger(gw_dm.rm_am,"GW_RM_SUBMIT",_msg);
		    
            return;
        }        

	    if ( init_state == GW_JOB_STATE_PENDING )
		    gw_job_set_state(job, GW_JOB_STATE_PENDING, GW_FALSE);
		else
		    gw_job_set_state(job, GW_JOB_STATE_HOLD, GW_FALSE);
	    
  	    job->tm_state = GW_TM_STATE_INIT;
	    job->em_state = GW_EM_STATE_INIT;        
        job->user_id  = uid;

        job->pstart   = msg->pstart;
        job->pinc     = msg->pinc;
        
        pthread_mutex_unlock (&(job->mutex));    	
        
	    if ( msg->jt.job_deps[0] != -1 )
	    	gw_job_pool_dep_set(jid, msg->jt.job_deps);        
    }

    gw_user_pool_inc_jobs(uid,msg->number_of_tasks - useradd);
        
    pthread_mutex_unlock(&(array->mutex));
    
    msg->rc       = GW_RC_SUCCESS;    
    msg->array_id = array_id;  

    gw_am_trigger(gw_dm.rm_am,"GW_RM_SUBMIT",_msg);
    
    gw_log_print("DM",'I',"New array %i allocated and initialized.\n",array_id);
    
    if ( init_state == GW_JOB_STATE_PENDING )
        gw_dm_mad_array_schedule(&gw_dm.dm_mad[0], 
                                 -1,
                                 array_id,
                                 GW_REASON_NONE,
                                 0,
                                 uid,
                                 tasks);
}
/* Example #22 */
/* 0 */
/*
 * Dispatch-manager action: move a job to the ZOMBIE state once its epilog
 * has finished.
 *
 * _job_id points to an int holding the job id; a NULL pointer is ignored.
 *
 *  - GW_JOB_STATE_EPILOG: the job is marked as done, its history and usage
 *    are reported, user/host running-job counters are decremented, the
 *    scheduler MAD is notified and dependencies on this job are checked.
 *  - GW_JOB_STATE_KILL_EPILOG: the job is marked as killed, its job.conf is
 *    removed, the job is freed (and its array, when the last task goes
 *    away) and "GW_RM_KILL_SUCCESS" is triggered.
 *  - any other state: an error is logged.
 *
 * _job_id is freed here when the job does not exist, when the state is
 * unexpected, and in the EPILOG case when no client is waiting and
 * gw_conf.dispose is not GW_TRUE. Otherwise it is passed on to the
 * triggered action (presumably the receiver then owns it -- confirm).
 */
void gw_dm_zombie ( void *_job_id )
{
    gw_job_t *   job;
    gw_array_t * array;
    int          job_id;
    int          task_id;
    int          array_id;
    int          rt;
    int          len;
    char         conf_filename[2048];
    time_t       prolog, epilog;

    /* ----------------------------------------------------------- */  
    /* 0.- Get job pointer                                         */
    /* ----------------------------------------------------------- */  
    
    if ( _job_id == NULL )
        return;

    /* Local copy: job_id stays valid after _job_id is freed or handed over */
    job_id = *( (int *) _job_id );

    job = gw_job_pool_get(job_id, GW_TRUE);

    if ( job == NULL )
    {
        gw_log_print("DM",'E',"Job %i does not exist (JOB_STATE_ZOMBIE).\n",job_id);

        free(_job_id);
        return;
    }

    /* ----------------------------------------------------------- */  
    /* 1.- Update Job state                                        */
    /* ----------------------------------------------------------- */  
    
    switch (job->job_state)
    {
        case GW_JOB_STATE_EPILOG:
        
            gw_job_set_state(job, GW_JOB_STATE_ZOMBIE, GW_FALSE);
       
            gw_log_print("DM",'I',"Job %i done, with exit code %i.\n",job->id, job->exit_code);

            job->history->reason = GW_REASON_NONE;
            job->exit_time       = time(NULL);

            /* ------------- Print job history and send usage ------------ */

            gw_job_print(job,"DM",'I',"Job done, history:\n");            
            gw_job_print_history(job);
            gw_job_send_usage(job);
                                
            /* Wake up a waiting client, or dispose of the job automatically
             * when configured to do so; otherwise the id is not needed
             * any more. */
            if ( job->client_waiting > 0 )
                gw_am_trigger(gw_dm.rm_am,"GW_RM_WAIT_SUCCESS", _job_id);
            else
            {
                if (gw_conf.dispose == GW_TRUE)
                    gw_am_trigger(&(gw_dm.am), "GW_DM_KILL", _job_id);
                else
                    free(_job_id);
            }

            /* -------- Update User & Host running jobs -------- */
            
            gw_user_pool_dec_running_jobs(job->user_id);
            
            gw_host_dec_rjobs(job->history->host);
            
            /* --------       Notify the scheduler      -------- */
                                  
            prolog = gw_job_history_get_prolog_time(job->history);
            epilog = gw_job_history_get_epilog_time(job->history);

            gw_dm_mad_job_success(&gw_dm.dm_mad[0],
                    job->history->host->host_id,
                    job->user_id,
                    (prolog + epilog),
                    job->history->stats[SUSPENSION_TIME],
                    job->history->stats[ACTIVE_TIME]);
                           
            pthread_mutex_unlock(&(job->mutex));

            /* -------- Update other jobs dependencies -------- */
            /* Done after releasing the job mutex, using the local id copy */
            gw_job_pool_dep_check(job_id);

            break;
                    
        case GW_JOB_STATE_KILL_EPILOG:
            
            gw_job_set_state(job, GW_JOB_STATE_ZOMBIE, GW_FALSE);            

            job->exit_time = time(NULL);

            /* ------------- Print job history and send usage ------------ */
            
            gw_job_print(job,"DM",'I',"Job killed, history:\n");
            gw_job_print_history(job);
            gw_job_send_usage(job);

            /* ---------------- Free job & Notify RM ---------------- */
            
            /* Saved now: the job structure is not used after it is freed */
            array_id = job->array_id;
            task_id  = job->task_id;            

            /* -------- Update User & Host running jobs -------- */
           
            gw_user_pool_dec_running_jobs(job->user_id);

            gw_host_dec_rjobs(job->history->host);
            
            /* Bounded formatting: an over-long job directory must not
             * overflow conf_filename, and a truncated path must not be
             * unlinked. */
            len = snprintf(conf_filename, sizeof(conf_filename),
                           "%s/job.conf", job->directory);

            if ( len < 0 || (size_t) len >= sizeof(conf_filename) )
                gw_log_print("DM",'W',
                        "Path of job.conf for job %i is too long, file not removed.\n",
                        job_id);
            else
                unlink(conf_filename);    

            pthread_mutex_unlock(&(job->mutex));

            /* ------------------------------------------------- */            
            
            gw_job_pool_free(job_id);
            
            gw_log_print("DM",'I',"Job %i killed and freed.\n", job_id);        

            /* array_id == -1 means the job is not a task of an array */
            if (array_id != -1)
            {
                array = gw_array_pool_get_array(array_id,GW_TRUE);
            
                if ( array != NULL )
                {                        
                    rt = gw_array_del_task(array,task_id);
                    pthread_mutex_unlock(&(array->mutex));

                    /* rt == 0: presumably no tasks remain in the array */
                    if (rt == 0)
                    {
                        gw_array_pool_array_free(array_id);
                        gw_log_print("DM",'I',"Array %i freed\n",array_id);
                    }
                }
                else
                    gw_log_print("DM",'E',"Could not delete task %i from array %i.\n",
                            task_id, array_id);
            }
            
            gw_am_trigger(gw_dm.rm_am,"GW_RM_KILL_SUCCESS", _job_id);
            break;

        default:
            gw_log_print("DM",'E',"Zombie callback in wrong job (%i) state.\n", job_id);

            free(_job_id);
            
            pthread_mutex_unlock(&(job->mutex));
            break;
    }
}