示例#1
0
/*
 * GetJobList - pull the job list out of a '/'-separated command buffer.
 *
 * buffer is split on '/' with strtoken().  When a second field is present,
 * a strdup()'d copy of it is stored in *joblist_string and cut at its last
 * '\n' and then at its last '\r'.  When there is no second field,
 * *joblist_string is not written.
 *
 * sysfatal() is called if strdup() fails.  Always returns 0.
 */
int
GetJobList(char *buffer, char **joblist_string)
{
	char **fields;
	char *eol = NULL;
	int nfields = strtoken(buffer, '/', &fields);

	if (fields[1] != NULL) {
		*joblist_string = strdup(fields[1]);
		if (*joblist_string == NULL) {
			sysfatal("strdup failed for joblist_string in GetJobList: %r");
		}

		/* Drop a trailing line terminator, LF first and then CR. */
		eol = strrchr(*joblist_string, '\n');
		if (eol != NULL) {
			*eol = '\0';
		}
		eol = strrchr(*joblist_string, '\r');
		if (eol != NULL) {
			*eol = '\0';
		}
	}

	freetoken(&fields, nfields);

	return 0;
}
示例#2
0
/*
 * Append piece to the NUL-terminated string held in buf without ever using
 * more than cap bytes of buf (terminator included); excess text is dropped.
 */
static void
classad_append(char *buf, size_t cap, const char *piece)
{
	size_t used = strlen(buf);

	if (piece == NULL || used + 1 >= cap) {
		return;
	}
	snprintf(buf + used, cap - used, "%s", piece);
}

/*
 * ComposeClassad - render a registry entry as a one-line classad string.
 *
 * The result always starts with BatchJobId, JobStatus and ChangeTime and is
 * closed by "]\n".  Optional attributes are added when available:
 *   WorkerNode               when en->wn_addr is non-empty
 *   JwExitCode and Reason    when en->status is 3 or 4
 *   ExitReason               when en->exitreason is non-empty
 *   ClientJobId              second '_'-separated field of en->user_prefix
 *   BlahJobName              en->user_prefix, when non-empty
 *
 * Returns a calloc()'d buffer of STR_CHARS bytes that the caller must
 * free().  Text that does not fit is truncated; two bytes are held back so
 * the closing "]\n" is always written.  sysfatal() is called if the buffer
 * cannot be allocated.
 */
char *
ComposeClassad(job_registry_entry *en)
{
	/* Capacity available to everything except the closing "]\n". */
	const size_t body_cap = STR_CHARS - 2;
	char *strudate=NULL;
	char *buffer=NULL;
	char *wn=NULL;
	char *excode=NULL;
	char *exreas=NULL;
	char *blahid=NULL;
	char *clientid=NULL;
	int  maxtok;
	char **tbuf;
	char *cp=NULL;

	if((buffer=calloc(STR_CHARS,1)) == 0){
		sysfatal("can't malloc buffer in ComposeClassad: %r");
	}

	strudate=iepoch2str(en->udate);
	snprintf(buffer,body_cap,"[BatchJobId=\"%s\"; JobStatus=%d; ChangeTime=\"%s\";",en->batch_id, en->status, strudate);
	free(strudate);

	if (strlen(en->wn_addr) > 0){
		wn=make_message(" WorkerNode=\"%s\";",en->wn_addr);
		classad_append(buffer,body_cap,wn);
		free(wn);
	}
	if (en->status == 3 || en->status == 4){
		excode=make_message(" JwExitCode=%d; Reason=\"reason=%d\";", en->exitcode, en->exitcode);
		classad_append(buffer,body_cap,excode);
		free(excode);
	}
	if (strlen(en->exitreason) > 0){
		exreas=make_message(" ExitReason=\"%s\";", en->exitreason);
		classad_append(buffer,body_cap,exreas);
		free(exreas);
	}
	if (strlen(en->user_prefix) > 0){
		maxtok=strtoken(en->user_prefix,'_',&tbuf);
		if(maxtok > 1 && tbuf[1]){
			/* Drop a trailing line terminator from the client id. */
			if ((cp = strrchr (tbuf[1], '\n')) != NULL){
				*cp = '\0';
			}
			if ((cp = strrchr (tbuf[1], '\r')) != NULL){
				*cp = '\0';
			}
			clientid=make_message(" ClientJobId=\"%s\";",tbuf[1]);
		}
		/* A prefix without a second field simply has no ClientJobId;
		   never hand a NULL pointer to the %s conversion. */
		blahid=make_message("%s BlahJobName=\"%s\";",clientid ? clientid : "", en->user_prefix);
		classad_append(buffer,body_cap,blahid);
		free(blahid);
		freetoken(&tbuf,maxtok);
		free(clientid);
	}
	classad_append(buffer,STR_CHARS,"]\n");

	return buffer;

}
示例#3
0
/*
 * NotifyStart - read the notification start date from a command buffer.
 *
 * buffer is split on '/'.  The second field, with a trailing '\n' / '\r'
 * removed, is converted with str2epoch(..., "S") and the result is stored
 * in *lastnotiftime.
 *
 * When the buffer carries no second field there is no date to convert:
 * *lastnotiftime is left untouched instead of handing a NULL string to
 * str2epoch().
 *
 * sysfatal() is called if strdup() fails.  Always returns 0.
 */
int 
NotifyStart(char *buffer, time_t *lastnotiftime)
{

	int  maxtok;
	char **tbuf;
	char *cp=NULL;
	char *notifdate=NULL;

	maxtok = strtoken(buffer,'/',&tbuf);

	if(maxtok > 1 && tbuf[1]){
		notifdate=strdup(tbuf[1]);
		if(notifdate == NULL){
			sysfatal("strdup failed for notifdate in NotifyStart: %r");
		}
		if ((cp = strrchr (notifdate, '\n')) != NULL){
			*cp = '\0';
		}
		if ((cp = strrchr (notifdate, '\r')) != NULL){
			*cp = '\0';
		}
	}

	freetoken(&tbuf,maxtok);

	if(notifdate == NULL){
		/* No date in the command: keep the previous notification time. */
		return 0;
	}

	*lastnotiftime = str2epoch(notifdate,"S");
	free(notifdate);

	return 0;
}
示例#4
0
int 
GetFilter(char *buffer, const int conn_c, char **creamfilter)
{

	int  maxtok;
	char **tbuf;
	char *cp=NULL;
	char * out_buf;

	maxtok = strtoken(buffer,'/',&tbuf);

	if(tbuf[1]){
		*creamfilter = make_message("%s",tbuf[1]);
		if(*creamfilter == NULL){
			sysfatal("strdup failed for creamfilter in GetFilter: %r");
		}
		if ((cp = strrchr (*creamfilter, '\n')) != NULL) {
			*cp = '\0';
		} 
		if ((cp = strrchr (*creamfilter, '\r')) != NULL) {
			*cp = '\0';
		}
		out_buf = make_message("CREAMFILTER set to %s\n", *creamfilter);

	} else {
		out_buf = make_message("CREAMFILTER ERROR\n");
	}
		
	Writeline(conn_c, out_buf, strlen(out_buf));

	do_log(debuglogfile, debug, 1, "Sent Reply for CREAMFILTER command:%s\n",out_buf);

	freetoken(&tbuf,maxtok);
	free(out_buf);
	
	return 0;
}
示例#5
0
int AssignState (char *element, char *status, char *exit, char *reason, char *wn, char *udate){
    char **id_element;
    job_registry_entry en;
    time_t now;
    char *string_now=NULL;
    int i=0;
    int n=strtoken(element, '.', &id_element);
    int iret;
    
    if(id_element[0]){
	JOB_REGISTRY_ASSIGN_ENTRY(en.batch_id,id_element[0]);
	en.status=atoi(status);
	en.exitcode=atoi(exit);
	JOB_REGISTRY_ASSIGN_ENTRY(en.wn_addr,wn);
	JOB_REGISTRY_ASSIGN_ENTRY(en.exitreason,reason);
	now=time(0);
	string_now=make_message("%d",now);
	JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now)
	en.udate=now;
	free(string_now);
    }else{
	if((element=calloc(STR_CHARS,1)) == 0){
	    sysfatal("can't malloc cmd in GetAndSend: %r");
	}
    }
    if ((iret=job_registry_update(rha, &en)) < 0){
	fprintf(stderr,"Update of record returns %d: \nJobId: %d", iret,en.batch_id);
	perror("");
    }else{
	if (en.status == REMOVED || en.status == COMPLETED){
	    job_registry_unlink_proxy(rha, &en);
	}
    }
    freetoken(&id_element,n);
    return 0;
}
示例#6
0
int
FinalStateQuery(char *input_string, int logs_to_read)
{
/*
tracejob -m -l -a <jobid>
In line:

04/23/2008 11:50:43  S    Exit_status=0 resources_used.cput=00:00:01 resources_used.mem=11372kb resources_used.vmem=52804kb
                          resources_used.walltime=00:10:15

there are:
udate for the final state (04/23/2008 11:50:43):
exitcode Exit_status=

*/

/*
 Filled entries:
 batch_id (a list of jobid is given, one for each tracejob call)
 status (always a final state 3 or 4)
 exitcode
 udate
 
 Filled by submit script:
 blah_id 
 
 Unfilled entries:
 exitreason
*/
/*
[root@cream-12 server_logs]# tracejob -m -l -a 13

Job: 13.cream-12.pd.infn.it

04/23/2008 11:40:27  S    enqueuing into cream_1, state 1 hop 1
04/23/2008 11:40:27  S    Job Queued at request of [email protected], owner = [email protected], job name =
                          cream_365713239, queue = cream_1
04/23/2008 11:40:28  S    Job Modified at request of [email protected]
04/23/2008 11:40:28  S    Job Run at request of [email protected]
04/23/2008 11:50:43  S    Exit_status=0 resources_used.cput=00:00:01 resources_used.mem=11372kb resources_used.vmem=52804kb
                          resources_used.walltime=00:10:15
04/23/2008 11:50:44  S    dequeuing from cream_1, state COMPLETE
*/

        FILE *fp;
	char *line=NULL;
	char **token;
	char **jobid;
	int maxtok_t=0,maxtok_j=0,k;
	job_registry_entry en;
	int ret;
	char *timestamp;
	time_t tmstampepoch;
	char *exit_str=NULL;
	int failed_count=0;
	int time_to_add=0;
	time_t now;
	char *cp=NULL;
	char *command_string=NULL;
	char *pbs_spool=NULL;
	char *string_now=NULL;
	int tracejob_line_counter=0;

	do_log(debuglogfile, debug, 3, "%s: input_string in FinalStateQuery is:%s\n",argv0,input_string);
	
	maxtok_j = strtoken(input_string, ':', &jobid);
	
	for(k=0;k<maxtok_j;k++){
	
		if(jobid[k] && strlen(jobid[k])==0) continue;

		pbs_spool=(pbs_spoolpath?make_message("-p %s ",pbs_spoolpath):make_message(""));
		command_string=make_message("%s%s/tracejob %s-m -l -a -n %d %s",batch_command,pbs_binpath,pbs_spool,logs_to_read,jobid[k]);
		free(pbs_spool);
		
		fp = popen(command_string,"r");
		
		do_log(debuglogfile, debug, 3, "%s: command_string in FinalStateQuery is:%s\n",argv0,command_string);

		/* en.status is set =0 (UNDEFINED) here and it is tested if it is !=0 before the registry update: the update is done only if en.status is !=0*/
		en.status=UNDEFINED;
		
		JOB_REGISTRY_ASSIGN_ENTRY(en.batch_id,jobid[k]);

		tracejob_line_counter=0;
		
		if(fp!=NULL){
			while(!feof(fp) && (line=get_line(fp))){
				if(line && strlen(line)==0){
					free(line);
					continue;
				}
				if(tracejob_line_counter>tracejob_max_output){
					do_log(debuglogfile, debug, 2, "%s: Tracejob output limit of %d lines reached. Skipping command.\n",argv0,tracejob_max_output);
					free(line);
					break;
				}
				if ((cp = strrchr (line, '\n')) != NULL){
					*cp = '\0';
					tracejob_line_counter++;
					
				}
                        	do_log(debuglogfile, debug, 3, "%s: line in FinalStateQuery is:%s\n",argv0,line);
				now=time(0);
				string_now=make_message("%d",now);
				if(line && (strstr(line,"Job deleted") || (strstr(line,"dequeuing from") && strstr(line,"state RUNNING")))){	
					maxtok_t = strtoken(line, ' ', &token);
					timestamp=make_message("%s %s",token[0],token[1]);
					tmstampepoch=str2epoch(timestamp,"A");
					free(timestamp);
					freetoken(&token,maxtok_t);
					en.udate=tmstampepoch;
					en.status=REMOVED;
                        		en.exitcode=-999;
					JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now);
					JOB_REGISTRY_ASSIGN_ENTRY(en.exitreason,"\0");
				}else if(line && strstr(line," Exit_status=") && en.status != REMOVED){	
					maxtok_t = strtoken(line, ' ', &token);
					timestamp=make_message("%s %s",token[0],token[1]);
					tmstampepoch=str2epoch(timestamp,"A");
					exit_str=strdup(token[3]);
                			if(exit_str == NULL){
                        			sysfatal("strdup failed for exit_str in FinalStateQuery: %r");
                			}
					free(timestamp);
					freetoken(&token,maxtok_t);
					if(strstr(exit_str,"Exit_status=")){
						maxtok_t = strtoken(exit_str, '=', &token);
						if(maxtok_t == 2){
                        				en.exitcode=atoi(token[1]);
							freetoken(&token,maxtok_t);
						}else{
							en.exitcode=-1;
						}
					}else{
						en.exitcode=-1;
					}
					free(exit_str);
					en.udate=tmstampepoch;
					en.status=COMPLETED;
					JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now);
					JOB_REGISTRY_ASSIGN_ENTRY(en.exitreason,"\0");
				}
				free(string_now);
				free(line);
			}
			pclose(fp);
		}
		
		if(en.status !=UNDEFINED && en.status!=IDLE){
			if ((ret=job_registry_update_select(rha, &en,
			JOB_REGISTRY_UPDATE_UDATE |
			JOB_REGISTRY_UPDATE_STATUS |
			JOB_REGISTRY_UPDATE_UPDATER_INFO |
			JOB_REGISTRY_UPDATE_EXITCODE |
			JOB_REGISTRY_UPDATE_EXITREASON )) < 0){
				if(ret != JOB_REGISTRY_NOT_FOUND){
					fprintf(stderr,"Update of record returns %d: ",ret);
					perror("");
				}
			} else {
				do_log(debuglogfile, debug, 2, "%s: registry update in FinalStateQuery for: jobid=%s exitcode=%d status=%d\n",argv0,en.batch_id,en.exitcode,en.status);
				if (en.status == REMOVED || en.status == COMPLETED){
					job_registry_unlink_proxy(rha, &en);
				}
				if (remupd_conf != NULL){
					if (ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL)<=0){
						do_log(debuglogfile, debug, 2, "%s: Error creating endpoint in FinalStateQuery\n",argv0);
					}
				}
			}
		}else{
			failed_count++;
		}		
		free(command_string);
	}
	
	now=time(0);
	if(failed_count>10){
		failed_count=10;
	}
	time_to_add=pow(failed_count,1.5);
	next_finalstatequery=now+time_to_add;
	do_log(debuglogfile, debug, 3, "%s: next FinalStatequery will be in %d seconds\n",argv0,time_to_add);
	
	freetoken(&jobid,maxtok_j);
	return failed_count;
}
示例#7
0
int
IntStateQuery()
{
/*
qstat -f

Job Id: 11.cream-12.pd.infn.it
    Job_Name = cream_579184706
    job_state = R
    ctime = Wed Apr 23 11:39:55 2008
    exec_host = cream-wn-029.pn.pd.infn.it/0
*/

/*
 Filled entries:
 batch_id
 wn_addr
 status
 udate
 
 Filled by submit script:
 blah_id 
 
 Unfilled entries:
 exitreason
*/


        FILE *fp;
	char *line=NULL;
	char **token;
	int maxtok_t=0;
	job_registry_entry en;
	int ret;
	char *timestamp;
	time_t tmstampepoch;
	char *batch_str=NULL;
	char *wn_str=NULL; 
        char *twn_str=NULL;
        char *status_str=NULL;
	char *ex_str=NULL;
	int  ex_code=0; 
	char *cp=NULL;
	char *command_string=NULL;
	job_registry_entry *ren=NULL;
	int first=TRUE;
	time_t now;
	char *string_now=NULL;

	command_string=make_message("%s%s/qstat -f",batch_command,pbs_binpath);
	fp = popen(command_string,"r");

	en.status=UNDEFINED;
	JOB_REGISTRY_ASSIGN_ENTRY(en.wn_addr,"\0");
	JOB_REGISTRY_ASSIGN_ENTRY(en.exitreason,"\0");
	JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,"\0");
	en.exitcode=-1;
	bupdater_free_active_jobs(&bact);

	if(fp!=NULL){
		while(!feof(fp) && (line=get_line(fp))){
			if(line && strlen(line)==0){
				free(line);
				continue;
			}
			if ((cp = strrchr (line, '\n')) != NULL){
				*cp = '\0';
			}
			do_log(debuglogfile, debug, 3, "%s: line in IntStateQuery is:%s\n",argv0,line);
			now=time(0);
			string_now=make_message("%d",now);
			if(line && strstr(line,"Job Id: ")){
				if(!first && en.status!=UNDEFINED && ren && ren->status!=REMOVED && ren->status!=COMPLETED){
                        		if ((ret=job_registry_update_recn_select(rha, &en, ren->recnum,
					JOB_REGISTRY_UPDATE_WN_ADDR|
					JOB_REGISTRY_UPDATE_STATUS|
					JOB_REGISTRY_UPDATE_UDATE|
					JOB_REGISTRY_UPDATE_UPDATER_INFO|
					JOB_REGISTRY_UPDATE_EXITCODE|
					JOB_REGISTRY_UPDATE_EXITREASON)) < 0){
						if(ret != JOB_REGISTRY_NOT_FOUND){
                	                		fprintf(stderr,"Update of record returns %d: ",ret);
							perror("");
						}
					} else {
						if(ret==JOB_REGISTRY_SUCCESS){
							if (en.status == REMOVED || en.status == COMPLETED) {
								do_log(debuglogfile, debug, 2, "%s: registry update in IntStateQuery for: jobid=%s wn=%s status=%d exitcode=%d\n",argv0,en.batch_id,en.wn_addr,en.status,en.exitcode);
								job_registry_unlink_proxy(rha, &en);
							}else{
								do_log(debuglogfile, debug, 2, "%s: registry update in IntStateQuery for: jobid=%s wn=%s status=%d\n",argv0,en.batch_id,en.wn_addr,en.status);
							}
						}
						if (remupd_conf != NULL){
							if (ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL)<=0){
								do_log(debuglogfile, debug, 2, "%s: Error creating endpoint in IntStateQuery\n",argv0);
							}
						}
					}
					en.status = UNDEFINED;
					JOB_REGISTRY_ASSIGN_ENTRY(en.wn_addr,"\0");
					JOB_REGISTRY_ASSIGN_ENTRY(en.exitreason,"\0");
					JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,"\0");
					en.exitcode=-1;
				}				
                        	maxtok_t = strtoken(line, ':', &token);
				batch_str=strdel(token[1]," ");
				JOB_REGISTRY_ASSIGN_ENTRY(en.batch_id,batch_str);
				JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now);
				en.exitcode=-1;
				bupdater_push_active_job(&bact, en.batch_id);
				free(batch_str);
				freetoken(&token,maxtok_t);
				if(!first) free(ren);
				if ((ren=job_registry_get(rha, en.batch_id)) == NULL){
						fprintf(stderr,"Get of record returns error for %s ",en.batch_id);
						perror("");
				}
				first=FALSE;				
			}else if(line && strstr(line,"job_state = ")){	
				maxtok_t = strtoken(line, '=', &token);
				status_str=strdel(token[1]," ");
				if(status_str && strcmp(status_str,"Q")==0){ 
					en.status=IDLE;
					en.exitcode=-1;
					JOB_REGISTRY_ASSIGN_ENTRY(en.wn_addr,"\0");
				}else if(status_str && strcmp(status_str,"W")==0){ 
					en.status=IDLE;
					en.exitcode=-1;
				}else if(status_str && strcmp(status_str,"R")==0){ 
					en.status=RUNNING;
					en.exitcode=-1;
				}else if(status_str && strcmp(status_str,"C")==0){ 
					en.status=COMPLETED;
					JOB_REGISTRY_ASSIGN_ENTRY(en.exitreason,"\0");
				}else if(status_str && strcmp(status_str,"H")==0){ 
					en.status=HELD;
					en.exitcode=-1;
					JOB_REGISTRY_ASSIGN_ENTRY(en.wn_addr,"\0");
				}
				free(status_str);
				freetoken(&token,maxtok_t);
			}else if(line && strstr(line,"unable to run job")){
				en.status=IDLE;	
				en.exitcode=-1;
			}else if(line && strstr(line,"exit_status = ")){
				maxtok_t = strtoken(line, '=', &token);
				ex_str=strdel(token[1]," ");
				ex_code=atoi(ex_str);
				if(ex_code==0){
					en.exitcode=0;
				}else if(ex_code==271){
					en.status=REMOVED;
                        		en.exitcode=-999;
				}else{
					en.exitcode=ex_code;
				}
				free(ex_str);
				freetoken(&token,maxtok_t);
			}else if(line && strstr(line,"exec_host = ")){	
				maxtok_t = strtoken(line, '=', &token);
				twn_str=strdup(token[1]);
                		if(twn_str == NULL){
                        		sysfatal("strdup failed for twn_str in IntStateQuery: %r");
                		}
				freetoken(&token,maxtok_t);
				maxtok_t = strtoken(twn_str, '/', &token);
				wn_str=strdel(token[0]," ");
				JOB_REGISTRY_ASSIGN_ENTRY(en.wn_addr,wn_str);
				free(twn_str);
 				free(wn_str);
				freetoken(&token,maxtok_t);
			}else if(line && strstr(line,"mtime = ")){	
                        	maxtok_t = strtoken(line, ' ', &token);
				timestamp=make_message("%s %s %s %s %s",token[2],token[3],token[4],token[5],token[6]);
                        	tmstampepoch=str2epoch(timestamp,"L");
				free(timestamp);
				en.udate=tmstampepoch;
				freetoken(&token,maxtok_t);
			}
			free(line);
			free(string_now);

		}
		pclose(fp);
	}
	
	if(en.status!=UNDEFINED && ren && ren->status!=REMOVED && ren->status!=COMPLETED){
		if ((ret=job_registry_update_recn_select(rha, &en, ren->recnum,
		JOB_REGISTRY_UPDATE_WN_ADDR|
		JOB_REGISTRY_UPDATE_STATUS|
		JOB_REGISTRY_UPDATE_UDATE|
		JOB_REGISTRY_UPDATE_UPDATER_INFO|
		JOB_REGISTRY_UPDATE_EXITCODE|
		JOB_REGISTRY_UPDATE_EXITREASON)) < 0){
			if(ret != JOB_REGISTRY_NOT_FOUND){
				fprintf(stderr,"Update of record returns %d: ",ret);
				perror("");
			}
		} else {
			if(ret==JOB_REGISTRY_SUCCESS){
				if (en.status == REMOVED || en.status == COMPLETED) {
					do_log(debuglogfile, debug, 2, "%s: registry update in IntStateQuery for: jobid=%s wn=%s status=%d exitcode=%d\n",argv0,en.batch_id,en.wn_addr,en.status,en.exitcode);
					job_registry_unlink_proxy(rha, &en);
				}else{
					do_log(debuglogfile, debug, 2, "%s: registry update in IntStateQuery for: jobid=%s wn=%s status=%d\n",argv0,en.batch_id,en.wn_addr,en.status);
				}
			}
			if (remupd_conf != NULL){
				if (ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL)<=0){
					do_log(debuglogfile, debug, 2, "%s: Error creating endpoint in IntStateQuery\n",argv0);
				}
			}
		}
	}				

	free(ren);
	free(command_string);
	return 0;
}
示例#8
0
int
PollDB()
{
        FILE *fd;
        job_registry_entry *en;
	job_registry_handle *rha;
	job_registry_handle *rhc;
	char *buffer=NULL;
        char *cdate=NULL;
	time_t now;
        int  maxtok,i,maxtokl,j;
        char **tbuf;
        char **lbuf;
	int len=0,flen=0;
        struct stat sbuf;
        int rc;
	char *regfile;
        char *cp=NULL;
	int to_sleep=FALSE;
	int skip_reg_open=FALSE;
	int ret;

	rha=job_registry_init(registry_file, BY_BATCH_ID);
	if (rha == NULL){
		do_log(debuglogfile, debug, 1, "%s: Error initialising job registry %s\n",argv0,registry_file);
		fprintf(stderr,"%s: Error initialising job registry %s :",argv0,registry_file);
		perror("");
	}
	
	for(;;){
	
		now=time(NULL);
	
		to_sleep=TRUE;
		/* cycle over connections: sleep if startnotify, startnotifyjob and sentendonce are not set.
		   If startnotifyjob is set the conn is served.
		*/ 
		for(i=0; i<MAX_CONNECTIONS; i++){
		
			if(!connections[i].startnotify && !connections[i].startnotifyjob && !(connections[i].firstnotify && connections[i].sentendonce)) continue;
			if(connections[i].startnotify) to_sleep=FALSE;
			
			if(connections[i].startnotifyjob){
				to_sleep=FALSE;
				rhc=job_registry_init(registry_file, BY_USER_PREFIX);
				if (rhc == NULL){
					do_log(debuglogfile, debug, 1, "%s: Error initialising job registry %s\n",argv0,registry_file);
					fprintf(stderr,"%s: Error initialising job registry %s :",argv0,registry_file);
		 	   	  	perror("");
		 	   	}
		 	   	do_log(debuglogfile, debug, 2, "%s:Job list for notification:%s\n",argv0,connections[i].joblist_string);
		 	   	maxtok=strtoken(connections[i].joblist_string,',',&tbuf);
   		 	   	for(j=0;j<maxtok;j++){
        	 	   	  	if ((en=job_registry_get(rhc, tbuf[j])) != NULL){
						buffer=ComposeClassad(en);
		 	   	  	}else{
						if(remupd_conf == NULL){
		 	   	  			cdate=iepoch2str(now);
		 	   	  			maxtokl=strtoken(tbuf[j],'_',&lbuf);
		 	   	  			if(lbuf[1]){
		 	   	  				if ((cp = strrchr (lbuf[1], '\n')) != NULL){
		 	   	  					*cp = '\0';
		 	   	  				}
		 	   	  				if ((cp = strrchr (lbuf[1], '\r')) != NULL){
		 	   	  					*cp = '\0';
		 	   	  				}
		 	   	  				buffer=make_message("[BlahJobName=\"%s\"; ClientJobId=\"%s\"; JobStatus=4; JwExitCode=999; ExitReason=\"BUpdater is not able to find the job anymore\"; Reason=\"BUpdater is not able to find the job anymore\"; ChangeTime=\"%s\"; ]\n",tbuf[j],lbuf[1],cdate);
		 	   	  			}
		 	   	  			freetoken(&lbuf,maxtokl);
		 	   	  			free(cdate);
						}else{
		 	   	  			maxtokl=strtoken(tbuf[j],':',&lbuf);
							JOB_REGISTRY_ASSIGN_ENTRY(en->batch_id,lbuf[0]);
							JOB_REGISTRY_ASSIGN_ENTRY(en->blah_id,lbuf[1]);
		 	   	  			freetoken(&lbuf,maxtokl);
							en->status = 0;
							if ((ret=job_registry_append(rhc, en))<0){
								if(ret != JOB_REGISTRY_NOT_FOUND){
									fprintf(stderr,"Update of record returns %d: ",ret);
									perror("");
								}
							}else{
								if(ret==JOB_REGISTRY_SUCCESS){
									do_log(debuglogfile, debug, 2, "%s: registry append in PollDB for: jobid=%s blahjobid=%s\n",argv0,en->batch_id,en->blah_id);
								}
							}
						}
		 	   	  	}
		 	   	  	free(en);
		 	   	  	len=strlen(buffer);
		 	   	  	if(connections[i].finalbuffer != NULL){
		 	   	  		flen=strlen(connections[i].finalbuffer);
		 	   	  	}else{
		 	   	  		flen=0;
		 	   	  	}
		 	   	  	connections[i].finalbuffer = realloc(connections[i].finalbuffer,flen+len+2);
		 	   	  	if (connections[i].finalbuffer == NULL){
		 	   	  		sysfatal("can't realloc finalbuffer in PollDB: %r");
		 	   	  	}
		 	   	  	if(flen==0){
		 	   	  		connections[i].finalbuffer[0]='\000';
					}
		 	   	  	strcat(connections[i].finalbuffer,buffer);
		 	   	  	free(buffer);
		 	   	}
		 	   	freetoken(&tbuf,maxtok);
		 	   
		 	   	if(connections[i].finalbuffer != NULL){
		 	   	  	if(NotifyCream(connections[i].finalbuffer,&connections[i])!=-1){
	         	   	  		/* change last notification time */
		 	   	  		connections[i].lastnotiftime=now;
		 	   	  		connections[i].startnotifyjob=FALSE;
		 	   	  	}
		 	   	  	free(connections[i].finalbuffer);
		 	   	  	connections[i].finalbuffer=NULL;
		 	   	}
		 	   	job_registry_destroy(rhc);
			}
			if(connections[i].firstnotify && connections[i].sentendonce){
				to_sleep=FALSE;
				if(NotifyCream("NTFDATE/END\n",&connections[i])!=-1){
					connections[i].startnotify=TRUE;
					connections[i].sentendonce=FALSE;
		 	   	  	connections[i].firstnotify=FALSE;
		 	   	  	connections[i].startnotifyjob=FALSE;
				}
			}
			
		}
		
		if(to_sleep){
			sleep(loop_interval);
			continue;
		}

                regfile=make_message("%s/registry",registry_file);
        	rc=stat(regfile,&sbuf);
		free(regfile);
		
		skip_reg_open=TRUE;
		for(i=0; i<MAX_CONNECTIONS; i++){
			if(sbuf.st_mtime>=connections[i].lastnotiftime){
				skip_reg_open=FALSE;
				break;
			}
		}
		if(skip_reg_open){
			do_log(debuglogfile, debug, 3, "Skip registry opening: mtime:%d lastn:%d\n",sbuf.st_mtime,connections[i].lastnotiftime);
			sleep(loop_interval);
			continue;
		}
		
		do_log(debuglogfile, debug, 3, "Normal registry opening\n");

		fd = job_registry_open(rha, "r");
		if (fd == NULL)
		{
			do_log(debuglogfile, debug, 1, "%s: Error opening job registry %s\n",argv0,registry_file);
			fprintf(stderr,"%s: Error opening job registry %s :",argv0,registry_file);
			perror("");
			sleep(loop_interval);
			continue;
		}
		if (job_registry_rdlock(rha, fd) < 0)
		{
			do_log(debuglogfile, debug, 1, "%s: Error read locking registry %s\n",argv0,registry_file);
			fprintf(stderr,"%s: Error read locking registry %s :",argv0,registry_file);
			perror("");
			sleep(loop_interval);
			continue;
		}
		while ((en = job_registry_get_next(rha, fd)) != NULL)
		{
		
			for(i=0; i<MAX_CONNECTIONS; i++){
				if(connections[i].creamfilter==NULL) continue;
				if(en->mdate >= connections[i].lastnotiftime && en->mdate < now && en->user_prefix && strstr(en->user_prefix,connections[i].creamfilter)!=NULL && strlen(en->updater_info)>0)
				{
					buffer=ComposeClassad(en);
					len=strlen(buffer);
					if(connections[i].finalbuffer != NULL){
						flen=strlen(connections[i].finalbuffer);
					}else{
						flen=0;
					}
					connections[i].finalbuffer = realloc(connections[i].finalbuffer,flen+len+2);
					if (connections[i].finalbuffer == NULL){
						sysfatal("can't realloc finalbuffer in PollDB: %r");
					}
					if(flen==0){
						connections[i].finalbuffer[0]='\000';
					}
					strcat(connections[i].finalbuffer,buffer);
					free(buffer);
				}
			}
			free(en);
		}

		for(i=0; i<MAX_CONNECTIONS; i++){
			if(connections[i].finalbuffer != NULL){
				if(NotifyCream(connections[i].finalbuffer,&connections[i])!=-1){
	        			/* change last notification time */
					connections[i].lastnotiftime=now;
				}
				free(connections[i].finalbuffer);
				connections[i].finalbuffer=NULL;
			}
		}
		
		fclose(fd);
		
		sleep(loop_interval);
	}
                
	job_registry_destroy(rha);
	
	return 0;
}
示例#9
0
int FinalStateQuery(char *query,char *queryStates, char *query_err){

    char line[STR_CHARS],fail[6],qExit[10],qFailed[10],qHostname[100],qStatus[2],command_string[100];
    char **saveptr1,**saveptr2,**list_query,**list_queryStates;
    FILE *file_output;
    int numQuery=0,numQueryStates=0,j=0,l=0,cont=0,cont2=0, nq=0;
    time_t now;
    char string_now[11];
    job_registry_entry en;
    int iret;
    
    numQuery=strtoken(query,' ',&list_query);
    nq=numQuery;
    numQueryStates=strtoken(queryStates,' ',&list_queryStates);
    if (numQuery!=numQueryStates) return 1;
    
    sprintf(command_string,"%s/qstat -u '*'",sge_binpath);
    if (debug) do_log(debuglogfile, debug, 1, "+-+line 433, command_string:%s\n",command_string);
    
    //load in qstatJob list of jobids from qstat command exec
    file_output = popen(command_string,"r");
    if (file_output == NULL) return 0;
    while (fgets(line,sizeof(line), file_output) != NULL){
	cont=strtoken(line, ' ', &saveptr1);
	if ((strcmp(saveptr1[0],"job-ID")!=0)&&(strncmp(saveptr1[0],"-",1)!=0)){
	    for (l=0;l<nq;l++){
		if (strcmp(list_query[l],saveptr1[0])==0){
		    if (strcmp(list_queryStates[l],saveptr1[4])!=0){
			now=time(0);
			sprintf(string_now,"%d",now);
			if (strcmp(saveptr1[4],"u")==0){
			    JOB_REGISTRY_ASSIGN_ENTRY(en.batch_id,list_query[l]);
			    en.status=0;
			    en.exitcode=0;
			    JOB_REGISTRY_ASSIGN_ENTRY(en.wn_addr,"");
			    JOB_REGISTRY_ASSIGN_ENTRY(en.exitreason,"0");
			    JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now)
			    en.udate=now;
			    if ((iret=job_registry_update(rha, &en)) < 0){
				fprintf(stderr,"Update of record returns %d: \nJobId: %d", iret,en.batch_id);
				perror("");
			    }
			}
			if (strcmp(saveptr1[4],"q")==0){
			    JOB_REGISTRY_ASSIGN_ENTRY(en.batch_id,list_query[l]);
			    en.status=1;
			    en.exitcode=0;
			    JOB_REGISTRY_ASSIGN_ENTRY(en.wn_addr,"");
			    JOB_REGISTRY_ASSIGN_ENTRY(en.exitreason,"0");
			    JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now)
			    en.udate=now;
			    if ((iret=job_registry_update(rha, &en)) < 0){
				fprintf(stderr,"Update of record returns %d: \nJobId: %d", iret,en.batch_id);
				perror("");
			    }
			}
			if (strcmp(saveptr1[4],"r")==0){
			    cont2=strtoken(saveptr1[7], '@', &saveptr2);
			    JOB_REGISTRY_ASSIGN_ENTRY(en.batch_id,list_query[l]);
			    en.status=2;
			    en.exitcode=0;
			    JOB_REGISTRY_ASSIGN_ENTRY(en.wn_addr,saveptr2[1]);
			    JOB_REGISTRY_ASSIGN_ENTRY(en.exitreason,"0");
			    JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now)
			    en.udate=now;
			    if ((iret=job_registry_update(rha, &en)) < 0){
				fprintf(stderr,"Update of record returns %d: \nJobId: %d", iret,en.batch_id);
				perror("");
			    }
			    freetoken(&saveptr2,cont2);
			}
			if ((strcmp(saveptr1[4],"hr")==0)||strcmp(saveptr1[4],"hqw")==0){
			    JOB_REGISTRY_ASSIGN_ENTRY(en.batch_id,list_query[l]);
			    en.status=5;
			    en.exitcode=0;
			    JOB_REGISTRY_ASSIGN_ENTRY(en.wn_addr,"");
			    JOB_REGISTRY_ASSIGN_ENTRY(en.exitreason,"0");
			    JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now)
			    en.udate=now;
			    if ((iret=job_registry_update(rha, &en)) < 0){
				fprintf(stderr,"Update of record returns %d: \nJobId: %d", iret,en.batch_id);
				perror("");
			    }
			}
		    }
		    //i must put out element from query
		    for (j=l;j<nq;j++)
			if (list_query[j+1]!=NULL) strcpy(list_query[j],list_query[j+1]);
		    for (j=l;j<nq;j++)
			if (list_queryStates[j+1]!=NULL) strcpy(list_queryStates[j],list_queryStates[j+1]);
		    nq--;
		    break;
		}
	    }
	}
	line[0]='\0';
	freetoken(&saveptr1,cont);
    }
    pclose( file_output );
    sprintf(query_err,"\0");
    //now we have check in list_query only states that not change status 
    //because they're not in qstat result
    for (l=0; l<nq; l++){
	sprintf(command_string,"%s/qacct -j '%s'",sge_binpath,list_query[l]);
	if (debug) do_log(debuglogfile, debug, 1, "+-+line 520,command_string:%s\n",command_string);
	file_output = popen(command_string,"r");
	if (file_output == NULL) return 1;
	//if a job number is here means that job was in query previously and
	//if now it's not in query and not finished (NULL qstat) it was deleted 
	//or it's on transition time
	if (fgets( line,sizeof(line), file_output )==NULL){
	    strcat(query_err,list_query[l]);
	    strcat(query_err," ");
	    pclose( file_output );
	    continue;
	}

	//there is no problem to lost first line with previous fgets, because 
	//it's only a line of =============================================
	while (fgets( line,sizeof(line), file_output )!=NULL){
	    cont=strtoken(line, ' ', &saveptr1);
	    if (strcmp(saveptr1[0],"hostname")==0) strcpy(qHostname,saveptr1[1]);;
	    if (strcmp(saveptr1[0],"failed")==0) strcpy(qFailed,saveptr1[1]);
	    if (strcmp(saveptr1[0],"exit_status")==0) strcpy(qExit,saveptr1[1]);
	    freetoken(&saveptr1,cont);
	}
	pclose( file_output );
	now=time(0);
	sprintf(string_now,"%d",now);
	if ((strcmp(qExit,"137")==0)||(strcmp(qExit,"143")==0)){
	    JOB_REGISTRY_ASSIGN_ENTRY(en.batch_id,list_query[l]);
	    en.status=3;
	    en.exitcode=atoi(qExit);
	    JOB_REGISTRY_ASSIGN_ENTRY(en.wn_addr,qHostname);
	    JOB_REGISTRY_ASSIGN_ENTRY(en.exitreason,"");
	    JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now)
	    en.udate=now;
	    if ((iret=job_registry_update(rha, &en)) < 0){
		fprintf(stderr,"Update of record returns %d: \nJobId: %d", iret,en.batch_id);
		perror("");
	    }else job_registry_unlink_proxy(rha, &en);
	}else{
	    JOB_REGISTRY_ASSIGN_ENTRY(en.batch_id,list_query[l]);
	    en.status=4;
	    en.exitcode=atoi(qExit);
	    JOB_REGISTRY_ASSIGN_ENTRY(en.wn_addr,qHostname);
	    JOB_REGISTRY_ASSIGN_ENTRY(en.exitreason,qFailed);
	    JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now)
	    en.udate=now;
	    if ((iret=job_registry_update(rha, &en)) < 0){
		fprintf(stderr,"Update of record returns %d: \nJobId: %d", iret,en.batch_id);
		perror("");
	    }else job_registry_unlink_proxy(rha, &en);
	}
    }
    freetoken(&list_query,numQuery);
    freetoken(&list_queryStates,numQueryStates);
    if (debug) do_log(debuglogfile, debug, 1, "+-+query_err:%s\n",query_err);
    //now check acumulated error jobids to verify if they are an error or not
    if (strcmp(query_err,"\0")!=0){
	sleep(60);
	cont=0;
	int n=0;
	char cmd[10]="\0";
	
	cont=strtoken(query_err, ' ', &list_query);
	
	while (n < cont){
	    if(list_query[n]) strcpy(cmd,list_query[n]);
	    else return 1;
	    sprintf(command_string,"%s/qacct -j '%s'",sge_binpath,cmd);
	    if (debug) do_log(debuglogfile, debug, 1, "+-+line 587 error, command_string:%s\n",command_string);
	    file_output = popen(command_string,"r");
	    if (file_output == NULL) return 1;

	    //if a job number is here means that job was in query previously and
	    //if now it's not in query and not finished (NULL qstat) it was deleted 
	    if (fgets( line,sizeof(line), file_output )==NULL){
		JOB_REGISTRY_ASSIGN_ENTRY(en.batch_id,cmd);
		en.status=3;
		en.exitcode=3;
		JOB_REGISTRY_ASSIGN_ENTRY(en.wn_addr,"");
		JOB_REGISTRY_ASSIGN_ENTRY(en.exitreason,"reason=3");
		now=time(0);
		sprintf(string_now,"%d",now);
		JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now)
		en.udate=now;
		if ((iret=job_registry_update(rha, &en)) < 0){
		    fprintf(stderr,"Update of record returns %d: \nJobId: %d", iret,en.batch_id);
		    perror("");
		}else job_registry_unlink_proxy(rha, &en);
		pclose( file_output );
		n++;
		continue;
	    }
	    //there is no problem to lost first line with previous fgets, because 
	    //it's only a line of =============================================
	    while (fgets( line,sizeof(line), file_output )!=NULL){
		cont=strtoken(line, ' ', &saveptr1);
		if (strcmp(saveptr1[0],"hostname")==0) strcpy(qHostname,saveptr1[1]);
		if (strcmp(saveptr1[0],"failed")==0) strcpy(qFailed,saveptr1[1]);
		if (strcmp(saveptr1[0],"exit_status")==0) strcpy(qExit,saveptr1[1]);
		freetoken(&saveptr1,cont);
	    }
	    pclose( file_output );
	    now=time(0);
	    sprintf(string_now,"%d",now);
	    if ((strcmp(qExit,"137")==0)||(strcmp(qExit,"143")==0)){
		JOB_REGISTRY_ASSIGN_ENTRY(en.batch_id,cmd);
		en.status=3;
		en.exitcode=atoi(qExit);
		JOB_REGISTRY_ASSIGN_ENTRY(en.wn_addr,qHostname);
		JOB_REGISTRY_ASSIGN_ENTRY(en.exitreason,"");
		JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now)
		en.udate=now;
		if ((iret=job_registry_update(rha, &en)) < 0){
		    fprintf(stderr,"Update of record returns %d: \nJobId: %d", iret,en.batch_id);
		    perror("");
		}else job_registry_unlink_proxy(rha, &en);
	    }else{
		JOB_REGISTRY_ASSIGN_ENTRY(en.batch_id,cmd);
		en.status=4;
		en.exitcode=atoi(qExit);
		JOB_REGISTRY_ASSIGN_ENTRY(en.wn_addr,qHostname);
		JOB_REGISTRY_ASSIGN_ENTRY(en.exitreason,qFailed);
		JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now)
		en.udate=now;
		if ((iret=job_registry_update(rha, &en)) < 0){
		    fprintf(stderr,"Update of record returns %d: \nJobId: %d", iret,en.batch_id);
		    perror("");
		}else job_registry_unlink_proxy(rha, &en);
	    }
	    n++;
	}
	freetoken(&list_query,cont);
    }
    return 0;
}
示例#10
0
int main(int argc, char *argv[]){
    
    FILE *fd;
    job_registry_entry *en;
    time_t now;
    time_t purge_time=0;
    char constraint[JOBID_MAX_LEN+1];
    char constraint2[5];
    char *query=NULL;
    char *queryStates=NULL;
    char *query_err=NULL;

    char *pidfile=NULL;
    char string_now[11];
    char *tpath;
    
    int version=0;
    int tmptim;
    int finstr_len=0;
    int loop_interval=DEFAULT_LOOP_INTERVAL;
    
    int fsq_ret=0;
    
    int c;
    
    int confirm_time=0;
    
    static int help;
    static int short_help;
    
    while (1) {
	static struct option long_options[] =
	{
	    {"help",      no_argument,     &help,       1},
	    {"usage",     no_argument,     &short_help, 1},
	    {"nodaemon",  no_argument,       0, 'o'},
	    {"version",   no_argument,       0, 'v'},
	    {0, 0, 0, 0}
	};
	
	int option_index = 0;
	
	c = getopt_long (argc, argv, "vo",long_options, &option_index);
	
	if (c == -1){
	    break;
	}
	
	switch (c)
	{
	    
	    case 0:
		if (long_options[option_index].flag != 0){
		    break;
		}
		
	    case 'v':
		version=1;
		break;
		
	    case 'o':
		nodmn=1;
		break;
		
	    case '?':
		break;
		
	    default:
		abort ();
	}
    }
    
    //check if another instance is running 
    char **ptr;
    char out[3];
    fgets(out, sizeof(out),popen("ps -d | grep -c BUpdaterSGE","r"));
    strtoken(out,'\n',&ptr);
    if (strcmp(ptr[0],"1")!=0){
	fprintf(stderr,"There is another instance of BUpdaterSGE running.\nExiting ...\n");
	return -1;
    }
    freetoken(&ptr,1);

    if(help){
	usage();
    }
    
    if(short_help){
	short_usage();
    }
    
    argv0 = argv[0];
    
    signal(SIGHUP,sighup);
    
    if(version) {
	printf("%s Version: %s\n",progname,VERSION);
	exit(EXIT_SUCCESS);
    }  
    
    /* Checking configuration */
    check_config_file("UPDATER"); 
    
    cha = config_read(NULL);
    if (cha == NULL)
    {
	fprintf(stderr,"Error reading config: ");
	perror("");
	return -1;
    }
    config_setenv(NULL);
    
    ret = config_get("bupdater_child_poll_timeout",cha);
    if (ret != NULL){
	tmptim=atoi(ret->value);
	if (tmptim > 0) bfunctions_poll_timeout = tmptim*1000;
    }
    
    ret = config_get("bupdater_debug_level",cha);
    if (ret != NULL){
	debug=atoi(ret->value);
    }
    
    ret = config_get("bupdater_debug_logfile",cha);
    if (ret != NULL){
	debuglogname=strdup(ret->value);
	if(debuglogname == NULL){
	    sysfatal("strdup failed for debuglogname in main: %r");
	}
    }
    if(debug <=0){
	debug=0;
    }
    
    if(debuglogname){
	if((debuglogfile = fopen(debuglogname, "a+"))==0){
	    debug = 0;
	}
    }else{
	debug = 0;
    }

    ret = config_get("debug_level",cha);
    if (ret != NULL){
	debug=atoi(ret->value);
    }

    ret = config_get("debug_logfile",cha);
    if (ret != NULL){
	debuglogname=strdup(ret->value);
	if(debuglogname == NULL){
	    sysfatal("strdup failed for debuglogname in main: %r");
	}
    }
    if(debug <=0){
	debug=0;
    }

    if(debuglogname){
	if((debuglogfile = fopen(debuglogname, "a+"))==0){
	    debug = 0;
	}
    }else{
	debug = 0;
    }

    ret = config_get("sge_binpath",cha);
    if (ret == NULL){
	do_log(debuglogfile, debug, 1, "%s: key sge_binpath not found\n",argv0);
    } else {
	sge_binpath=strdup(ret->value);
	if(sge_binpath == NULL){
	    sysfatal("strdup failed for sge_binpath in main: %r");
	}
    }

    ret = config_get("sge_rootpath",cha);
    if (ret == NULL){
	do_log(debuglogfile, debug, 1, "%s: key sge_rootpath not found\n",argv0);
    } else {
	sge_rootpath=strdup(ret->value);
	if(sge_rootpath == NULL){
	    sysfatal("strdup failed for sge_rootpath in main: %r");
	}
	
	tpath=make_message("%s",sge_rootpath);
	if (opendir(tpath)==NULL){
	    do_log(debuglogfile, debug, 1, "%s: dir %s does not exist or is not readable\n",argv0,tpath);
	    sysfatal("dir %s does not exist or is not readable: %r",tpath);
	}
	free(tpath);
    }

    ret = config_get("sge_cellname",cha);
    if (ret == NULL){
	do_log(debuglogfile, debug, 1, "%s: key sge_cellname not found\n",argv0);
    } else {
	sge_cellname=strdup(ret->value);
	if(sge_cellname == NULL){
	    sysfatal("strdup failed for sge_cellname in main: %r");
	}
    }

    ret = config_get("sge_rootpath",cha);
    if (ret == NULL){
	if(debug){
	    fprintf(debuglogfile, "%s: key sge_rootpath not found\n",argv0);
	    fflush(debuglogfile);
	}
    } else {
	sge_rootpath=strdup(ret->value);
	if(sge_rootpath == NULL){
	    sysfatal("strdup failed for sge_rootpath in main: %r");
	}
    }

    ret = config_get("sge_cellname",cha);
    if (ret == NULL){
	if(debug){
	    fprintf(debuglogfile, "%s: key sge_cellname not found\n",argv0);
	    fflush(debuglogfile);
	}
    } else {
	sge_cellname=strdup(ret->value);
	if(sge_cellname == NULL){
	    sysfatal("strdup failed for sge_cellname in main: %r");
	}
    }

    ret = config_get("job_registry",cha);
    if (ret == NULL){
	do_log(debuglogfile, debug, 1, "%s: key job_registry not found\n",argv0);
	sysfatal("job_registry not defined. Exiting");
    } else {
	reg_file=strdup(ret->value);
	if(reg_file == NULL){
	    sysfatal("strdup failed for reg_file in main: %r");
	}
    }

    ret = config_get("purge_interval",cha);
    if (ret == NULL){
	do_log(debuglogfile, debug, 1, "%s: key purge_interval not found using the default:%d\n",argv0,purge_interval);
    } else {
	purge_interval=atoi(ret->value);
    }

    ret = config_get("finalstate_query_interval",cha);
    if (ret == NULL){
	do_log(debuglogfile, debug, 1, "%s: key finalstate_query_interval not found using the default:%d\n",argv0,finalstate_query_interval);
    } else {
	finalstate_query_interval=atoi(ret->value);
    }

    ret = config_get("alldone_interval",cha);
    if (ret == NULL){
	do_log(debuglogfile, debug, 1, "%s: key alldone_interval not found using the default:%d\n",argv0,alldone_interval);
    } else {
	alldone_interval=atoi(ret->value);
    }

    ret = config_get("bupdater_loop_interval",cha);
    if (ret == NULL){
	do_log(debuglogfile, debug, 1, "%s: key bupdater_loop_interval not found using the default:%d\n",argv0,loop_interval);
    } else {
	loop_interval=atoi(ret->value);
    }

    ret = config_get("bupdater_pidfile",cha);
    if (ret == NULL){
	do_log(debuglogfile, debug, 1, "%s: key bupdater_pidfile not found\n",argv0);
    } else {
	pidfile=strdup(ret->value);
	if(pidfile == NULL){
	    sysfatal("strdup failed for pidfile in main: %r");
	}
    }

    ret = config_get("job_registry_use_mmap",cha);
    if (ret == NULL){
        do_log(debuglogfile, debug, 1, "%s: key job_registry_use_mmap not found. Default is NO\n",argv0);
    } else {
        do_log(debuglogfile, debug, 1, "%s: key job_registry_use_mmap is set to %s\n",argv0,ret->value);
    }

    if( !nodmn ) daemonize();

    if( pidfile ){
	writepid(pidfile);
	free(pidfile);
    }

    config_free(cha);
    rha=job_registry_init(reg_file, BY_BATCH_ID);
    if (rha == NULL){
	do_log(debuglogfile, debug, 1, "%s: Error initialising job registry %s\n",argv0,reg_file);
	fprintf(stderr,"%s: Error initialising job registry %s :",argv0,reg_file);
	perror("");
    }
   for(;;){
	/* Purge old entries from registry */
	now=time(0);
	if(now - purge_time > 86400){
	    if(job_registry_purge(reg_file, now-purge_interval,0)<0){
		do_log(debuglogfile, debug, 1, "%s: Error purging job registry %s\n",argv0,reg_file);
		fprintf(stderr,"%s: Error purging job registry %s :",argv0,reg_file);
		perror("");
	    }else{
		purge_time=time(0);
	    }
	}
	
	//IntStateQuery();
	fd = job_registry_open(rha, "r");
	if (fd == NULL)
	{
	    do_log(debuglogfile, debug, 1, "%s: Error opening job registry %s\n",argv0,reg_file);
	    fprintf(stderr,"%s: Error opening job registry %s :",argv0,reg_file);
	    perror("");
	    sleep(loop_interval);
	}
	if (job_registry_rdlock(rha, fd) < 0)
	{
	    do_log(debuglogfile, debug, 1, "%s: Error read locking job registry %s\n",argv0,reg_file);
	    fprintf(stderr,"%s: Error read locking job registry %s :",argv0,reg_file);
	    perror("");
	    sleep(loop_interval);
	}
	job_registry_firstrec(rha,fd);
	fseek(fd,0L,SEEK_SET);

	if((query=calloc(STR_CHARS*2,1)) == 0){
	    sysfatal("can't malloc query %r");
	}
	if((queryStates=calloc(STR_CHARS*2,1)) == 0){
	    sysfatal("can't malloc query %r");
	}
	
	query[0]=' ';
	queryStates[0]=' ';
	while ((en = job_registry_get_next(rha, fd)) != NULL)
	{
	    if(((now - en->mdate) > finalstate_query_interval) && en->status!=3 && en->status!=4)
	    {
		/* create the constraint that will be used in condor_history command in FinalStateQuery*/
		snprintf(constraint, sizeof(constraint), " %s",en->batch_id);
		if (en->status==0) snprintf(constraint2, sizeof(constraint2), " u");
		if (en->status==1) snprintf(constraint2, sizeof(constraint2), " q");
		if (en->status==2) snprintf(constraint2, sizeof(constraint2), " r");
		if (en->status==5) snprintf(constraint2, sizeof(constraint2), " h");
		query=realloc(query,strlen(query)+strlen(constraint)+1);
		queryStates=realloc(queryStates,strlen(queryStates)+strlen(constraint2)+1);
		strcat(query,constraint);
		strcat(queryStates,constraint2);
		runfinal=TRUE;
	    }
	    /* Assign Status=4 and ExitStatus=-1 to all entries that after alldone_interval are still not in a final state(3 or 4) */
	    if((now - en->mdate > alldone_interval) && en->status!=3 && en->status!=4 && !runfinal)
	    {
		time_t now;
		now=time(0);
		snprintf(string_now,sizeof(string_now),"%d",now);
		AssignState(en->batch_id,"4" ,"-1","\0","\0",string_now);
	    }
	   free(en);
	}
	if(runfinal){
	    if((query_err=calloc((int)strlen(query),1)) == 0)
		sysfatal("can't malloc query_err %r");
	    FinalStateQuery(query,queryStates,query_err);
	    free(query_err);
	}
	free(query);
	free(queryStates);
	fclose(fd);
	if (runfinal){
	    runfinal=FALSE;
	}
	sleep (loop_interval);
    } //for

    job_registry_destroy(rha);
    return(0);
}
示例#11
0
int
FinalStateQuery(time_t start_date, int logs_to_read)
{

        FILE *fp;
	char *line=NULL;
	char **token;
	char **token_l;
	int maxtok_t=0;
	int maxtok_l=0;
	job_registry_entry en;
	int ret;
	time_t tmstampepoch;
	char *cp=NULL; 
	char *command_string=NULL;
	time_t now;
	char *string_now=NULL;
	job_registry_entry *ren=NULL;

	
	command_string=make_message("%s/sacct -nap -o JobID,JobName,State,ExitCode,submit,start,end 2>/dev/null",slurm_binpath);
	
	fp = popen(command_string,"r");
	
	do_log(debuglogfile, debug, 3, "%s: command_string in FinalStateQuery is:%s\n",argv0,command_string);

	en.status=UNDEFINED;
	JOB_REGISTRY_ASSIGN_ENTRY(en.exitreason,"\0");

	if(fp!=NULL){
		while(!feof(fp) && (line=get_line(fp))){
			if(line && strlen(line)==0){
				free(line);
				continue;
			}
			if ((cp = strrchr (line, '\n')) != NULL){
				*cp = '\0';
			}
			en.status=UNDEFINED;
			do_log(debuglogfile, debug, 3, "%s: line in FinalStateQuery is:%s\n",argv0,line);
			now=time(0);
			string_now=make_message("%d",now);
			maxtok_t = strtoken(line, '|', &token);
			JOB_REGISTRY_ASSIGN_ENTRY(en.batch_id,token[0]);
			JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now);
			if(token[2] && strstr(token[2],"COMPLETED")){
				en.status=COMPLETED;
				JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now);
			}else if(token[2] && strstr(token[2],"CANCELLED")){
				en.status=REMOVED;
				en.exitcode=-999;
				JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now);
			}else if(token[2] && strstr(token[2],"FAILED")){
				en.status=COMPLETED;
				JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now);
			}
			
			if(!(token[6] && strstr(token[6],"Unknown"))){
				tmstampepoch=str2epoch(token[6],"N");
				en.udate=tmstampepoch;
			}
			if(en.status==COMPLETED){
				maxtok_l = strtoken(token[3], ':', &token_l);
				en.exitcode=atoi(token_l[0]);
				freetoken(&token_l,maxtok_l);
			}
			
			if ((ren=job_registry_get(rha, en.batch_id)) == NULL){
					fprintf(stderr,"Get of record returns error ");
					perror("");
			}
			if(en.status!=UNDEFINED && en.status!=IDLE && ren && ren->status!=REMOVED && ren->status!=COMPLETED){	
				if ((ret=job_registry_update_select(rha, &en,
				JOB_REGISTRY_UPDATE_UDATE |
				JOB_REGISTRY_UPDATE_STATUS |
				JOB_REGISTRY_UPDATE_UPDATER_INFO |
				JOB_REGISTRY_UPDATE_EXITCODE |
				JOB_REGISTRY_UPDATE_EXITREASON )) < 0){
					if(ret != JOB_REGISTRY_NOT_FOUND){
						fprintf(stderr,"Update of record returns %d: ",ret);
						perror("");
					}
				} else {
					do_log(debuglogfile, debug, 2, "%s: f registry update in FinalStateQuery for: jobid=%s creamjobid=%s status=%d\n",argv0,en.batch_id,en.user_prefix,en.status);
					if (en.status == REMOVED || en.status == COMPLETED){
						job_registry_unlink_proxy(rha, &en);
					}
					if (remupd_conf != NULL){
						if ((ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL))<=0){
							do_log(debuglogfile, debug, 2, "%s: Error creating endpoint in FinalStateQuery\n",argv0);
						}
					}
				}
			}
			free(string_now);
			free(line);
			freetoken(&token,maxtok_t);
			free(ren);
		}
		pclose(fp);
	}
	
	free(command_string);
	return 0;
}
示例#12
0
int
IntStateQuery()
{

        FILE *fp;
	char *line=NULL;
	char **token;
	char **token_l;
	char **token_e;
	int maxtok_t=0;
	int maxtok_l=0;
	int maxtok_e=0;
	job_registry_entry en;
	int ret;
	time_t tmstampepoch;
	char *cp=NULL; 
	char *batch_str=NULL;
	char *command_string=NULL;
	job_registry_entry *ren=NULL;
	int isresumed=FALSE;
	int first=TRUE;
	time_t now;
	char *string_now=NULL;

	command_string=make_message("%s/scontrol -a show jobid",slurm_binpath);
	fp = popen(command_string,"r");

	en.status=UNDEFINED;
	JOB_REGISTRY_ASSIGN_ENTRY(en.wn_addr,"\0");
	JOB_REGISTRY_ASSIGN_ENTRY(en.exitreason,"\0");
	JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,"\0");
	en.exitcode=-1;
	bupdater_free_active_jobs(&bact);

	if(fp!=NULL){
		while(!feof(fp) && (line=get_line(fp))){
			if(line && strlen(line)==0){
				free(line);
				continue;
			}
			if ((cp = strrchr (line, '\n')) != NULL){
				*cp = '\0';
			}
			do_log(debuglogfile, debug, 3, "%s: line in IntStateQuery is:%s\n",argv0,line);
			now=time(0);
			string_now=make_message("%d",now);
			maxtok_t = strtoken(line, ' ', &token);
			if(line && strstr(line,"JobId=")){
				isresumed=FALSE;
				if(!first && en.status!=UNDEFINED && ren && ren->status!=REMOVED && ren->status!=COMPLETED){	
					if ((ret=job_registry_update_recn_select(rha, &en, ren->recnum,
					JOB_REGISTRY_UPDATE_WN_ADDR|
					JOB_REGISTRY_UPDATE_STATUS|
					JOB_REGISTRY_UPDATE_UDATE|
					JOB_REGISTRY_UPDATE_UPDATER_INFO|
					JOB_REGISTRY_UPDATE_EXITCODE|
					JOB_REGISTRY_UPDATE_EXITREASON)) < 0){
						if(ret != JOB_REGISTRY_NOT_FOUND){
							fprintf(stderr,"Update of record returns %d: ",ret);
							perror("");
						}
					} else {
						if(ret==JOB_REGISTRY_SUCCESS){
							if (en.status == REMOVED || en.status == COMPLETED) {
								do_log(debuglogfile, debug, 2, "%s: registry update in IntStateQuery for: jobid=%s creamjobid=%s wn=%s status=%d exitcode=%d\n",argv0,en.batch_id,en.user_prefix,en.wn_addr,en.status,en.exitcode);
								job_registry_unlink_proxy(rha, &en);
							}else{
								do_log(debuglogfile, debug, 2, "%s: registry update in IntStateQuery for: jobid=%s creamjobid=%s wn=%s status=%d\n",argv0,en.batch_id,en.user_prefix,en.wn_addr,en.status);
							}
							if (remupd_conf != NULL){
								if ((ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL))<=0){
									do_log(debuglogfile, debug, 2, "%s: Error creating endpoint in IntStateQuery\n",argv0);
								}
							}
						}
					}
					en.status = UNDEFINED;
					JOB_REGISTRY_ASSIGN_ENTRY(en.exitreason,"\0");
					JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,"\0");
					JOB_REGISTRY_ASSIGN_ENTRY(en.wn_addr,"\0");
					en.exitcode=-1;
				}
				en.status = UNDEFINED;
				maxtok_l = strtoken(token[0], '=', &token_l);
				batch_str=strdup(token_l[1]);
				JOB_REGISTRY_ASSIGN_ENTRY(en.batch_id,batch_str);
				JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now);
				en.exitcode=-1;
				bupdater_push_active_job(&bact, en.batch_id);
				do_log(debuglogfile, debug, 4, "%s: bupdater_push_active_job done for %s\n",argv0,en.batch_id);
				free(batch_str);
				freetoken(&token_l,maxtok_l);
				if(!first) free(ren);
				if ((ren=job_registry_get(rha, en.batch_id)) == NULL){
						fprintf(stderr,"Get of record returns error ");
						perror("");
				}
				if(ren){
					if(strlen(ren->updater_info)>0){
						en.udate=ren->udate;
					}else{
						en.udate=time(0);
					}
				}
				first=FALSE;
				
			}else if(line && strstr(line," JobState=")){
				if(token[0] && strstr(line,"JobState=")){
					maxtok_l = strtoken(token[0], '=', &token_l);
					if(token_l[1] && strstr(token_l[1],"PENDING")){
						en.status=IDLE;
						en.exitcode=-1;
						JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now);
					}else if(token_l[1] && strstr(token_l[1],"RUNNING")){
						en.status=RUNNING;
						en.exitcode=-1;
						JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now);
					}else if(token_l[1] && strstr(token_l[1],"COMPLETED")){
						en.status=COMPLETED;
						en.exitcode=0;
						JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now);
					}else if(token_l[1] && strstr(token_l[1],"CANCELLED")){
						en.status=REMOVED;
						en.exitcode=-999;
						JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now);
					}else if(token_l[1] && strstr(token_l[1],"FAILED")){
						en.status=COMPLETED;
						en.exitcode=0;
						JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now);
					}else if(token_l[1] && strstr(token_l[1],"SUSPENDED")){
						en.status=HELD;
						en.exitcode=-1;
						JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now);
					}else if(token_l[1] && strstr(token_l[1],"COMPLETING")){
						bupdater_remove_active_job(&bact, en.batch_id);
					}
					freetoken(&token_l,maxtok_l);
				}
			}else if(line && strstr(line," BatchHost=")){
				if(token[0] && strstr(line,"BatchHost=")){
					maxtok_l = strtoken(token[0], '=', &token_l);
					if(en.status!=IDLE){
						JOB_REGISTRY_ASSIGN_ENTRY(en.wn_addr,token_l[1]);
					}
					freetoken(&token_l,maxtok_l);
				}
			}else if(line && strstr(line," ExitCode=")){
				if(token[3] && strstr(line,"ExitCode=")){
					maxtok_l = strtoken(token[3], '=', &token_l);
					maxtok_e = strtoken(token_l[1], ':', &token_e);
					if(en.status==COMPLETED){
						en.exitcode=atoi(token_e[0]);
					}
					freetoken(&token_l,maxtok_l);
					freetoken(&token_e,maxtok_e);
				}
			}else if(line && strstr(line," SubmitTime=")){
				if(en.status==IDLE){
					if(token[0] && strstr(line,"SubmitTime=")){
						maxtok_l = strtoken(token[0], '=', &token_l);
						tmstampepoch=str2epoch(token_l[1],"N");
						en.udate=tmstampepoch;
						freetoken(&token_l,maxtok_l);
					}
				}
			}else if(line && strstr(line," StartTime=")){
				if(en.status==RUNNING){
					if(token[0] && strstr(line,"StartTime=")){
						maxtok_l = strtoken(token[0], '=', &token_l);
						tmstampepoch=str2epoch(token_l[1],"N");
						en.udate=tmstampepoch;
						freetoken(&token_l,maxtok_l);
					}
				}
				if(en.status==COMPLETED || en.status==REMOVED){
					if(token[1] && strstr(line,"EndTime=")){
						maxtok_l = strtoken(token[1], '=', &token_l);
						tmstampepoch=str2epoch(token_l[1],"N");
						en.udate=tmstampepoch;
						freetoken(&token_l,maxtok_l);
					}
				}
			}else if(line && strstr(line," SuspendTime=")){
				if(en.status==HELD){
					if(token[1] && strstr(line,"SuspendTime=")){
						maxtok_l = strtoken(token[1], '=', &token_l);
						tmstampepoch=str2epoch(token_l[1],"N");
						en.udate=tmstampepoch;
						freetoken(&token_l,maxtok_l);
					}
				}
			}
			
			free(line);
			free(string_now);
			freetoken(&token,maxtok_t);
		}
		pclose(fp);
	}
		
	if(en.status!=UNDEFINED && ren && ren->status!=REMOVED && ren->status!=COMPLETED){	
		if ((ret=job_registry_update_recn_select(rha, &en, ren->recnum,
		JOB_REGISTRY_UPDATE_WN_ADDR|
		JOB_REGISTRY_UPDATE_STATUS|
		JOB_REGISTRY_UPDATE_UDATE|
		JOB_REGISTRY_UPDATE_UPDATER_INFO|
		JOB_REGISTRY_UPDATE_EXITCODE|
		JOB_REGISTRY_UPDATE_EXITREASON)) < 0){
			if(ret != JOB_REGISTRY_NOT_FOUND){
				fprintf(stderr,"Update of record returns %d: ",ret);
				perror("");
			}
		} else {
			if(ret==JOB_REGISTRY_SUCCESS){
				if (en.status == REMOVED || en.status == COMPLETED) {
					do_log(debuglogfile, debug, 2, "%s: registry update in IntStateQuery for: jobid=%s creamjobid=%s wn=%s status=%d exitcode=%d\n",argv0,en.batch_id,en.user_prefix,en.wn_addr,en.status,en.exitcode);
					job_registry_unlink_proxy(rha, &en);
				}else{
					do_log(debuglogfile, debug, 2, "%s: registry update in IntStateQuery for: jobid=%s creamjobid=%s wn=%s status=%d\n",argv0,en.batch_id,en.user_prefix,en.wn_addr,en.status);
				}
				if (remupd_conf != NULL){
					if ((ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL))<=0){
						do_log(debuglogfile, debug, 2, "%s: Error creating endpoint in IntStateQuery\n",argv0);
					}
				}
			}
		}
	}				

	free(ren);
	free(command_string);
	return 0;
}