/*
 * GetJobList - pull the job list out of a '/'-separated command line.
 *
 * buffer is tokenised on '/' with strtoken().  When a second field exists,
 * a strdup() copy of it is stored in *joblist_string after being cut at its
 * last '\n' and then at its last '\r'.  The copy is not released here.
 * When there is no second field *joblist_string is left untouched.
 *
 * A failed strdup() is reported through sysfatal().
 * The function always returns 0.
 */
int
GetJobList(char *buffer, char **joblist_string)
{
    char **fields;
    char *eol;
    int nfields;

    nfields = strtoken(buffer, '/', &fields);

    if (fields[1] != NULL) {
        char *copy = strdup(fields[1]);

        *joblist_string = copy;
        if (copy == NULL) {
            sysfatal("strdup failed for joblist_string in GetJobList: %r");
        }

        /* drop the line terminator: newline first, then carriage return */
        eol = strrchr(copy, '\n');
        if (eol != NULL) {
            *eol = '\0';
        }
        eol = strrchr(copy, '\r');
        if (eol != NULL) {
            *eol = '\0';
        }
    }

    freetoken(&fields, nfields);
    return 0;
}
/*
 * classad_append - append `piece` to the heap string `buf`.
 *
 * *cap is the current allocation size of buf; the buffer is enlarged with
 * realloc() when the result would not fit.  Returns the (possibly moved)
 * buffer.  A NULL piece or a failed realloc() is reported via sysfatal().
 */
static char *
classad_append(char *buf, size_t *cap, const char *piece)
{
    size_t used;
    size_t plen;
    size_t need;

    if (piece == NULL) {
        sysfatal("can't malloc classad fragment in ComposeClassad: %r");
        return buf;
    }

    used = strlen(buf);
    plen = strlen(piece);
    need = used + plen + 1;
    if (need > *cap) {
        char *grown = realloc(buf, need);

        if (grown == NULL) {
            sysfatal("can't realloc buffer in ComposeClassad: %r");
            return buf;
        }
        buf = grown;
        *cap = need;
    }
    memcpy(buf + used, piece, plen + 1);   /* copies the terminating NUL too */
    return buf;
}

/*
 * ComposeClassad - render a registry entry as a one-line classad.
 *
 * The text always starts with BatchJobId, JobStatus and ChangeTime.
 * It then adds, in this order:
 *   - WorkerNode   when en->wn_addr is not empty;
 *   - JwExitCode and Reason   when en->status is 3 or 4;
 *   - ExitReason   when en->exitreason is not empty;
 *   - ClientJobId (second '_'-separated field of en->user_prefix, if any)
 *     and BlahJobName   when en->user_prefix is not empty.
 * The text is closed with "]\n".
 *
 * Returns a heap buffer of at least STR_CHARS bytes that the caller must
 * free().
 *
 * Changes with respect to the previous version:
 *   - the buffer grows on demand instead of overflowing STR_CHARS;
 *   - a user prefix without '_' no longer passes a NULL pointer to "%s"
 *     (the ClientJobId attribute is simply omitted);
 *   - the allocation error message names this function.
 */
char *
ComposeClassad(job_registry_entry *en)
{
    char *strudate = NULL;
    char *buffer = NULL;
    char *piece = NULL;
    char *clientid = NULL;
    char *cp = NULL;
    char **tbuf;
    size_t cap = (size_t)STR_CHARS;
    int maxtok;

    if ((buffer = calloc(STR_CHARS, 1)) == NULL) {
        sysfatal("can't malloc buffer in ComposeClassad: %r");
    }

    strudate = iepoch2str(en->udate);
    piece = make_message("[BatchJobId=\"%s\"; JobStatus=%d; ChangeTime=\"%s\";",
                         en->batch_id, en->status, strudate);
    free(strudate);
    buffer = classad_append(buffer, &cap, piece);
    free(piece);

    if (strlen(en->wn_addr) > 0) {
        piece = make_message(" WorkerNode=\"%s\";", en->wn_addr);
        buffer = classad_append(buffer, &cap, piece);
        free(piece);
    }

    if (en->status == 3 || en->status == 4) {
        piece = make_message(" JwExitCode=%d; Reason=\"reason=%d\";",
                             en->exitcode, en->exitcode);
        buffer = classad_append(buffer, &cap, piece);
        free(piece);
    }

    if (strlen(en->exitreason) > 0) {
        piece = make_message(" ExitReason=\"%s\";", en->exitreason);
        buffer = classad_append(buffer, &cap, piece);
        free(piece);
    }

    if (strlen(en->user_prefix) > 0) {
        maxtok = strtoken(en->user_prefix, '_', &tbuf);
        if (tbuf[1]) {
            if ((cp = strrchr(tbuf[1], '\n')) != NULL) {
                *cp = '\0';
            }
            if ((cp = strrchr(tbuf[1], '\r')) != NULL) {
                *cp = '\0';
            }
            clientid = make_message(" ClientJobId=\"%s\";", tbuf[1]);
        }
        /* clientid stays NULL when the prefix has no second field */
        piece = make_message("%s BlahJobName=\"%s\";",
                             clientid != NULL ? clientid : "", en->user_prefix);
        buffer = classad_append(buffer, &cap, piece);
        free(piece);
        freetoken(&tbuf, maxtok);
        free(clientid);
    }

    buffer = classad_append(buffer, &cap, "]\n");
    return buffer;
}
/*
 * NotifyStart - read the notification start date from a command line.
 *
 * buffer is tokenised on '/' with strtoken(); the second field, cut at its
 * last '\n' and last '\r', is converted with str2epoch(..., "S") and the
 * result is stored in *lastnotiftime.
 *
 * Returns 0 on success.  When the command carries no second field the
 * function now returns -1 and leaves *lastnotiftime untouched; previously
 * a NULL pointer was handed to str2epoch() in that case.
 *
 * A failed strdup() is reported through sysfatal().
 */
int
NotifyStart(char *buffer, time_t *lastnotiftime)
{
    int maxtok;
    char **tbuf;
    char *cp = NULL;
    char *notifdate = NULL;

    maxtok = strtoken(buffer, '/', &tbuf);
    if (tbuf[1]) {
        notifdate = strdup(tbuf[1]);
        if (notifdate == NULL) {
            sysfatal("strdup failed for notifdate in NotifyStart: %r");
        }
        if ((cp = strrchr(notifdate, '\n')) != NULL) {
            *cp = '\0';
        }
        if ((cp = strrchr(notifdate, '\r')) != NULL) {
            *cp = '\0';
        }
    }
    freetoken(&tbuf, maxtok);

    if (notifdate == NULL) {
        /* malformed command: there is no date to convert */
        return -1;
    }

    *lastnotiftime = str2epoch(notifdate, "S");
    free(notifdate);

    return 0;
}
int GetFilter(char *buffer, const int conn_c, char **creamfilter) { int maxtok; char **tbuf; char *cp=NULL; char * out_buf; maxtok = strtoken(buffer,'/',&tbuf); if(tbuf[1]){ *creamfilter = make_message("%s",tbuf[1]); if(*creamfilter == NULL){ sysfatal("strdup failed for creamfilter in GetFilter: %r"); } if ((cp = strrchr (*creamfilter, '\n')) != NULL) { *cp = '\0'; } if ((cp = strrchr (*creamfilter, '\r')) != NULL) { *cp = '\0'; } out_buf = make_message("CREAMFILTER set to %s\n", *creamfilter); } else { out_buf = make_message("CREAMFILTER ERROR\n"); } Writeline(conn_c, out_buf, strlen(out_buf)); do_log(debuglogfile, debug, 1, "Sent Reply for CREAMFILTER command:%s\n",out_buf); freetoken(&tbuf,maxtok); free(out_buf); return 0; }
int AssignState (char *element, char *status, char *exit, char *reason, char *wn, char *udate){ char **id_element; job_registry_entry en; time_t now; char *string_now=NULL; int i=0; int n=strtoken(element, '.', &id_element); int iret; if(id_element[0]){ JOB_REGISTRY_ASSIGN_ENTRY(en.batch_id,id_element[0]); en.status=atoi(status); en.exitcode=atoi(exit); JOB_REGISTRY_ASSIGN_ENTRY(en.wn_addr,wn); JOB_REGISTRY_ASSIGN_ENTRY(en.exitreason,reason); now=time(0); string_now=make_message("%d",now); JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now) en.udate=now; free(string_now); }else{ if((element=calloc(STR_CHARS,1)) == 0){ sysfatal("can't malloc cmd in GetAndSend: %r"); } } if ((iret=job_registry_update(rha, &en)) < 0){ fprintf(stderr,"Update of record returns %d: \nJobId: %d", iret,en.batch_id); perror(""); }else{ if (en.status == REMOVED || en.status == COMPLETED){ job_registry_unlink_proxy(rha, &en); } } freetoken(&id_element,n); return 0; }
int FinalStateQuery(char *input_string, int logs_to_read) { /* tracejob -m -l -a <jobid> In line: 04/23/2008 11:50:43 S Exit_status=0 resources_used.cput=00:00:01 resources_used.mem=11372kb resources_used.vmem=52804kb resources_used.walltime=00:10:15 there are: udate for the final state (04/23/2008 11:50:43): exitcode Exit_status= */ /* Filled entries: batch_id (a list of jobid is given, one for each tracejob call) status (always a final state 3 or 4) exitcode udate Filled by submit script: blah_id Unfilled entries: exitreason */ /* [root@cream-12 server_logs]# tracejob -m -l -a 13 Job: 13.cream-12.pd.infn.it 04/23/2008 11:40:27 S enqueuing into cream_1, state 1 hop 1 04/23/2008 11:40:27 S Job Queued at request of [email protected], owner = [email protected], job name = cream_365713239, queue = cream_1 04/23/2008 11:40:28 S Job Modified at request of [email protected] 04/23/2008 11:40:28 S Job Run at request of [email protected] 04/23/2008 11:50:43 S Exit_status=0 resources_used.cput=00:00:01 resources_used.mem=11372kb resources_used.vmem=52804kb resources_used.walltime=00:10:15 04/23/2008 11:50:44 S dequeuing from cream_1, state COMPLETE */ FILE *fp; char *line=NULL; char **token; char **jobid; int maxtok_t=0,maxtok_j=0,k; job_registry_entry en; int ret; char *timestamp; time_t tmstampepoch; char *exit_str=NULL; int failed_count=0; int time_to_add=0; time_t now; char *cp=NULL; char *command_string=NULL; char *pbs_spool=NULL; char *string_now=NULL; int tracejob_line_counter=0; do_log(debuglogfile, debug, 3, "%s: input_string in FinalStateQuery is:%s\n",argv0,input_string); maxtok_j = strtoken(input_string, ':', &jobid); for(k=0;k<maxtok_j;k++){ if(jobid[k] && strlen(jobid[k])==0) continue; pbs_spool=(pbs_spoolpath?make_message("-p %s ",pbs_spoolpath):make_message("")); command_string=make_message("%s%s/tracejob %s-m -l -a -n %d %s",batch_command,pbs_binpath,pbs_spool,logs_to_read,jobid[k]); free(pbs_spool); fp = popen(command_string,"r"); do_log(debuglogfile, 
debug, 3, "%s: command_string in FinalStateQuery is:%s\n",argv0,command_string); /* en.status is set =0 (UNDEFINED) here and it is tested if it is !=0 before the registry update: the update is done only if en.status is !=0*/ en.status=UNDEFINED; JOB_REGISTRY_ASSIGN_ENTRY(en.batch_id,jobid[k]); tracejob_line_counter=0; if(fp!=NULL){ while(!feof(fp) && (line=get_line(fp))){ if(line && strlen(line)==0){ free(line); continue; } if(tracejob_line_counter>tracejob_max_output){ do_log(debuglogfile, debug, 2, "%s: Tracejob output limit of %d lines reached. Skipping command.\n",argv0,tracejob_max_output); free(line); break; } if ((cp = strrchr (line, '\n')) != NULL){ *cp = '\0'; tracejob_line_counter++; } do_log(debuglogfile, debug, 3, "%s: line in FinalStateQuery is:%s\n",argv0,line); now=time(0); string_now=make_message("%d",now); if(line && (strstr(line,"Job deleted") || (strstr(line,"dequeuing from") && strstr(line,"state RUNNING")))){ maxtok_t = strtoken(line, ' ', &token); timestamp=make_message("%s %s",token[0],token[1]); tmstampepoch=str2epoch(timestamp,"A"); free(timestamp); freetoken(&token,maxtok_t); en.udate=tmstampepoch; en.status=REMOVED; en.exitcode=-999; JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now); JOB_REGISTRY_ASSIGN_ENTRY(en.exitreason,"\0"); }else if(line && strstr(line," Exit_status=") && en.status != REMOVED){ maxtok_t = strtoken(line, ' ', &token); timestamp=make_message("%s %s",token[0],token[1]); tmstampepoch=str2epoch(timestamp,"A"); exit_str=strdup(token[3]); if(exit_str == NULL){ sysfatal("strdup failed for exit_str in FinalStateQuery: %r"); } free(timestamp); freetoken(&token,maxtok_t); if(strstr(exit_str,"Exit_status=")){ maxtok_t = strtoken(exit_str, '=', &token); if(maxtok_t == 2){ en.exitcode=atoi(token[1]); freetoken(&token,maxtok_t); }else{ en.exitcode=-1; } }else{ en.exitcode=-1; } free(exit_str); en.udate=tmstampepoch; en.status=COMPLETED; JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now); 
JOB_REGISTRY_ASSIGN_ENTRY(en.exitreason,"\0"); } free(string_now); free(line); } pclose(fp); } if(en.status !=UNDEFINED && en.status!=IDLE){ if ((ret=job_registry_update_select(rha, &en, JOB_REGISTRY_UPDATE_UDATE | JOB_REGISTRY_UPDATE_STATUS | JOB_REGISTRY_UPDATE_UPDATER_INFO | JOB_REGISTRY_UPDATE_EXITCODE | JOB_REGISTRY_UPDATE_EXITREASON )) < 0){ if(ret != JOB_REGISTRY_NOT_FOUND){ fprintf(stderr,"Update of record returns %d: ",ret); perror(""); } } else { do_log(debuglogfile, debug, 2, "%s: registry update in FinalStateQuery for: jobid=%s exitcode=%d status=%d\n",argv0,en.batch_id,en.exitcode,en.status); if (en.status == REMOVED || en.status == COMPLETED){ job_registry_unlink_proxy(rha, &en); } if (remupd_conf != NULL){ if (ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL)<=0){ do_log(debuglogfile, debug, 2, "%s: Error creating endpoint in FinalStateQuery\n",argv0); } } } }else{ failed_count++; } free(command_string); } now=time(0); if(failed_count>10){ failed_count=10; } time_to_add=pow(failed_count,1.5); next_finalstatequery=now+time_to_add; do_log(debuglogfile, debug, 3, "%s: next FinalStatequery will be in %d seconds\n",argv0,time_to_add); freetoken(&jobid,maxtok_j); return failed_count; }
int IntStateQuery() { /* qstat -f Job Id: 11.cream-12.pd.infn.it Job_Name = cream_579184706 job_state = R ctime = Wed Apr 23 11:39:55 2008 exec_host = cream-wn-029.pn.pd.infn.it/0 */ /* Filled entries: batch_id wn_addr status udate Filled by submit script: blah_id Unfilled entries: exitreason */ FILE *fp; char *line=NULL; char **token; int maxtok_t=0; job_registry_entry en; int ret; char *timestamp; time_t tmstampepoch; char *batch_str=NULL; char *wn_str=NULL; char *twn_str=NULL; char *status_str=NULL; char *ex_str=NULL; int ex_code=0; char *cp=NULL; char *command_string=NULL; job_registry_entry *ren=NULL; int first=TRUE; time_t now; char *string_now=NULL; command_string=make_message("%s%s/qstat -f",batch_command,pbs_binpath); fp = popen(command_string,"r"); en.status=UNDEFINED; JOB_REGISTRY_ASSIGN_ENTRY(en.wn_addr,"\0"); JOB_REGISTRY_ASSIGN_ENTRY(en.exitreason,"\0"); JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,"\0"); en.exitcode=-1; bupdater_free_active_jobs(&bact); if(fp!=NULL){ while(!feof(fp) && (line=get_line(fp))){ if(line && strlen(line)==0){ free(line); continue; } if ((cp = strrchr (line, '\n')) != NULL){ *cp = '\0'; } do_log(debuglogfile, debug, 3, "%s: line in IntStateQuery is:%s\n",argv0,line); now=time(0); string_now=make_message("%d",now); if(line && strstr(line,"Job Id: ")){ if(!first && en.status!=UNDEFINED && ren && ren->status!=REMOVED && ren->status!=COMPLETED){ if ((ret=job_registry_update_recn_select(rha, &en, ren->recnum, JOB_REGISTRY_UPDATE_WN_ADDR| JOB_REGISTRY_UPDATE_STATUS| JOB_REGISTRY_UPDATE_UDATE| JOB_REGISTRY_UPDATE_UPDATER_INFO| JOB_REGISTRY_UPDATE_EXITCODE| JOB_REGISTRY_UPDATE_EXITREASON)) < 0){ if(ret != JOB_REGISTRY_NOT_FOUND){ fprintf(stderr,"Update of record returns %d: ",ret); perror(""); } } else { if(ret==JOB_REGISTRY_SUCCESS){ if (en.status == REMOVED || en.status == COMPLETED) { do_log(debuglogfile, debug, 2, "%s: registry update in IntStateQuery for: jobid=%s wn=%s status=%d 
exitcode=%d\n",argv0,en.batch_id,en.wn_addr,en.status,en.exitcode); job_registry_unlink_proxy(rha, &en); }else{ do_log(debuglogfile, debug, 2, "%s: registry update in IntStateQuery for: jobid=%s wn=%s status=%d\n",argv0,en.batch_id,en.wn_addr,en.status); } } if (remupd_conf != NULL){ if (ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL)<=0){ do_log(debuglogfile, debug, 2, "%s: Error creating endpoint in IntStateQuery\n",argv0); } } } en.status = UNDEFINED; JOB_REGISTRY_ASSIGN_ENTRY(en.wn_addr,"\0"); JOB_REGISTRY_ASSIGN_ENTRY(en.exitreason,"\0"); JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,"\0"); en.exitcode=-1; } maxtok_t = strtoken(line, ':', &token); batch_str=strdel(token[1]," "); JOB_REGISTRY_ASSIGN_ENTRY(en.batch_id,batch_str); JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now); en.exitcode=-1; bupdater_push_active_job(&bact, en.batch_id); free(batch_str); freetoken(&token,maxtok_t); if(!first) free(ren); if ((ren=job_registry_get(rha, en.batch_id)) == NULL){ fprintf(stderr,"Get of record returns error for %s ",en.batch_id); perror(""); } first=FALSE; }else if(line && strstr(line,"job_state = ")){ maxtok_t = strtoken(line, '=', &token); status_str=strdel(token[1]," "); if(status_str && strcmp(status_str,"Q")==0){ en.status=IDLE; en.exitcode=-1; JOB_REGISTRY_ASSIGN_ENTRY(en.wn_addr,"\0"); }else if(status_str && strcmp(status_str,"W")==0){ en.status=IDLE; en.exitcode=-1; }else if(status_str && strcmp(status_str,"R")==0){ en.status=RUNNING; en.exitcode=-1; }else if(status_str && strcmp(status_str,"C")==0){ en.status=COMPLETED; JOB_REGISTRY_ASSIGN_ENTRY(en.exitreason,"\0"); }else if(status_str && strcmp(status_str,"H")==0){ en.status=HELD; en.exitcode=-1; JOB_REGISTRY_ASSIGN_ENTRY(en.wn_addr,"\0"); } free(status_str); freetoken(&token,maxtok_t); }else if(line && strstr(line,"unable to run job")){ en.status=IDLE; en.exitcode=-1; }else if(line && strstr(line,"exit_status = ")){ maxtok_t = strtoken(line, '=', &token); ex_str=strdel(token[1]," "); 
ex_code=atoi(ex_str); if(ex_code==0){ en.exitcode=0; }else if(ex_code==271){ en.status=REMOVED; en.exitcode=-999; }else{ en.exitcode=ex_code; } free(ex_str); freetoken(&token,maxtok_t); }else if(line && strstr(line,"exec_host = ")){ maxtok_t = strtoken(line, '=', &token); twn_str=strdup(token[1]); if(twn_str == NULL){ sysfatal("strdup failed for twn_str in IntStateQuery: %r"); } freetoken(&token,maxtok_t); maxtok_t = strtoken(twn_str, '/', &token); wn_str=strdel(token[0]," "); JOB_REGISTRY_ASSIGN_ENTRY(en.wn_addr,wn_str); free(twn_str); free(wn_str); freetoken(&token,maxtok_t); }else if(line && strstr(line,"mtime = ")){ maxtok_t = strtoken(line, ' ', &token); timestamp=make_message("%s %s %s %s %s",token[2],token[3],token[4],token[5],token[6]); tmstampepoch=str2epoch(timestamp,"L"); free(timestamp); en.udate=tmstampepoch; freetoken(&token,maxtok_t); } free(line); free(string_now); } pclose(fp); } if(en.status!=UNDEFINED && ren && ren->status!=REMOVED && ren->status!=COMPLETED){ if ((ret=job_registry_update_recn_select(rha, &en, ren->recnum, JOB_REGISTRY_UPDATE_WN_ADDR| JOB_REGISTRY_UPDATE_STATUS| JOB_REGISTRY_UPDATE_UDATE| JOB_REGISTRY_UPDATE_UPDATER_INFO| JOB_REGISTRY_UPDATE_EXITCODE| JOB_REGISTRY_UPDATE_EXITREASON)) < 0){ if(ret != JOB_REGISTRY_NOT_FOUND){ fprintf(stderr,"Update of record returns %d: ",ret); perror(""); } } else { if(ret==JOB_REGISTRY_SUCCESS){ if (en.status == REMOVED || en.status == COMPLETED) { do_log(debuglogfile, debug, 2, "%s: registry update in IntStateQuery for: jobid=%s wn=%s status=%d exitcode=%d\n",argv0,en.batch_id,en.wn_addr,en.status,en.exitcode); job_registry_unlink_proxy(rha, &en); }else{ do_log(debuglogfile, debug, 2, "%s: registry update in IntStateQuery for: jobid=%s wn=%s status=%d\n",argv0,en.batch_id,en.wn_addr,en.status); } } if (remupd_conf != NULL){ if (ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL)<=0){ do_log(debuglogfile, debug, 2, "%s: Error creating endpoint in IntStateQuery\n",argv0); } } } } 
free(ren); free(command_string); return 0; }
int PollDB() { FILE *fd; job_registry_entry *en; job_registry_handle *rha; job_registry_handle *rhc; char *buffer=NULL; char *cdate=NULL; time_t now; int maxtok,i,maxtokl,j; char **tbuf; char **lbuf; int len=0,flen=0; struct stat sbuf; int rc; char *regfile; char *cp=NULL; int to_sleep=FALSE; int skip_reg_open=FALSE; int ret; rha=job_registry_init(registry_file, BY_BATCH_ID); if (rha == NULL){ do_log(debuglogfile, debug, 1, "%s: Error initialising job registry %s\n",argv0,registry_file); fprintf(stderr,"%s: Error initialising job registry %s :",argv0,registry_file); perror(""); } for(;;){ now=time(NULL); to_sleep=TRUE; /* cycle over connections: sleep if startnotify, startnotifyjob and sentendonce are not set. If startnotifyjob is set the conn is served. */ for(i=0; i<MAX_CONNECTIONS; i++){ if(!connections[i].startnotify && !connections[i].startnotifyjob && !(connections[i].firstnotify && connections[i].sentendonce)) continue; if(connections[i].startnotify) to_sleep=FALSE; if(connections[i].startnotifyjob){ to_sleep=FALSE; rhc=job_registry_init(registry_file, BY_USER_PREFIX); if (rhc == NULL){ do_log(debuglogfile, debug, 1, "%s: Error initialising job registry %s\n",argv0,registry_file); fprintf(stderr,"%s: Error initialising job registry %s :",argv0,registry_file); perror(""); } do_log(debuglogfile, debug, 2, "%s:Job list for notification:%s\n",argv0,connections[i].joblist_string); maxtok=strtoken(connections[i].joblist_string,',',&tbuf); for(j=0;j<maxtok;j++){ if ((en=job_registry_get(rhc, tbuf[j])) != NULL){ buffer=ComposeClassad(en); }else{ if(remupd_conf == NULL){ cdate=iepoch2str(now); maxtokl=strtoken(tbuf[j],'_',&lbuf); if(lbuf[1]){ if ((cp = strrchr (lbuf[1], '\n')) != NULL){ *cp = '\0'; } if ((cp = strrchr (lbuf[1], '\r')) != NULL){ *cp = '\0'; } buffer=make_message("[BlahJobName=\"%s\"; ClientJobId=\"%s\"; JobStatus=4; JwExitCode=999; ExitReason=\"BUpdater is not able to find the job anymore\"; Reason=\"BUpdater is not able to find the job anymore\"; 
ChangeTime=\"%s\"; ]\n",tbuf[j],lbuf[1],cdate); } freetoken(&lbuf,maxtokl); free(cdate); }else{ maxtokl=strtoken(tbuf[j],':',&lbuf); JOB_REGISTRY_ASSIGN_ENTRY(en->batch_id,lbuf[0]); JOB_REGISTRY_ASSIGN_ENTRY(en->blah_id,lbuf[1]); freetoken(&lbuf,maxtokl); en->status = 0; if ((ret=job_registry_append(rhc, en))<0){ if(ret != JOB_REGISTRY_NOT_FOUND){ fprintf(stderr,"Update of record returns %d: ",ret); perror(""); } }else{ if(ret==JOB_REGISTRY_SUCCESS){ do_log(debuglogfile, debug, 2, "%s: registry append in PollDB for: jobid=%s blahjobid=%s\n",argv0,en->batch_id,en->blah_id); } } } } free(en); len=strlen(buffer); if(connections[i].finalbuffer != NULL){ flen=strlen(connections[i].finalbuffer); }else{ flen=0; } connections[i].finalbuffer = realloc(connections[i].finalbuffer,flen+len+2); if (connections[i].finalbuffer == NULL){ sysfatal("can't realloc finalbuffer in PollDB: %r"); } if(flen==0){ connections[i].finalbuffer[0]='\000'; } strcat(connections[i].finalbuffer,buffer); free(buffer); } freetoken(&tbuf,maxtok); if(connections[i].finalbuffer != NULL){ if(NotifyCream(connections[i].finalbuffer,&connections[i])!=-1){ /* change last notification time */ connections[i].lastnotiftime=now; connections[i].startnotifyjob=FALSE; } free(connections[i].finalbuffer); connections[i].finalbuffer=NULL; } job_registry_destroy(rhc); } if(connections[i].firstnotify && connections[i].sentendonce){ to_sleep=FALSE; if(NotifyCream("NTFDATE/END\n",&connections[i])!=-1){ connections[i].startnotify=TRUE; connections[i].sentendonce=FALSE; connections[i].firstnotify=FALSE; connections[i].startnotifyjob=FALSE; } } } if(to_sleep){ sleep(loop_interval); continue; } regfile=make_message("%s/registry",registry_file); rc=stat(regfile,&sbuf); free(regfile); skip_reg_open=TRUE; for(i=0; i<MAX_CONNECTIONS; i++){ if(sbuf.st_mtime>=connections[i].lastnotiftime){ skip_reg_open=FALSE; break; } } if(skip_reg_open){ do_log(debuglogfile, debug, 3, "Skip registry opening: mtime:%d 
lastn:%d\n",sbuf.st_mtime,connections[i].lastnotiftime); sleep(loop_interval); continue; } do_log(debuglogfile, debug, 3, "Normal registry opening\n"); fd = job_registry_open(rha, "r"); if (fd == NULL) { do_log(debuglogfile, debug, 1, "%s: Error opening job registry %s\n",argv0,registry_file); fprintf(stderr,"%s: Error opening job registry %s :",argv0,registry_file); perror(""); sleep(loop_interval); continue; } if (job_registry_rdlock(rha, fd) < 0) { do_log(debuglogfile, debug, 1, "%s: Error read locking registry %s\n",argv0,registry_file); fprintf(stderr,"%s: Error read locking registry %s :",argv0,registry_file); perror(""); sleep(loop_interval); continue; } while ((en = job_registry_get_next(rha, fd)) != NULL) { for(i=0; i<MAX_CONNECTIONS; i++){ if(connections[i].creamfilter==NULL) continue; if(en->mdate >= connections[i].lastnotiftime && en->mdate < now && en->user_prefix && strstr(en->user_prefix,connections[i].creamfilter)!=NULL && strlen(en->updater_info)>0) { buffer=ComposeClassad(en); len=strlen(buffer); if(connections[i].finalbuffer != NULL){ flen=strlen(connections[i].finalbuffer); }else{ flen=0; } connections[i].finalbuffer = realloc(connections[i].finalbuffer,flen+len+2); if (connections[i].finalbuffer == NULL){ sysfatal("can't realloc finalbuffer in PollDB: %r"); } if(flen==0){ connections[i].finalbuffer[0]='\000'; } strcat(connections[i].finalbuffer,buffer); free(buffer); } } free(en); } for(i=0; i<MAX_CONNECTIONS; i++){ if(connections[i].finalbuffer != NULL){ if(NotifyCream(connections[i].finalbuffer,&connections[i])!=-1){ /* change last notification time */ connections[i].lastnotiftime=now; } free(connections[i].finalbuffer); connections[i].finalbuffer=NULL; } } fclose(fd); sleep(loop_interval); } job_registry_destroy(rha); return 0; }
int FinalStateQuery(char *query,char *queryStates, char *query_err){ char line[STR_CHARS],fail[6],qExit[10],qFailed[10],qHostname[100],qStatus[2],command_string[100]; char **saveptr1,**saveptr2,**list_query,**list_queryStates; FILE *file_output; int numQuery=0,numQueryStates=0,j=0,l=0,cont=0,cont2=0, nq=0; time_t now; char string_now[11]; job_registry_entry en; int iret; numQuery=strtoken(query,' ',&list_query); nq=numQuery; numQueryStates=strtoken(queryStates,' ',&list_queryStates); if (numQuery!=numQueryStates) return 1; sprintf(command_string,"%s/qstat -u '*'",sge_binpath); if (debug) do_log(debuglogfile, debug, 1, "+-+line 433, command_string:%s\n",command_string); //load in qstatJob list of jobids from qstat command exec file_output = popen(command_string,"r"); if (file_output == NULL) return 0; while (fgets(line,sizeof(line), file_output) != NULL){ cont=strtoken(line, ' ', &saveptr1); if ((strcmp(saveptr1[0],"job-ID")!=0)&&(strncmp(saveptr1[0],"-",1)!=0)){ for (l=0;l<nq;l++){ if (strcmp(list_query[l],saveptr1[0])==0){ if (strcmp(list_queryStates[l],saveptr1[4])!=0){ now=time(0); sprintf(string_now,"%d",now); if (strcmp(saveptr1[4],"u")==0){ JOB_REGISTRY_ASSIGN_ENTRY(en.batch_id,list_query[l]); en.status=0; en.exitcode=0; JOB_REGISTRY_ASSIGN_ENTRY(en.wn_addr,""); JOB_REGISTRY_ASSIGN_ENTRY(en.exitreason,"0"); JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now) en.udate=now; if ((iret=job_registry_update(rha, &en)) < 0){ fprintf(stderr,"Update of record returns %d: \nJobId: %d", iret,en.batch_id); perror(""); } } if (strcmp(saveptr1[4],"q")==0){ JOB_REGISTRY_ASSIGN_ENTRY(en.batch_id,list_query[l]); en.status=1; en.exitcode=0; JOB_REGISTRY_ASSIGN_ENTRY(en.wn_addr,""); JOB_REGISTRY_ASSIGN_ENTRY(en.exitreason,"0"); JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now) en.udate=now; if ((iret=job_registry_update(rha, &en)) < 0){ fprintf(stderr,"Update of record returns %d: \nJobId: %d", iret,en.batch_id); perror(""); } } if (strcmp(saveptr1[4],"r")==0){ 
cont2=strtoken(saveptr1[7], '@', &saveptr2); JOB_REGISTRY_ASSIGN_ENTRY(en.batch_id,list_query[l]); en.status=2; en.exitcode=0; JOB_REGISTRY_ASSIGN_ENTRY(en.wn_addr,saveptr2[1]); JOB_REGISTRY_ASSIGN_ENTRY(en.exitreason,"0"); JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now) en.udate=now; if ((iret=job_registry_update(rha, &en)) < 0){ fprintf(stderr,"Update of record returns %d: \nJobId: %d", iret,en.batch_id); perror(""); } freetoken(&saveptr2,cont2); } if ((strcmp(saveptr1[4],"hr")==0)||strcmp(saveptr1[4],"hqw")==0){ JOB_REGISTRY_ASSIGN_ENTRY(en.batch_id,list_query[l]); en.status=5; en.exitcode=0; JOB_REGISTRY_ASSIGN_ENTRY(en.wn_addr,""); JOB_REGISTRY_ASSIGN_ENTRY(en.exitreason,"0"); JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now) en.udate=now; if ((iret=job_registry_update(rha, &en)) < 0){ fprintf(stderr,"Update of record returns %d: \nJobId: %d", iret,en.batch_id); perror(""); } } } //i must put out element from query for (j=l;j<nq;j++) if (list_query[j+1]!=NULL) strcpy(list_query[j],list_query[j+1]); for (j=l;j<nq;j++) if (list_queryStates[j+1]!=NULL) strcpy(list_queryStates[j],list_queryStates[j+1]); nq--; break; } } } line[0]='\0'; freetoken(&saveptr1,cont); } pclose( file_output ); sprintf(query_err,"\0"); //now we have check in list_query only states that not change status //because they're not in qstat result for (l=0; l<nq; l++){ sprintf(command_string,"%s/qacct -j '%s'",sge_binpath,list_query[l]); if (debug) do_log(debuglogfile, debug, 1, "+-+line 520,command_string:%s\n",command_string); file_output = popen(command_string,"r"); if (file_output == NULL) return 1; //if a job number is here means that job was in query previously and //if now it's not in query and not finished (NULL qstat) it was deleted //or it's on transition time if (fgets( line,sizeof(line), file_output )==NULL){ strcat(query_err,list_query[l]); strcat(query_err," "); pclose( file_output ); continue; } //there is no problem to lost first line with previous fgets, because //it's 
only a line of ============================================= while (fgets( line,sizeof(line), file_output )!=NULL){ cont=strtoken(line, ' ', &saveptr1); if (strcmp(saveptr1[0],"hostname")==0) strcpy(qHostname,saveptr1[1]);; if (strcmp(saveptr1[0],"failed")==0) strcpy(qFailed,saveptr1[1]); if (strcmp(saveptr1[0],"exit_status")==0) strcpy(qExit,saveptr1[1]); freetoken(&saveptr1,cont); } pclose( file_output ); now=time(0); sprintf(string_now,"%d",now); if ((strcmp(qExit,"137")==0)||(strcmp(qExit,"143")==0)){ JOB_REGISTRY_ASSIGN_ENTRY(en.batch_id,list_query[l]); en.status=3; en.exitcode=atoi(qExit); JOB_REGISTRY_ASSIGN_ENTRY(en.wn_addr,qHostname); JOB_REGISTRY_ASSIGN_ENTRY(en.exitreason,""); JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now) en.udate=now; if ((iret=job_registry_update(rha, &en)) < 0){ fprintf(stderr,"Update of record returns %d: \nJobId: %d", iret,en.batch_id); perror(""); }else job_registry_unlink_proxy(rha, &en); }else{ JOB_REGISTRY_ASSIGN_ENTRY(en.batch_id,list_query[l]); en.status=4; en.exitcode=atoi(qExit); JOB_REGISTRY_ASSIGN_ENTRY(en.wn_addr,qHostname); JOB_REGISTRY_ASSIGN_ENTRY(en.exitreason,qFailed); JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now) en.udate=now; if ((iret=job_registry_update(rha, &en)) < 0){ fprintf(stderr,"Update of record returns %d: \nJobId: %d", iret,en.batch_id); perror(""); }else job_registry_unlink_proxy(rha, &en); } } freetoken(&list_query,numQuery); freetoken(&list_queryStates,numQueryStates); if (debug) do_log(debuglogfile, debug, 1, "+-+query_err:%s\n",query_err); //now check acumulated error jobids to verify if they are an error or not if (strcmp(query_err,"\0")!=0){ sleep(60); cont=0; int n=0; char cmd[10]="\0"; cont=strtoken(query_err, ' ', &list_query); while (n < cont){ if(list_query[n]) strcpy(cmd,list_query[n]); else return 1; sprintf(command_string,"%s/qacct -j '%s'",sge_binpath,cmd); if (debug) do_log(debuglogfile, debug, 1, "+-+line 587 error, command_string:%s\n",command_string); file_output = 
popen(command_string,"r"); if (file_output == NULL) return 1; //if a job number is here means that job was in query previously and //if now it's not in query and not finished (NULL qstat) it was deleted if (fgets( line,sizeof(line), file_output )==NULL){ JOB_REGISTRY_ASSIGN_ENTRY(en.batch_id,cmd); en.status=3; en.exitcode=3; JOB_REGISTRY_ASSIGN_ENTRY(en.wn_addr,""); JOB_REGISTRY_ASSIGN_ENTRY(en.exitreason,"reason=3"); now=time(0); sprintf(string_now,"%d",now); JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now) en.udate=now; if ((iret=job_registry_update(rha, &en)) < 0){ fprintf(stderr,"Update of record returns %d: \nJobId: %d", iret,en.batch_id); perror(""); }else job_registry_unlink_proxy(rha, &en); pclose( file_output ); n++; continue; } //there is no problem to lost first line with previous fgets, because //it's only a line of ============================================= while (fgets( line,sizeof(line), file_output )!=NULL){ cont=strtoken(line, ' ', &saveptr1); if (strcmp(saveptr1[0],"hostname")==0) strcpy(qHostname,saveptr1[1]); if (strcmp(saveptr1[0],"failed")==0) strcpy(qFailed,saveptr1[1]); if (strcmp(saveptr1[0],"exit_status")==0) strcpy(qExit,saveptr1[1]); freetoken(&saveptr1,cont); } pclose( file_output ); now=time(0); sprintf(string_now,"%d",now); if ((strcmp(qExit,"137")==0)||(strcmp(qExit,"143")==0)){ JOB_REGISTRY_ASSIGN_ENTRY(en.batch_id,cmd); en.status=3; en.exitcode=atoi(qExit); JOB_REGISTRY_ASSIGN_ENTRY(en.wn_addr,qHostname); JOB_REGISTRY_ASSIGN_ENTRY(en.exitreason,""); JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now) en.udate=now; if ((iret=job_registry_update(rha, &en)) < 0){ fprintf(stderr,"Update of record returns %d: \nJobId: %d", iret,en.batch_id); perror(""); }else job_registry_unlink_proxy(rha, &en); }else{ JOB_REGISTRY_ASSIGN_ENTRY(en.batch_id,cmd); en.status=4; en.exitcode=atoi(qExit); JOB_REGISTRY_ASSIGN_ENTRY(en.wn_addr,qHostname); JOB_REGISTRY_ASSIGN_ENTRY(en.exitreason,qFailed); 
JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now) en.udate=now; if ((iret=job_registry_update(rha, &en)) < 0){ fprintf(stderr,"Update of record returns %d: \nJobId: %d", iret,en.batch_id); perror(""); }else job_registry_unlink_proxy(rha, &en); } n++; } freetoken(&list_query,cont); } return 0; }
int main(int argc, char *argv[]){ FILE *fd; job_registry_entry *en; time_t now; time_t purge_time=0; char constraint[JOBID_MAX_LEN+1]; char constraint2[5]; char *query=NULL; char *queryStates=NULL; char *query_err=NULL; char *pidfile=NULL; char string_now[11]; char *tpath; int version=0; int tmptim; int finstr_len=0; int loop_interval=DEFAULT_LOOP_INTERVAL; int fsq_ret=0; int c; int confirm_time=0; static int help; static int short_help; while (1) { static struct option long_options[] = { {"help", no_argument, &help, 1}, {"usage", no_argument, &short_help, 1}, {"nodaemon", no_argument, 0, 'o'}, {"version", no_argument, 0, 'v'}, {0, 0, 0, 0} }; int option_index = 0; c = getopt_long (argc, argv, "vo",long_options, &option_index); if (c == -1){ break; } switch (c) { case 0: if (long_options[option_index].flag != 0){ break; } case 'v': version=1; break; case 'o': nodmn=1; break; case '?': break; default: abort (); } } //check if another instance is running char **ptr; char out[3]; fgets(out, sizeof(out),popen("ps -d | grep -c BUpdaterSGE","r")); strtoken(out,'\n',&ptr); if (strcmp(ptr[0],"1")!=0){ fprintf(stderr,"There is another instance of BUpdaterSGE running.\nExiting ...\n"); return -1; } freetoken(&ptr,1); if(help){ usage(); } if(short_help){ short_usage(); } argv0 = argv[0]; signal(SIGHUP,sighup); if(version) { printf("%s Version: %s\n",progname,VERSION); exit(EXIT_SUCCESS); } /* Checking configuration */ check_config_file("UPDATER"); cha = config_read(NULL); if (cha == NULL) { fprintf(stderr,"Error reading config: "); perror(""); return -1; } config_setenv(NULL); ret = config_get("bupdater_child_poll_timeout",cha); if (ret != NULL){ tmptim=atoi(ret->value); if (tmptim > 0) bfunctions_poll_timeout = tmptim*1000; } ret = config_get("bupdater_debug_level",cha); if (ret != NULL){ debug=atoi(ret->value); } ret = config_get("bupdater_debug_logfile",cha); if (ret != NULL){ debuglogname=strdup(ret->value); if(debuglogname == NULL){ sysfatal("strdup failed for 
debuglogname in main: %r"); } } if(debug <=0){ debug=0; } if(debuglogname){ if((debuglogfile = fopen(debuglogname, "a+"))==0){ debug = 0; } }else{ debug = 0; } ret = config_get("debug_level",cha); if (ret != NULL){ debug=atoi(ret->value); } ret = config_get("debug_logfile",cha); if (ret != NULL){ debuglogname=strdup(ret->value); if(debuglogname == NULL){ sysfatal("strdup failed for debuglogname in main: %r"); } } if(debug <=0){ debug=0; } if(debuglogname){ if((debuglogfile = fopen(debuglogname, "a+"))==0){ debug = 0; } }else{ debug = 0; } ret = config_get("sge_binpath",cha); if (ret == NULL){ do_log(debuglogfile, debug, 1, "%s: key sge_binpath not found\n",argv0); } else { sge_binpath=strdup(ret->value); if(sge_binpath == NULL){ sysfatal("strdup failed for sge_binpath in main: %r"); } } ret = config_get("sge_rootpath",cha); if (ret == NULL){ do_log(debuglogfile, debug, 1, "%s: key sge_rootpath not found\n",argv0); } else { sge_rootpath=strdup(ret->value); if(sge_rootpath == NULL){ sysfatal("strdup failed for sge_rootpath in main: %r"); } tpath=make_message("%s",sge_rootpath); if (opendir(tpath)==NULL){ do_log(debuglogfile, debug, 1, "%s: dir %s does not exist or is not readable\n",argv0,tpath); sysfatal("dir %s does not exist or is not readable: %r",tpath); } free(tpath); } ret = config_get("sge_cellname",cha); if (ret == NULL){ do_log(debuglogfile, debug, 1, "%s: key sge_cellname not found\n",argv0); } else { sge_cellname=strdup(ret->value); if(sge_cellname == NULL){ sysfatal("strdup failed for sge_cellname in main: %r"); } } ret = config_get("sge_rootpath",cha); if (ret == NULL){ if(debug){ fprintf(debuglogfile, "%s: key sge_rootpath not found\n",argv0); fflush(debuglogfile); } } else { sge_rootpath=strdup(ret->value); if(sge_rootpath == NULL){ sysfatal("strdup failed for sge_rootpath in main: %r"); } } ret = config_get("sge_cellname",cha); if (ret == NULL){ if(debug){ fprintf(debuglogfile, "%s: key sge_cellname not found\n",argv0); fflush(debuglogfile); } } else 
{ sge_cellname=strdup(ret->value); if(sge_cellname == NULL){ sysfatal("strdup failed for sge_cellname in main: %r"); } } ret = config_get("job_registry",cha); if (ret == NULL){ do_log(debuglogfile, debug, 1, "%s: key job_registry not found\n",argv0); sysfatal("job_registry not defined. Exiting"); } else { reg_file=strdup(ret->value); if(reg_file == NULL){ sysfatal("strdup failed for reg_file in main: %r"); } } ret = config_get("purge_interval",cha); if (ret == NULL){ do_log(debuglogfile, debug, 1, "%s: key purge_interval not found using the default:%d\n",argv0,purge_interval); } else { purge_interval=atoi(ret->value); } ret = config_get("finalstate_query_interval",cha); if (ret == NULL){ do_log(debuglogfile, debug, 1, "%s: key finalstate_query_interval not found using the default:%d\n",argv0,finalstate_query_interval); } else { finalstate_query_interval=atoi(ret->value); } ret = config_get("alldone_interval",cha); if (ret == NULL){ do_log(debuglogfile, debug, 1, "%s: key alldone_interval not found using the default:%d\n",argv0,alldone_interval); } else { alldone_interval=atoi(ret->value); } ret = config_get("bupdater_loop_interval",cha); if (ret == NULL){ do_log(debuglogfile, debug, 1, "%s: key bupdater_loop_interval not found using the default:%d\n",argv0,loop_interval); } else { loop_interval=atoi(ret->value); } ret = config_get("bupdater_pidfile",cha); if (ret == NULL){ do_log(debuglogfile, debug, 1, "%s: key bupdater_pidfile not found\n",argv0); } else { pidfile=strdup(ret->value); if(pidfile == NULL){ sysfatal("strdup failed for pidfile in main: %r"); } } ret = config_get("job_registry_use_mmap",cha); if (ret == NULL){ do_log(debuglogfile, debug, 1, "%s: key job_registry_use_mmap not found. 
Default is NO\n",argv0); } else { do_log(debuglogfile, debug, 1, "%s: key job_registry_use_mmap is set to %s\n",argv0,ret->value); } if( !nodmn ) daemonize(); if( pidfile ){ writepid(pidfile); free(pidfile); } config_free(cha); rha=job_registry_init(reg_file, BY_BATCH_ID); if (rha == NULL){ do_log(debuglogfile, debug, 1, "%s: Error initialising job registry %s\n",argv0,reg_file); fprintf(stderr,"%s: Error initialising job registry %s :",argv0,reg_file); perror(""); } for(;;){ /* Purge old entries from registry */ now=time(0); if(now - purge_time > 86400){ if(job_registry_purge(reg_file, now-purge_interval,0)<0){ do_log(debuglogfile, debug, 1, "%s: Error purging job registry %s\n",argv0,reg_file); fprintf(stderr,"%s: Error purging job registry %s :",argv0,reg_file); perror(""); }else{ purge_time=time(0); } } //IntStateQuery(); fd = job_registry_open(rha, "r"); if (fd == NULL) { do_log(debuglogfile, debug, 1, "%s: Error opening job registry %s\n",argv0,reg_file); fprintf(stderr,"%s: Error opening job registry %s :",argv0,reg_file); perror(""); sleep(loop_interval); } if (job_registry_rdlock(rha, fd) < 0) { do_log(debuglogfile, debug, 1, "%s: Error read locking job registry %s\n",argv0,reg_file); fprintf(stderr,"%s: Error read locking job registry %s :",argv0,reg_file); perror(""); sleep(loop_interval); } job_registry_firstrec(rha,fd); fseek(fd,0L,SEEK_SET); if((query=calloc(STR_CHARS*2,1)) == 0){ sysfatal("can't malloc query %r"); } if((queryStates=calloc(STR_CHARS*2,1)) == 0){ sysfatal("can't malloc query %r"); } query[0]=' '; queryStates[0]=' '; while ((en = job_registry_get_next(rha, fd)) != NULL) { if(((now - en->mdate) > finalstate_query_interval) && en->status!=3 && en->status!=4) { /* create the constraint that will be used in condor_history command in FinalStateQuery*/ snprintf(constraint, sizeof(constraint), " %s",en->batch_id); if (en->status==0) snprintf(constraint2, sizeof(constraint2), " u"); if (en->status==1) snprintf(constraint2, sizeof(constraint2), 
" q"); if (en->status==2) snprintf(constraint2, sizeof(constraint2), " r"); if (en->status==5) snprintf(constraint2, sizeof(constraint2), " h"); query=realloc(query,strlen(query)+strlen(constraint)+1); queryStates=realloc(queryStates,strlen(queryStates)+strlen(constraint2)+1); strcat(query,constraint); strcat(queryStates,constraint2); runfinal=TRUE; } /* Assign Status=4 and ExitStatus=-1 to all entries that after alldone_interval are still not in a final state(3 or 4) */ if((now - en->mdate > alldone_interval) && en->status!=3 && en->status!=4 && !runfinal) { time_t now; now=time(0); snprintf(string_now,sizeof(string_now),"%d",now); AssignState(en->batch_id,"4" ,"-1","\0","\0",string_now); } free(en); } if(runfinal){ if((query_err=calloc((int)strlen(query),1)) == 0) sysfatal("can't malloc query_err %r"); FinalStateQuery(query,queryStates,query_err); free(query_err); } free(query); free(queryStates); fclose(fd); if (runfinal){ runfinal=FALSE; } sleep (loop_interval); } //for job_registry_destroy(rha); return(0); }
int FinalStateQuery(time_t start_date, int logs_to_read) { FILE *fp; char *line=NULL; char **token; char **token_l; int maxtok_t=0; int maxtok_l=0; job_registry_entry en; int ret; time_t tmstampepoch; char *cp=NULL; char *command_string=NULL; time_t now; char *string_now=NULL; job_registry_entry *ren=NULL; command_string=make_message("%s/sacct -nap -o JobID,JobName,State,ExitCode,submit,start,end 2>/dev/null",slurm_binpath); fp = popen(command_string,"r"); do_log(debuglogfile, debug, 3, "%s: command_string in FinalStateQuery is:%s\n",argv0,command_string); en.status=UNDEFINED; JOB_REGISTRY_ASSIGN_ENTRY(en.exitreason,"\0"); if(fp!=NULL){ while(!feof(fp) && (line=get_line(fp))){ if(line && strlen(line)==0){ free(line); continue; } if ((cp = strrchr (line, '\n')) != NULL){ *cp = '\0'; } en.status=UNDEFINED; do_log(debuglogfile, debug, 3, "%s: line in FinalStateQuery is:%s\n",argv0,line); now=time(0); string_now=make_message("%d",now); maxtok_t = strtoken(line, '|', &token); JOB_REGISTRY_ASSIGN_ENTRY(en.batch_id,token[0]); JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now); if(token[2] && strstr(token[2],"COMPLETED")){ en.status=COMPLETED; JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now); }else if(token[2] && strstr(token[2],"CANCELLED")){ en.status=REMOVED; en.exitcode=-999; JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now); }else if(token[2] && strstr(token[2],"FAILED")){ en.status=COMPLETED; JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now); } if(!(token[6] && strstr(token[6],"Unknown"))){ tmstampepoch=str2epoch(token[6],"N"); en.udate=tmstampepoch; } if(en.status==COMPLETED){ maxtok_l = strtoken(token[3], ':', &token_l); en.exitcode=atoi(token_l[0]); freetoken(&token_l,maxtok_l); } if ((ren=job_registry_get(rha, en.batch_id)) == NULL){ fprintf(stderr,"Get of record returns error "); perror(""); } if(en.status!=UNDEFINED && en.status!=IDLE && ren && ren->status!=REMOVED && ren->status!=COMPLETED){ if ((ret=job_registry_update_select(rha, &en, 
JOB_REGISTRY_UPDATE_UDATE | JOB_REGISTRY_UPDATE_STATUS | JOB_REGISTRY_UPDATE_UPDATER_INFO | JOB_REGISTRY_UPDATE_EXITCODE | JOB_REGISTRY_UPDATE_EXITREASON )) < 0){ if(ret != JOB_REGISTRY_NOT_FOUND){ fprintf(stderr,"Update of record returns %d: ",ret); perror(""); } } else { do_log(debuglogfile, debug, 2, "%s: f registry update in FinalStateQuery for: jobid=%s creamjobid=%s status=%d\n",argv0,en.batch_id,en.user_prefix,en.status); if (en.status == REMOVED || en.status == COMPLETED){ job_registry_unlink_proxy(rha, &en); } if (remupd_conf != NULL){ if ((ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL))<=0){ do_log(debuglogfile, debug, 2, "%s: Error creating endpoint in FinalStateQuery\n",argv0); } } } } free(string_now); free(line); freetoken(&token,maxtok_t); free(ren); } pclose(fp); } free(command_string); return 0; }
int IntStateQuery() { FILE *fp; char *line=NULL; char **token; char **token_l; char **token_e; int maxtok_t=0; int maxtok_l=0; int maxtok_e=0; job_registry_entry en; int ret; time_t tmstampepoch; char *cp=NULL; char *batch_str=NULL; char *command_string=NULL; job_registry_entry *ren=NULL; int isresumed=FALSE; int first=TRUE; time_t now; char *string_now=NULL; command_string=make_message("%s/scontrol -a show jobid",slurm_binpath); fp = popen(command_string,"r"); en.status=UNDEFINED; JOB_REGISTRY_ASSIGN_ENTRY(en.wn_addr,"\0"); JOB_REGISTRY_ASSIGN_ENTRY(en.exitreason,"\0"); JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,"\0"); en.exitcode=-1; bupdater_free_active_jobs(&bact); if(fp!=NULL){ while(!feof(fp) && (line=get_line(fp))){ if(line && strlen(line)==0){ free(line); continue; } if ((cp = strrchr (line, '\n')) != NULL){ *cp = '\0'; } do_log(debuglogfile, debug, 3, "%s: line in IntStateQuery is:%s\n",argv0,line); now=time(0); string_now=make_message("%d",now); maxtok_t = strtoken(line, ' ', &token); if(line && strstr(line,"JobId=")){ isresumed=FALSE; if(!first && en.status!=UNDEFINED && ren && ren->status!=REMOVED && ren->status!=COMPLETED){ if ((ret=job_registry_update_recn_select(rha, &en, ren->recnum, JOB_REGISTRY_UPDATE_WN_ADDR| JOB_REGISTRY_UPDATE_STATUS| JOB_REGISTRY_UPDATE_UDATE| JOB_REGISTRY_UPDATE_UPDATER_INFO| JOB_REGISTRY_UPDATE_EXITCODE| JOB_REGISTRY_UPDATE_EXITREASON)) < 0){ if(ret != JOB_REGISTRY_NOT_FOUND){ fprintf(stderr,"Update of record returns %d: ",ret); perror(""); } } else { if(ret==JOB_REGISTRY_SUCCESS){ if (en.status == REMOVED || en.status == COMPLETED) { do_log(debuglogfile, debug, 2, "%s: registry update in IntStateQuery for: jobid=%s creamjobid=%s wn=%s status=%d exitcode=%d\n",argv0,en.batch_id,en.user_prefix,en.wn_addr,en.status,en.exitcode); job_registry_unlink_proxy(rha, &en); }else{ do_log(debuglogfile, debug, 2, "%s: registry update in IntStateQuery for: jobid=%s creamjobid=%s wn=%s 
status=%d\n",argv0,en.batch_id,en.user_prefix,en.wn_addr,en.status); } if (remupd_conf != NULL){ if ((ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL))<=0){ do_log(debuglogfile, debug, 2, "%s: Error creating endpoint in IntStateQuery\n",argv0); } } } } en.status = UNDEFINED; JOB_REGISTRY_ASSIGN_ENTRY(en.exitreason,"\0"); JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,"\0"); JOB_REGISTRY_ASSIGN_ENTRY(en.wn_addr,"\0"); en.exitcode=-1; } en.status = UNDEFINED; maxtok_l = strtoken(token[0], '=', &token_l); batch_str=strdup(token_l[1]); JOB_REGISTRY_ASSIGN_ENTRY(en.batch_id,batch_str); JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now); en.exitcode=-1; bupdater_push_active_job(&bact, en.batch_id); do_log(debuglogfile, debug, 4, "%s: bupdater_push_active_job done for %s\n",argv0,en.batch_id); free(batch_str); freetoken(&token_l,maxtok_l); if(!first) free(ren); if ((ren=job_registry_get(rha, en.batch_id)) == NULL){ fprintf(stderr,"Get of record returns error "); perror(""); } if(ren){ if(strlen(ren->updater_info)>0){ en.udate=ren->udate; }else{ en.udate=time(0); } } first=FALSE; }else if(line && strstr(line," JobState=")){ if(token[0] && strstr(line,"JobState=")){ maxtok_l = strtoken(token[0], '=', &token_l); if(token_l[1] && strstr(token_l[1],"PENDING")){ en.status=IDLE; en.exitcode=-1; JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now); }else if(token_l[1] && strstr(token_l[1],"RUNNING")){ en.status=RUNNING; en.exitcode=-1; JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now); }else if(token_l[1] && strstr(token_l[1],"COMPLETED")){ en.status=COMPLETED; en.exitcode=0; JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now); }else if(token_l[1] && strstr(token_l[1],"CANCELLED")){ en.status=REMOVED; en.exitcode=-999; JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now); }else if(token_l[1] && strstr(token_l[1],"FAILED")){ en.status=COMPLETED; en.exitcode=0; JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now); }else if(token_l[1] && 
strstr(token_l[1],"SUSPENDED")){ en.status=HELD; en.exitcode=-1; JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now); }else if(token_l[1] && strstr(token_l[1],"COMPLETING")){ bupdater_remove_active_job(&bact, en.batch_id); } freetoken(&token_l,maxtok_l); } }else if(line && strstr(line," BatchHost=")){ if(token[0] && strstr(line,"BatchHost=")){ maxtok_l = strtoken(token[0], '=', &token_l); if(en.status!=IDLE){ JOB_REGISTRY_ASSIGN_ENTRY(en.wn_addr,token_l[1]); } freetoken(&token_l,maxtok_l); } }else if(line && strstr(line," ExitCode=")){ if(token[3] && strstr(line,"ExitCode=")){ maxtok_l = strtoken(token[3], '=', &token_l); maxtok_e = strtoken(token_l[1], ':', &token_e); if(en.status==COMPLETED){ en.exitcode=atoi(token_e[0]); } freetoken(&token_l,maxtok_l); freetoken(&token_e,maxtok_e); } }else if(line && strstr(line," SubmitTime=")){ if(en.status==IDLE){ if(token[0] && strstr(line,"SubmitTime=")){ maxtok_l = strtoken(token[0], '=', &token_l); tmstampepoch=str2epoch(token_l[1],"N"); en.udate=tmstampepoch; freetoken(&token_l,maxtok_l); } } }else if(line && strstr(line," StartTime=")){ if(en.status==RUNNING){ if(token[0] && strstr(line,"StartTime=")){ maxtok_l = strtoken(token[0], '=', &token_l); tmstampepoch=str2epoch(token_l[1],"N"); en.udate=tmstampepoch; freetoken(&token_l,maxtok_l); } } if(en.status==COMPLETED || en.status==REMOVED){ if(token[1] && strstr(line,"EndTime=")){ maxtok_l = strtoken(token[1], '=', &token_l); tmstampepoch=str2epoch(token_l[1],"N"); en.udate=tmstampepoch; freetoken(&token_l,maxtok_l); } } }else if(line && strstr(line," SuspendTime=")){ if(en.status==HELD){ if(token[1] && strstr(line,"SuspendTime=")){ maxtok_l = strtoken(token[1], '=', &token_l); tmstampepoch=str2epoch(token_l[1],"N"); en.udate=tmstampepoch; freetoken(&token_l,maxtok_l); } } } free(line); free(string_now); freetoken(&token,maxtok_t); } pclose(fp); } if(en.status!=UNDEFINED && ren && ren->status!=REMOVED && ren->status!=COMPLETED){ if 
((ret=job_registry_update_recn_select(rha, &en, ren->recnum, JOB_REGISTRY_UPDATE_WN_ADDR| JOB_REGISTRY_UPDATE_STATUS| JOB_REGISTRY_UPDATE_UDATE| JOB_REGISTRY_UPDATE_UPDATER_INFO| JOB_REGISTRY_UPDATE_EXITCODE| JOB_REGISTRY_UPDATE_EXITREASON)) < 0){ if(ret != JOB_REGISTRY_NOT_FOUND){ fprintf(stderr,"Update of record returns %d: ",ret); perror(""); } } else { if(ret==JOB_REGISTRY_SUCCESS){ if (en.status == REMOVED || en.status == COMPLETED) { do_log(debuglogfile, debug, 2, "%s: registry update in IntStateQuery for: jobid=%s creamjobid=%s wn=%s status=%d exitcode=%d\n",argv0,en.batch_id,en.user_prefix,en.wn_addr,en.status,en.exitcode); job_registry_unlink_proxy(rha, &en); }else{ do_log(debuglogfile, debug, 2, "%s: registry update in IntStateQuery for: jobid=%s creamjobid=%s wn=%s status=%d\n",argv0,en.batch_id,en.user_prefix,en.wn_addr,en.status); } if (remupd_conf != NULL){ if ((ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL))<=0){ do_log(debuglogfile, debug, 2, "%s: Error creating endpoint in IntStateQuery\n",argv0); } } } } } free(ren); free(command_string); return 0; }