int FinalStateQuery(char *input_string, int logs_to_read) { /* tracejob -m -l -a <jobid> In line: 04/23/2008 11:50:43 S Exit_status=0 resources_used.cput=00:00:01 resources_used.mem=11372kb resources_used.vmem=52804kb resources_used.walltime=00:10:15 there are: udate for the final state (04/23/2008 11:50:43): exitcode Exit_status= */ /* Filled entries: batch_id (a list of jobid is given, one for each tracejob call) status (always a final state 3 or 4) exitcode udate Filled by submit script: blah_id Unfilled entries: exitreason */ /* [root@cream-12 server_logs]# tracejob -m -l -a 13 Job: 13.cream-12.pd.infn.it 04/23/2008 11:40:27 S enqueuing into cream_1, state 1 hop 1 04/23/2008 11:40:27 S Job Queued at request of [email protected], owner = [email protected], job name = cream_365713239, queue = cream_1 04/23/2008 11:40:28 S Job Modified at request of [email protected] 04/23/2008 11:40:28 S Job Run at request of [email protected] 04/23/2008 11:50:43 S Exit_status=0 resources_used.cput=00:00:01 resources_used.mem=11372kb resources_used.vmem=52804kb resources_used.walltime=00:10:15 04/23/2008 11:50:44 S dequeuing from cream_1, state COMPLETE */ FILE *fp; char *line=NULL; char **token; char **jobid; int maxtok_t=0,maxtok_j=0,k; job_registry_entry en; int ret; char *timestamp; time_t tmstampepoch; char *exit_str=NULL; int failed_count=0; int time_to_add=0; time_t now; char *cp=NULL; char *command_string=NULL; char *pbs_spool=NULL; char *string_now=NULL; int tracejob_line_counter=0; do_log(debuglogfile, debug, 3, "%s: input_string in FinalStateQuery is:%s\n",argv0,input_string); maxtok_j = strtoken(input_string, ':', &jobid); for(k=0;k<maxtok_j;k++){ if(jobid[k] && strlen(jobid[k])==0) continue; pbs_spool=(pbs_spoolpath?make_message("-p %s ",pbs_spoolpath):make_message("")); command_string=make_message("%s%s/tracejob %s-m -l -a -n %d %s",batch_command,pbs_binpath,pbs_spool,logs_to_read,jobid[k]); free(pbs_spool); fp = popen(command_string,"r"); do_log(debuglogfile, debug, 3, "%s: command_string in FinalStateQuery is:%s\n",argv0,command_string); /* en.status is set =0 (UNDEFINED) here and it is tested if it is !=0 before the registry update: the update is done only if en.status is !=0*/ en.status=UNDEFINED; JOB_REGISTRY_ASSIGN_ENTRY(en.batch_id,jobid[k]); tracejob_line_counter=0; if(fp!=NULL){ while(!feof(fp) && (line=get_line(fp))){ if(line && strlen(line)==0){ free(line); continue; } if(tracejob_line_counter>tracejob_max_output){ do_log(debuglogfile, debug, 2, "%s: Tracejob output limit of %d lines reached. Skipping command.\n",argv0,tracejob_max_output); free(line); break; } if ((cp = strrchr (line, '\n')) != NULL){ *cp = '\0'; tracejob_line_counter++; } do_log(debuglogfile, debug, 3, "%s: line in FinalStateQuery is:%s\n",argv0,line); now=time(0); string_now=make_message("%d",now); if(line && (strstr(line,"Job deleted") || (strstr(line,"dequeuing from") && strstr(line,"state RUNNING")))){ maxtok_t = strtoken(line, ' ', &token); timestamp=make_message("%s %s",token[0],token[1]); tmstampepoch=str2epoch(timestamp,"A"); free(timestamp); freetoken(&token,maxtok_t); en.udate=tmstampepoch; en.status=REMOVED; en.exitcode=-999; JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now); JOB_REGISTRY_ASSIGN_ENTRY(en.exitreason,"\0"); }else if(line && strstr(line," Exit_status=") && en.status != REMOVED){ maxtok_t = strtoken(line, ' ', &token); timestamp=make_message("%s %s",token[0],token[1]); tmstampepoch=str2epoch(timestamp,"A"); exit_str=strdup(token[3]); if(exit_str == NULL){ sysfatal("strdup failed for exit_str in FinalStateQuery: %r"); } free(timestamp); freetoken(&token,maxtok_t); if(strstr(exit_str,"Exit_status=")){ maxtok_t = strtoken(exit_str, '=', &token); if(maxtok_t == 2){ en.exitcode=atoi(token[1]); freetoken(&token,maxtok_t); }else{ en.exitcode=-1; } }else{ en.exitcode=-1; } free(exit_str); en.udate=tmstampepoch; en.status=COMPLETED; JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now); JOB_REGISTRY_ASSIGN_ENTRY(en.exitreason,"\0"); } free(string_now); free(line); } pclose(fp); } if(en.status !=UNDEFINED && en.status!=IDLE){ if ((ret=job_registry_update_select(rha, &en, JOB_REGISTRY_UPDATE_UDATE | JOB_REGISTRY_UPDATE_STATUS | JOB_REGISTRY_UPDATE_UPDATER_INFO | JOB_REGISTRY_UPDATE_EXITCODE | JOB_REGISTRY_UPDATE_EXITREASON )) < 0){ if(ret != JOB_REGISTRY_NOT_FOUND){ fprintf(stderr,"Update of record returns %d: ",ret); perror(""); } } else { do_log(debuglogfile, debug, 2, "%s: registry update in FinalStateQuery for: jobid=%s exitcode=%d status=%d\n",argv0,en.batch_id,en.exitcode,en.status); if (en.status == REMOVED || en.status == COMPLETED){ job_registry_unlink_proxy(rha, &en); } if (remupd_conf != NULL){ if (ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL)<=0){ do_log(debuglogfile, debug, 2, "%s: Error creating endpoint in FinalStateQuery\n",argv0); } } } }else{ failed_count++; } free(command_string); } now=time(0); if(failed_count>10){ failed_count=10; } time_to_add=pow(failed_count,1.5); next_finalstatequery=now+time_to_add; do_log(debuglogfile, debug, 3, "%s: next FinalStatequery will be in %d seconds\n",argv0,time_to_add); freetoken(&jobid,maxtok_j); return failed_count; }
int FinalStateQuery(time_t start_date, int logs_to_read) { FILE *fp; char *line=NULL; char **token; char **token_l; int maxtok_t=0; int maxtok_l=0; job_registry_entry en; int ret; time_t tmstampepoch; char *cp=NULL; char *command_string=NULL; time_t now; char *string_now=NULL; job_registry_entry *ren=NULL; command_string=make_message("%s/sacct -nap -o JobID,JobName,State,ExitCode,submit,start,end 2>/dev/null",slurm_binpath); fp = popen(command_string,"r"); do_log(debuglogfile, debug, 3, "%s: command_string in FinalStateQuery is:%s\n",argv0,command_string); en.status=UNDEFINED; JOB_REGISTRY_ASSIGN_ENTRY(en.exitreason,"\0"); if(fp!=NULL){ while(!feof(fp) && (line=get_line(fp))){ if(line && strlen(line)==0){ free(line); continue; } if ((cp = strrchr (line, '\n')) != NULL){ *cp = '\0'; } en.status=UNDEFINED; do_log(debuglogfile, debug, 3, "%s: line in FinalStateQuery is:%s\n",argv0,line); now=time(0); string_now=make_message("%d",now); maxtok_t = strtoken(line, '|', &token); JOB_REGISTRY_ASSIGN_ENTRY(en.batch_id,token[0]); JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now); if(token[2] && strstr(token[2],"COMPLETED")){ en.status=COMPLETED; JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now); }else if(token[2] && strstr(token[2],"CANCELLED")){ en.status=REMOVED; en.exitcode=-999; JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now); }else if(token[2] && strstr(token[2],"FAILED")){ en.status=COMPLETED; JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now); } if(!(token[6] && strstr(token[6],"Unknown"))){ tmstampepoch=str2epoch(token[6],"N"); en.udate=tmstampepoch; } if(en.status==COMPLETED){ maxtok_l = strtoken(token[3], ':', &token_l); en.exitcode=atoi(token_l[0]); freetoken(&token_l,maxtok_l); } if ((ren=job_registry_get(rha, en.batch_id)) == NULL){ fprintf(stderr,"Get of record returns error "); perror(""); } if(en.status!=UNDEFINED && en.status!=IDLE && ren && ren->status!=REMOVED && ren->status!=COMPLETED){ if ((ret=job_registry_update_select(rha, &en, JOB_REGISTRY_UPDATE_UDATE | JOB_REGISTRY_UPDATE_STATUS | JOB_REGISTRY_UPDATE_UPDATER_INFO | JOB_REGISTRY_UPDATE_EXITCODE | JOB_REGISTRY_UPDATE_EXITREASON )) < 0){ if(ret != JOB_REGISTRY_NOT_FOUND){ fprintf(stderr,"Update of record returns %d: ",ret); perror(""); } } else { do_log(debuglogfile, debug, 2, "%s: f registry update in FinalStateQuery for: jobid=%s creamjobid=%s status=%d\n",argv0,en.batch_id,en.user_prefix,en.status); if (en.status == REMOVED || en.status == COMPLETED){ job_registry_unlink_proxy(rha, &en); } if (remupd_conf != NULL){ if ((ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL))<=0){ do_log(debuglogfile, debug, 2, "%s: Error creating endpoint in FinalStateQuery\n",argv0); } } } } free(string_now); free(line); freetoken(&token,maxtok_t); free(ren); } pclose(fp); } free(command_string); return 0; }