// // ReadUserLogHeader methods // int ReadUserLogHeader::Read( ReadUserLog &reader ) { // Now, read the event itself ULogEvent *event = NULL; ULogEventOutcome outcome = reader.readEvent( event ); if ( ULOG_OK != outcome ) { ::dprintf( D_FULLDEBUG, "ReadUserLogHeader::Read(): readEvent() failed\n" ); delete event; return outcome; } if ( ULOG_GENERIC != event->eventNumber ) { ::dprintf( D_FULLDEBUG, "ReadUserLogHeader::Read(): event #%d should be %d\n", event->eventNumber, ULOG_GENERIC ); delete event; return ULOG_NO_EVENT; } int rval = ExtractEvent( event ); delete event; if ( rval != ULOG_OK) { ::dprintf( D_FULLDEBUG, "ReadUserLogHeader::Read(): failed to extract event\n" ); } return rval; }
void read_log(const char *filename, int select_cluster, int select_proc) { ReadUserLog ulog; ULogEventOutcome result; ULogEvent *event=NULL; char hash[40]; HashTable<HashKey, ExecuteEvent *> ExecRecs(1024, hashFunction); HashTable<HashKey, CheckpointedEvent *> CkptRecs(1024, hashFunction); if (ulog.initialize(filename,0,false,true)==false) { fprintf(stderr, "Error: unable to read log file.\n"); exit(0); } while ((result = ulog.readEvent(event)) != ULOG_NO_EVENT) { if ((result == ULOG_OK) && (select_cluster == -1 || select_cluster == event->cluster) && (select_proc == -1 || select_proc == event->proc)) { switch (event->eventNumber) { case ULOG_SUBMIT: delete event; break; case ULOG_EXECUTE: { sprintf(hash, "%d.%d", event->cluster, event->proc); HashKey key(hash); // check if we already have an execute event for this job ExecuteEvent *execEvent; if (ExecRecs.lookup(key, execEvent) >= 0) { // This means we found two execute events for the // job not separated by an evict or terminate // event. Which one should we throw out? If the // executeHosts are the same, then we throw out // the later event, since the 6.1.15 and 6.1.16 // shadows logged execute events before every // other event, so keeping the first execute event // gives correct results for those shadows. // Otherwise, we throw out the previous event. if (!strcmp(((ExecuteEvent *)event)->getExecuteHost(), execEvent->getExecuteHost())) { if (debug_mode) { fprintf(stderr, "warning: discarding execute event " "(job %s)\n found before evict or " "termination event for previous execute " "event.\n", hash); } delete event; break; } if (ExecRecs.remove(key) < 0) { if (debug_mode) { fprintf(stderr, "internal error: hashtable remove " "failed for exec event %s!\n", hash); } delete event; break; } if (debug_mode) { fprintf(stderr, "warning: discarding execute event " "(job %s)\n with no corresponding evict or " "termination event.\n", hash); } delete execEvent; } ExecRecs.insert(key, (ExecuteEvent *)event); break; } case ULOG_CHECKPOINTED: { sprintf(hash, "%d.%d", event->cluster, event->proc); HashKey key(hash); // remove any previous ckpt events for this job CheckpointedEvent *ckptEvent; if (CkptRecs.lookup(key, ckptEvent) >= 0) { if (CkptRecs.remove(key) < 0) { if (debug_mode) { fprintf(stderr, "internal error: hashtable remove " "failed for ckpt event %s!\n", hash); } delete event; break; } delete ckptEvent; } CkptRecs.insert(HashKey(hash), (CheckpointedEvent *)event); break; } case ULOG_JOB_EVICTED: { ExecuteEvent *execEvent; JobEvictedEvent *evictEvent = (JobEvictedEvent *)event; sprintf(hash, "%d.%d", event->cluster, event->proc); HashKey key(hash); if (ExecRecs.lookup(key, execEvent) < 0) { if (debug_mode) { fprintf(stderr, "internal error: can't find execute event for " "evicted job %s!\n", hash); } delete event; break; } if (ExecRecs.remove(key) < 0) { if (debug_mode) { fprintf(stderr, "internal error: hashtable remove failed for " "%s!\n", hash); } delete event; break; } time_t start_time, end_time, ckpt_time=0; start_time = mktime(&execEvent->eventTime); end_time = mktime(&event->eventTime); int cpu_usage = 0; if (evictEvent->checkpointed) { ckpt_time = end_time; cpu_usage = evictEvent->run_remote_rusage.ru_utime.tv_sec + evictEvent->run_remote_rusage.ru_stime.tv_sec; } else { CheckpointedEvent *ckptEvent; if (CkptRecs.lookup(key, ckptEvent) >= 0) { ckpt_time = mktime(&ckptEvent->eventTime); cpu_usage = ckptEvent->run_remote_rusage.ru_utime.tv_sec + ckptEvent->run_remote_rusage.ru_stime.tv_sec; } if (ckpt_time < start_time) { ckpt_time = start_time; cpu_usage = 0; } } new_record(event->cluster, event->proc, (int)start_time, (int)end_time, (int)ckpt_time-start_time, cpu_usage, execEvent->getExecuteHost()); delete execEvent; delete event; break; } case ULOG_JOB_TERMINATED: { ExecuteEvent *execEvent; JobTerminatedEvent *terminateEvent = (JobTerminatedEvent *)event; sprintf(hash, "%d.%d", event->cluster, event->proc); HashKey key(hash); if (ExecRecs.lookup(key, execEvent) < 0) { if (debug_mode) { fprintf(stderr, "internal error: can't find execute event for " "terminated job %s!\n", hash); } delete event; break; } if (ExecRecs.remove(key) < 0) { if (debug_mode) { fprintf(stderr, "internal error: hashtable remove failed " "for %s!\n", hash); } delete event; break; } time_t start_time, end_time; start_time = mktime(&execEvent->eventTime); end_time = mktime(&event->eventTime); if (!evict_only) { new_record(event->cluster, event->proc, (int)start_time, (int)end_time, (int)end_time-start_time, terminateEvent-> run_remote_rusage.ru_utime.tv_sec + terminateEvent-> run_remote_rusage.ru_stime.tv_sec, execEvent->getExecuteHost()); } delete execEvent; delete event; break; } case ULOG_IMAGE_SIZE: delete event; break; case ULOG_EXECUTABLE_ERROR: case ULOG_SHADOW_EXCEPTION: { ExecuteEvent *execEvent; sprintf(hash, "%d.%d", event->cluster, event->proc); HashKey key(hash); if (ExecRecs.lookup(key, execEvent) < 0) { if (debug_mode) { fprintf(stderr, "internal error: can't find execute event for " "job %s!\n", hash); } delete event; break; } if (ExecRecs.remove(key) < 0) { if (debug_mode) { fprintf(stderr, "internal error: hashtable remove failed for " "%s!\n", hash); } delete event; break; } time_t start_time, end_time, ckpt_time=0; start_time = mktime(&execEvent->eventTime); end_time = mktime(&event->eventTime); int cpu_usage = 0; CheckpointedEvent *ckptEvent; if (CkptRecs.lookup(key, ckptEvent) >= 0) { ckpt_time = mktime(&ckptEvent->eventTime); cpu_usage = ckptEvent->run_remote_rusage.ru_utime.tv_sec + ckptEvent->run_remote_rusage.ru_stime.tv_sec; } if (ckpt_time < start_time) { ckpt_time = start_time; cpu_usage = 0; } if (!evict_only) { new_record(event->cluster, event->proc, (int)start_time, (int)end_time, (int)ckpt_time-start_time, cpu_usage, execEvent->getExecuteHost()); } delete execEvent; delete event; break; } default: // ignore the rest of the eveats break; } } else { if (event!=NULL) delete event; if ((result==ULOG_RD_ERROR) || (result == ULOG_UNK_ERROR)) { fprintf(stderr, "Error: unable to read log file.\n"); exit(0); } } } // delete any remaining items in the tables ExecuteEvent *execEvent; ExecRecs.startIterations(); while (ExecRecs.iterate(execEvent) == 1) { delete execEvent; } CheckpointedEvent *ckptEvent; CkptRecs.startIterations(); while (CkptRecs.iterate(ckptEvent) == 1) { delete ckptEvent; } }
int main( int argc, char *argv[] ) { int i; char *log_file_name = 0; char *job_name = 0; time_t waittime=0, stoptime=0; int minjobs = 0; int print_status = false; int echo_events = false; int debug_print_rescue = false; myDistro->Init( argc, argv ); config(); for( i=1; i<argc; i++ ) { if(!strcmp(argv[i],"-help")) { usage(argv[0]); EXIT_SUCCESS; } else if(!strcmp(argv[i],"-version")) { version(); EXIT_FAILURE; } else if(!strcmp(argv[i],"-debug")) { // dprintf to console dprintf_set_tool_debug("TOOL", 0); print_status = false; } else if(!strcmp(argv[i],"-status")) { if (dprintf_to_term_check()) { fprintf(stderr,"-status is implied by -debug\n"); } else { print_status = true; } } else if(!strcmp(argv[i],"-echo")) { echo_events = true; } else if(!strcmp(argv[i],"-wait")) { i++; if(i>=argc) { fprintf(stderr,"-wait requires an argument\n"); usage(argv[0]); EXIT_FAILURE; } waittime = atoi(argv[i]); stoptime = time(0) + waittime; dprintf(D_FULLDEBUG,"Will wait until %s\n",ctime(&stoptime)); } else if( !strcmp( argv[i], "-num" ) ) { i++; if( i >= argc ) { fprintf( stderr, "-num requires an argument\n" ); usage( argv[0] ); EXIT_FAILURE; } minjobs = atoi( argv[i] ); if( minjobs < 1 ) { fprintf( stderr, "-num must be greater than zero\n" ); usage( argv[0] ); EXIT_FAILURE; } dprintf( D_FULLDEBUG, "Will wait until %d jobs end\n", minjobs ); } else if(argv[i][0]!='-') { if(!log_file_name) { log_file_name = argv[i]; } else if(!job_name) { job_name = argv[i]; } else { fprintf(stderr,"Extra argument: %s\n\n",argv[i]); usage(argv[0]); EXIT_FAILURE; } } else { usage(argv[0]); EXIT_FAILURE; } } if( !log_file_name ) { usage(argv[0]); EXIT_FAILURE; } int cluster=ANY_NUMBER; int process=ANY_NUMBER; int subproc=ANY_NUMBER; if( job_name ) { int fields = sscanf(job_name,"%d.%d.%d",&cluster,&process,&subproc); if(fields>=1 && fields<=3) { /* number is fine */ } else { fprintf(stderr,"Couldn't understand job number: %s\n",job_name); EXIT_FAILURE; } } dprintf(D_FULLDEBUG,"Reading log file %s\n",log_file_name); int submitted, aborted, completed, flagged; FILE *sec_fp = NULL; int pos, nPos; rescue : submitted=0; aborted=0; completed=0; flagged = 0; ReadUserLog log ; HashTable<MyString,MyString> table(127,MyStringHash); if(log.initialize(log_file_name,false,false,true)) { sec_fp = safe_fopen_wrapper_follow(log_file_name, "r", 0644); fseek (sec_fp, 0, SEEK_END); pos = ftell(sec_fp); nPos = pos; if (debug_print_rescue) printf("begin:%d ", nPos); while(1) { fseek(sec_fp, 0, SEEK_END); int tmp_pos = ftell(sec_fp); ULogEventOutcome outcome; ULogEvent *event; outcome = log.readEvent(event); if(outcome==ULOG_OK) { flagged = 0; pos = nPos = tmp_pos; if (debug_print_rescue) printf("top:%d ", nPos); char key[1024]; sprintf(key,"%d.%d.%d",event->cluster,event->proc,event->subproc); MyString str(key); if( jobnum_matches( event, cluster, process, subproc ) ) { if (echo_events) { event->putEvent(stdout); printf("...\n"); } if(event->eventNumber==ULOG_SUBMIT) { dprintf(D_FULLDEBUG,"%s submitted\n",key); if (print_status) printf("%s submitted\n", key); table.insert(str,str); submitted++; } else if(event->eventNumber==ULOG_JOB_TERMINATED) { dprintf(D_FULLDEBUG,"%s completed\n",key); if (print_status) printf("%s completed\n", key); table.remove(str); completed++; } else if(event->eventNumber==ULOG_JOB_ABORTED) { dprintf(D_FULLDEBUG,"%s aborted\n",key); if (print_status) printf("%s aborted\n", key); table.remove(str); aborted++; } else if (event->eventNumber==ULOG_EXECUTE) { if (print_status) { printf("%s executing on host %s\n", key, ((ExecuteEvent*)event)->getExecuteHost()); } } else { /* nothing to do */ } } if (event != NULL) delete event; if( minjobs && (completed + aborted >= minjobs ) ) { printf( "Specifed number of jobs (%d) done.\n", minjobs ); EXIT_SUCCESS; } } else { // did something change in the file since our last visit? fseek(sec_fp, 0, SEEK_END); nPos = ftell(sec_fp); if (flagged == 1) { fclose(sec_fp); dprintf(D_FULLDEBUG, "INFO: File %s changed but userLog reader could not read another event. We are reinitializing userLog reader. \n", log_file_name); if (debug_print_rescue) printf("rescue:%d ", nPos); if (print_status) printf("<reinitializing userLog reader>\n"); // reinitialize the user log, we ended up here a second time goto rescue; } if ( nPos != pos ){ if (debug_print_rescue) printf("lagging:%d!=%d ", nPos, pos); pos = nPos; // we do not want to retry every time we are in a waiting sleep cycle, therefore flag a change flagged = 1; } dprintf(D_FULLDEBUG,"%d submitted %d completed %d aborted %d remaining\n",submitted,completed,aborted,submitted-completed-aborted); if(table.getNumElements()==0) { if(submitted>0) { if( !minjobs ) { printf("All jobs done.\n"); EXIT_SUCCESS; } } else { if(cluster==ANY_NUMBER) { fprintf(stderr,"This log does not mention any jobs!\n"); } else { fprintf(stderr,"This log does not mention that job!\n"); } EXIT_FAILURE; } } else if(stoptime && time(0)>stoptime) { printf("Time expired.\n"); EXIT_FAILURE; } else { time_t sleeptime; if(stoptime) { sleeptime = stoptime-time(0); } else { sleeptime = 5; } if(sleeptime>5) { sleeptime = 5; } else if(sleeptime<1) { sleeptime = 1; } log.synchronize(); dprintf(D_FULLDEBUG,"No more events, sleeping for %ld seconds\n", (long)sleeptime); sleep(sleeptime); } } } fclose(sec_fp); } else { fprintf(stderr,"Couldn't open %s: %s\n",log_file_name,strerror(errno)); } EXIT_FAILURE; return 1; /* meaningless, but it makes Windows happy */ }