//
// ReadUserLogHeader methods
//
int
ReadUserLogHeader::Read(
	ReadUserLog	&reader )
{

	// Now, read the event itself
	ULogEvent			*event = NULL;
	ULogEventOutcome	outcome = reader.readEvent( event );

	if ( ULOG_OK != outcome ) {
		::dprintf( D_FULLDEBUG,
				   "ReadUserLogHeader::Read(): readEvent() failed\n" );
		delete event;
		return outcome;
	}
	if ( ULOG_GENERIC != event->eventNumber ) {
		::dprintf( D_FULLDEBUG,
				   "ReadUserLogHeader::Read(): event #%d should be %d\n",
				   event->eventNumber, ULOG_GENERIC );
		delete event;
		return ULOG_NO_EVENT;
	}

	int rval = ExtractEvent( event );
	delete event;

	if ( rval != ULOG_OK) {
		::dprintf( D_FULLDEBUG,
				   "ReadUserLogHeader::Read(): failed to extract event\n" );
	}
	return rval;
}
Beispiel #2
0
void
read_log(const char *filename, int select_cluster, int select_proc)
{
	ReadUserLog ulog;
	ULogEventOutcome result;
	ULogEvent *event=NULL;
	char hash[40];
	HashTable<HashKey, ExecuteEvent *> ExecRecs(1024, hashFunction);
	HashTable<HashKey, CheckpointedEvent *> CkptRecs(1024, hashFunction);
	
	if (ulog.initialize(filename,0,false,true)==false) {
		fprintf(stderr,
			"Error: unable to read log file.\n");
			exit(0);			
			}		
	
	while ((result = ulog.readEvent(event)) != ULOG_NO_EVENT) {
		if ((result == ULOG_OK) &&
			(select_cluster == -1 || select_cluster == event->cluster) &&
			(select_proc == -1 || select_proc == event->proc)) {
			switch (event->eventNumber) {
			case ULOG_SUBMIT:
				delete event;
				break;
			case ULOG_EXECUTE: {
				sprintf(hash, "%d.%d", event->cluster, event->proc);
				HashKey key(hash);
				// check if we already have an execute event for this job
				ExecuteEvent *execEvent;
				if (ExecRecs.lookup(key, execEvent) >= 0) {
					// This means we found two execute events for the
					// job not separated by an evict or terminate
					// event.  Which one should we throw out?  If the
					// executeHosts are the same, then we throw out
					// the later event, since the 6.1.15 and 6.1.16
					// shadows logged execute events before every
					// other event, so keeping the first execute event
					// gives correct results for those shadows.
					// Otherwise, we throw out the previous event.
					if (!strcmp(((ExecuteEvent *)event)->getExecuteHost(),
								execEvent->getExecuteHost())) {
						if (debug_mode) {
							fprintf(stderr,
									"warning: discarding execute event "
									"(job %s)\n  found before evict or "
									"termination event for previous execute "
									"event.\n", hash);
						}
						delete event;
						break;
					}
					if (ExecRecs.remove(key) < 0) {
						if (debug_mode) {
							fprintf(stderr, "internal error: hashtable remove "
									"failed for exec event %s!\n", hash);
						}
						delete event;
						break;
					}
					if (debug_mode) {
						fprintf(stderr, "warning: discarding execute event "
								"(job %s)\n  with no corresponding evict or "
								"termination event.\n", hash);
					}
					delete execEvent;
				}
				ExecRecs.insert(key, (ExecuteEvent *)event);
				break;
			}
			case ULOG_CHECKPOINTED: {
				sprintf(hash, "%d.%d", event->cluster, event->proc);
				HashKey key(hash);
				// remove any previous ckpt events for this job
				CheckpointedEvent *ckptEvent;
				if (CkptRecs.lookup(key, ckptEvent) >= 0) {
					if (CkptRecs.remove(key) < 0) {
						if (debug_mode) {
							fprintf(stderr, "internal error: hashtable remove "
									"failed for ckpt event %s!\n", hash);
						}
						delete event;
						break;
					}
					delete ckptEvent;
				}
				CkptRecs.insert(HashKey(hash), (CheckpointedEvent *)event);
				break;
			}
			case ULOG_JOB_EVICTED: {
				ExecuteEvent *execEvent;
				JobEvictedEvent *evictEvent = (JobEvictedEvent *)event;
				sprintf(hash, "%d.%d", event->cluster, event->proc);
				HashKey key(hash);
				if (ExecRecs.lookup(key, execEvent) < 0) {
					if (debug_mode) {
						fprintf(stderr,
								"internal error: can't find execute event for "
								"evicted job %s!\n", hash);
					}
					delete event;
					break;
				}
				if (ExecRecs.remove(key) < 0) {
					if (debug_mode) {
						fprintf(stderr,
								"internal error: hashtable remove failed for "
								"%s!\n", hash);
					}
					delete event;
					break;
				}
				time_t start_time, end_time, ckpt_time=0;
				start_time = mktime(&execEvent->eventTime);
				end_time = mktime(&event->eventTime);
				int cpu_usage = 0;
				if (evictEvent->checkpointed) {
					ckpt_time = end_time;
					cpu_usage = evictEvent->run_remote_rusage.ru_utime.tv_sec +
						evictEvent->run_remote_rusage.ru_stime.tv_sec;
				} else {
					CheckpointedEvent *ckptEvent;
					if (CkptRecs.lookup(key, ckptEvent) >= 0) {
						ckpt_time = mktime(&ckptEvent->eventTime);
						cpu_usage =
							ckptEvent->run_remote_rusage.ru_utime.tv_sec +
							ckptEvent->run_remote_rusage.ru_stime.tv_sec;
					}
					if (ckpt_time < start_time) {
						ckpt_time = start_time;
						cpu_usage = 0;
					}
				}
				new_record(event->cluster, event->proc, (int)start_time,
						   (int)end_time,
						   (int)ckpt_time-start_time, cpu_usage,
						   execEvent->getExecuteHost());
				delete execEvent;
				delete event;
				break;
			}
			case ULOG_JOB_TERMINATED: {
				ExecuteEvent *execEvent;
				JobTerminatedEvent *terminateEvent =
					(JobTerminatedEvent *)event;
				sprintf(hash, "%d.%d", event->cluster, event->proc);
				HashKey key(hash);
				if (ExecRecs.lookup(key, execEvent) < 0) {
					if (debug_mode) {
						fprintf(stderr,
								"internal error: can't find execute event for "
								"terminated job %s!\n", hash);
					}
					delete event;
					break;
				}
				if (ExecRecs.remove(key) < 0) {
					if (debug_mode) {
						fprintf(stderr,
								"internal error: hashtable remove failed "
								"for %s!\n", hash);
					}
					delete event;
					break;
				}
				time_t start_time, end_time;
				start_time = mktime(&execEvent->eventTime);
				end_time = mktime(&event->eventTime);
				if (!evict_only) {
					new_record(event->cluster, event->proc, (int)start_time,
							   (int)end_time,
							   (int)end_time-start_time,
							   terminateEvent->
							   run_remote_rusage.ru_utime.tv_sec +
							   terminateEvent->
							   run_remote_rusage.ru_stime.tv_sec,
							   execEvent->getExecuteHost());
				}
				delete execEvent;
				delete event;
				break;
			}
			case ULOG_IMAGE_SIZE:
				delete event;
				break;
			case ULOG_EXECUTABLE_ERROR:
			case ULOG_SHADOW_EXCEPTION: {
				ExecuteEvent *execEvent;
				sprintf(hash, "%d.%d", event->cluster, event->proc);
				HashKey key(hash);
				if (ExecRecs.lookup(key, execEvent) < 0) {
					if (debug_mode) {
						fprintf(stderr,
								"internal error: can't find execute event for "
								"job %s!\n", hash);
					}
					delete event;
					break;
				}
				if (ExecRecs.remove(key) < 0) {
					if (debug_mode) {
						fprintf(stderr,
								"internal error: hashtable remove failed for "
								"%s!\n", hash);
					}
					delete event;
					break;
				}
				time_t start_time, end_time, ckpt_time=0;
				start_time = mktime(&execEvent->eventTime);
				end_time = mktime(&event->eventTime);
				int cpu_usage = 0;
				CheckpointedEvent *ckptEvent;
				if (CkptRecs.lookup(key, ckptEvent) >= 0) {
					ckpt_time = mktime(&ckptEvent->eventTime);
					cpu_usage = ckptEvent->run_remote_rusage.ru_utime.tv_sec +
						ckptEvent->run_remote_rusage.ru_stime.tv_sec;
				}
				if (ckpt_time < start_time) {
					ckpt_time = start_time;
					cpu_usage = 0;
				}
				if (!evict_only) {
					new_record(event->cluster, event->proc, (int)start_time,
							   (int)end_time,
							   (int)ckpt_time-start_time, cpu_usage,
							   execEvent->getExecuteHost());
				}
				delete execEvent;
				delete event;
				break;
			}
			default:
				// ignore the rest of the eveats
				break;
			}
		} else {
			
			if (event!=NULL) delete event;
			if ((result==ULOG_RD_ERROR) || (result == ULOG_UNK_ERROR)) {
				fprintf(stderr,
						"Error: unable to read log file.\n");
				exit(0);
			
			}
		}
	}

	// delete any remaining items in the tables
	ExecuteEvent *execEvent;
	ExecRecs.startIterations();
	while (ExecRecs.iterate(execEvent) == 1) {
		delete execEvent;
	}
	CheckpointedEvent *ckptEvent;
	CkptRecs.startIterations();
	while (CkptRecs.iterate(ckptEvent) == 1) {
		delete ckptEvent;
	}
}
Beispiel #3
0
int main( int argc, char *argv[] )
{
	int i;
	char *log_file_name = 0;
	char *job_name = 0;
	time_t waittime=0, stoptime=0;
	int minjobs = 0;
	int print_status = false;
	int echo_events = false;
	int debug_print_rescue = false;

	myDistro->Init( argc, argv );
	config();

	for( i=1; i<argc; i++ ) {
		if(!strcmp(argv[i],"-help")) {
			usage(argv[0]);
			EXIT_SUCCESS;
		} else if(!strcmp(argv[i],"-version")) {
			version();
			EXIT_FAILURE;
		} else if(!strcmp(argv[i],"-debug")) {
			// dprintf to console
			dprintf_set_tool_debug("TOOL", 0);
			print_status = false;
		} else if(!strcmp(argv[i],"-status")) {
			if (dprintf_to_term_check()) {
				fprintf(stderr,"-status is implied by -debug\n");
			} else {
				print_status = true;
			}
		} else if(!strcmp(argv[i],"-echo")) {
			echo_events = true;
		} else if(!strcmp(argv[i],"-wait")) {
			i++;
			if(i>=argc) {
				fprintf(stderr,"-wait requires an argument\n");
				usage(argv[0]);
				EXIT_FAILURE;
			}
			waittime = atoi(argv[i]);
			stoptime = time(0) + waittime;
			dprintf(D_FULLDEBUG,"Will wait until %s\n",ctime(&stoptime));
		} else if( !strcmp( argv[i], "-num" ) ) {
			i++;
			if( i >= argc ) {
				fprintf( stderr, "-num requires an argument\n" );
				usage( argv[0] );
				EXIT_FAILURE;
			}
			minjobs = atoi( argv[i] );
			if( minjobs < 1 ) {
				fprintf( stderr, "-num must be greater than zero\n" );
				usage( argv[0] );
				EXIT_FAILURE;
			}
			dprintf( D_FULLDEBUG, "Will wait until %d jobs end\n", minjobs );
		} else if(argv[i][0]!='-') {
			if(!log_file_name) {
				log_file_name = argv[i];
			} else if(!job_name) {
				job_name = argv[i];
			} else {
				fprintf(stderr,"Extra argument: %s\n\n",argv[i]);
				usage(argv[0]);
				EXIT_FAILURE;
			}
		} else {
			usage(argv[0]);
			EXIT_FAILURE;
		}
	}

	if( !log_file_name ) {
		usage(argv[0]);
		EXIT_FAILURE;
	}

	int cluster=ANY_NUMBER;
	int process=ANY_NUMBER;
	int subproc=ANY_NUMBER;

	if( job_name ) {
		int fields = sscanf(job_name,"%d.%d.%d",&cluster,&process,&subproc);
		if(fields>=1 && fields<=3) {
			/* number is fine */
		} else {
			fprintf(stderr,"Couldn't understand job number: %s\n",job_name);
			EXIT_FAILURE;
		}			
	}
	
	dprintf(D_FULLDEBUG,"Reading log file %s\n",log_file_name);
	int submitted, aborted, completed, flagged;
	FILE *sec_fp = NULL;
	int pos, nPos;
rescue :
	submitted=0;
	aborted=0;
	completed=0;
	flagged = 0;
	ReadUserLog log ;
	HashTable<MyString,MyString> table(127,MyStringHash);
	
	if(log.initialize(log_file_name,false,false,true)) {
		sec_fp = safe_fopen_wrapper_follow(log_file_name, "r", 0644);
		fseek (sec_fp, 0, SEEK_END);
		pos = ftell(sec_fp); 
		nPos = pos;	if (debug_print_rescue) printf("begin:%d ", nPos);
		while(1) {
			fseek(sec_fp, 0, SEEK_END);
			int tmp_pos = ftell(sec_fp);
			
			ULogEventOutcome outcome;
			ULogEvent *event;
			outcome = log.readEvent(event);
			
			if(outcome==ULOG_OK) {
				flagged = 0;
				pos = nPos = tmp_pos;
				if (debug_print_rescue) printf("top:%d ", nPos);

				char key[1024];
				sprintf(key,"%d.%d.%d",event->cluster,event->proc,event->subproc);
				MyString str(key);
				if( jobnum_matches( event, cluster, process, subproc ) ) {

					if (echo_events) {
						event->putEvent(stdout);
						printf("...\n");
					}

					if(event->eventNumber==ULOG_SUBMIT) {
						dprintf(D_FULLDEBUG,"%s submitted\n",key);
						if (print_status) printf("%s submitted\n", key);
						table.insert(str,str);
						submitted++;
					} else if(event->eventNumber==ULOG_JOB_TERMINATED) {
						dprintf(D_FULLDEBUG,"%s completed\n",key);
						if (print_status) printf("%s completed\n", key);
						table.remove(str);
						completed++;
					} else if(event->eventNumber==ULOG_JOB_ABORTED) {
						dprintf(D_FULLDEBUG,"%s aborted\n",key);
						if (print_status) printf("%s aborted\n", key);
						table.remove(str);
						aborted++;
					} else if (event->eventNumber==ULOG_EXECUTE) {
						if (print_status) {
							printf("%s executing on host %s\n", key, ((ExecuteEvent*)event)->getExecuteHost());
						}
					} else {
						/* nothing to do */
					}
				}
				if (event != NULL) delete event;
				if( minjobs && (completed + aborted >= minjobs ) ) {
					printf( "Specifed number of jobs (%d) done.\n", minjobs );
					EXIT_SUCCESS;
				}
			} else {
				// did something change in the file since our last visit?
				fseek(sec_fp, 0, SEEK_END);
				nPos = ftell(sec_fp);
				if (flagged == 1) {
					fclose(sec_fp);
					dprintf(D_FULLDEBUG, "INFO: File %s changed but userLog reader could not read another event. We are reinitializing userLog reader. \n", log_file_name);
					if (debug_print_rescue) printf("rescue:%d ", nPos);
					if (print_status) printf("<reinitializing userLog reader>\n");
					// reinitialize the user log, we ended up here a second time 
					goto rescue;
				}
				if ( nPos != pos ){
					if (debug_print_rescue) printf("lagging:%d!=%d ", nPos, pos);
					pos = nPos;
					// we do not want to retry every time we are in a waiting sleep cycle, therefore flag a change
					flagged = 1;
				}
				
				dprintf(D_FULLDEBUG,"%d submitted %d completed %d aborted %d remaining\n",submitted,completed,aborted,submitted-completed-aborted);
				if(table.getNumElements()==0) {
					if(submitted>0) {
						if( !minjobs ) {
							printf("All jobs done.\n");
							EXIT_SUCCESS;
						}
					} else {
						if(cluster==ANY_NUMBER) {
							fprintf(stderr,"This log does not mention any jobs!\n");
						} else {
							fprintf(stderr,"This log does not mention that job!\n");
						}
						EXIT_FAILURE;
					}
				} else if(stoptime && time(0)>stoptime) {
					printf("Time expired.\n");
					EXIT_FAILURE;
				} else {
					time_t sleeptime;

					if(stoptime) {
						sleeptime = stoptime-time(0);
					} else {
						sleeptime = 5;
					}

					if(sleeptime>5) {
						sleeptime = 5;
					} else if(sleeptime<1) {
						sleeptime = 1;
					}

					log.synchronize();
					dprintf(D_FULLDEBUG,"No more events, sleeping for %ld seconds\n", (long)sleeptime);
					sleep(sleeptime);
					
				}
			}
		}
		fclose(sec_fp);
	} else {
		fprintf(stderr,"Couldn't open %s: %s\n",log_file_name,strerror(errno));
	}
	EXIT_FAILURE;
	
	return 1; /* meaningless, but it makes Windows happy */
}