Exemplo n.º 1
0
int main(int argc, char *argv[]){
    
    FILE *fd;
    job_registry_entry *en;
    time_t now;
    time_t purge_time=0;
    char constraint[JOBID_MAX_LEN+1];
    char constraint2[5];
    char *query=NULL;
    char *queryStates=NULL;
    char *query_err=NULL;

    char *pidfile=NULL;
    char string_now[11];
    char *tpath;
    
    int version=0;
    int tmptim;
    int finstr_len=0;
    int loop_interval=DEFAULT_LOOP_INTERVAL;
    
    int fsq_ret=0;
    
    int c;
    
    int confirm_time=0;
    
    static int help;
    static int short_help;
    
    while (1) {
	static struct option long_options[] =
	{
	    {"help",      no_argument,     &help,       1},
	    {"usage",     no_argument,     &short_help, 1},
	    {"nodaemon",  no_argument,       0, 'o'},
	    {"version",   no_argument,       0, 'v'},
	    {0, 0, 0, 0}
	};
	
	int option_index = 0;
	
	c = getopt_long (argc, argv, "vo",long_options, &option_index);
	
	if (c == -1){
	    break;
	}
	
	switch (c)
	{
	    
	    case 0:
		if (long_options[option_index].flag != 0){
		    break;
		}
		
	    case 'v':
		version=1;
		break;
		
	    case 'o':
		nodmn=1;
		break;
		
	    case '?':
		break;
		
	    default:
		abort ();
	}
    }
    
    //check if another instance is running 
    char **ptr;
    char out[3];
    fgets(out, sizeof(out),popen("ps -d | grep -c BUpdaterSGE","r"));
    strtoken(out,'\n',&ptr);
    if (strcmp(ptr[0],"1")!=0){
	fprintf(stderr,"There is another instance of BUpdaterSGE running.\nExiting ...\n");
	return -1;
    }
    freetoken(&ptr,1);

    if(help){
	usage();
    }
    
    if(short_help){
	short_usage();
    }
    
    argv0 = argv[0];
    
    signal(SIGHUP,sighup);
    
    if(version) {
	printf("%s Version: %s\n",progname,VERSION);
	exit(EXIT_SUCCESS);
    }  
    
    /* Checking configuration */
    check_config_file("UPDATER"); 
    
    cha = config_read(NULL);
    if (cha == NULL)
    {
	fprintf(stderr,"Error reading config: ");
	perror("");
	return -1;
    }
    config_setenv(NULL);
    
    ret = config_get("bupdater_child_poll_timeout",cha);
    if (ret != NULL){
	tmptim=atoi(ret->value);
	if (tmptim > 0) bfunctions_poll_timeout = tmptim*1000;
    }
    
    ret = config_get("bupdater_debug_level",cha);
    if (ret != NULL){
	debug=atoi(ret->value);
    }
    
    ret = config_get("bupdater_debug_logfile",cha);
    if (ret != NULL){
	debuglogname=strdup(ret->value);
	if(debuglogname == NULL){
	    sysfatal("strdup failed for debuglogname in main: %r");
	}
    }
    if(debug <=0){
	debug=0;
    }
    
    if(debuglogname){
	if((debuglogfile = fopen(debuglogname, "a+"))==0){
	    debug = 0;
	}
    }else{
	debug = 0;
    }

    ret = config_get("debug_level",cha);
    if (ret != NULL){
	debug=atoi(ret->value);
    }

    ret = config_get("debug_logfile",cha);
    if (ret != NULL){
	debuglogname=strdup(ret->value);
	if(debuglogname == NULL){
	    sysfatal("strdup failed for debuglogname in main: %r");
	}
    }
    if(debug <=0){
	debug=0;
    }

    if(debuglogname){
	if((debuglogfile = fopen(debuglogname, "a+"))==0){
	    debug = 0;
	}
    }else{
	debug = 0;
    }

    ret = config_get("sge_binpath",cha);
    if (ret == NULL){
	do_log(debuglogfile, debug, 1, "%s: key sge_binpath not found\n",argv0);
    } else {
	sge_binpath=strdup(ret->value);
	if(sge_binpath == NULL){
	    sysfatal("strdup failed for sge_binpath in main: %r");
	}
    }

    ret = config_get("sge_rootpath",cha);
    if (ret == NULL){
	do_log(debuglogfile, debug, 1, "%s: key sge_rootpath not found\n",argv0);
    } else {
	sge_rootpath=strdup(ret->value);
	if(sge_rootpath == NULL){
	    sysfatal("strdup failed for sge_rootpath in main: %r");
	}
	
	tpath=make_message("%s",sge_rootpath);
	if (opendir(tpath)==NULL){
	    do_log(debuglogfile, debug, 1, "%s: dir %s does not exist or is not readable\n",argv0,tpath);
	    sysfatal("dir %s does not exist or is not readable: %r",tpath);
	}
	free(tpath);
    }

    ret = config_get("sge_cellname",cha);
    if (ret == NULL){
	do_log(debuglogfile, debug, 1, "%s: key sge_cellname not found\n",argv0);
    } else {
	sge_cellname=strdup(ret->value);
	if(sge_cellname == NULL){
	    sysfatal("strdup failed for sge_cellname in main: %r");
	}
    }

    ret = config_get("sge_rootpath",cha);
    if (ret == NULL){
	if(debug){
	    fprintf(debuglogfile, "%s: key sge_rootpath not found\n",argv0);
	    fflush(debuglogfile);
	}
    } else {
	sge_rootpath=strdup(ret->value);
	if(sge_rootpath == NULL){
	    sysfatal("strdup failed for sge_rootpath in main: %r");
	}
    }

    ret = config_get("sge_cellname",cha);
    if (ret == NULL){
	if(debug){
	    fprintf(debuglogfile, "%s: key sge_cellname not found\n",argv0);
	    fflush(debuglogfile);
	}
    } else {
	sge_cellname=strdup(ret->value);
	if(sge_cellname == NULL){
	    sysfatal("strdup failed for sge_cellname in main: %r");
	}
    }

    ret = config_get("job_registry",cha);
    if (ret == NULL){
	do_log(debuglogfile, debug, 1, "%s: key job_registry not found\n",argv0);
	sysfatal("job_registry not defined. Exiting");
    } else {
	reg_file=strdup(ret->value);
	if(reg_file == NULL){
	    sysfatal("strdup failed for reg_file in main: %r");
	}
    }

    ret = config_get("purge_interval",cha);
    if (ret == NULL){
	do_log(debuglogfile, debug, 1, "%s: key purge_interval not found using the default:%d\n",argv0,purge_interval);
    } else {
	purge_interval=atoi(ret->value);
    }

    ret = config_get("finalstate_query_interval",cha);
    if (ret == NULL){
	do_log(debuglogfile, debug, 1, "%s: key finalstate_query_interval not found using the default:%d\n",argv0,finalstate_query_interval);
    } else {
	finalstate_query_interval=atoi(ret->value);
    }

    ret = config_get("alldone_interval",cha);
    if (ret == NULL){
	do_log(debuglogfile, debug, 1, "%s: key alldone_interval not found using the default:%d\n",argv0,alldone_interval);
    } else {
	alldone_interval=atoi(ret->value);
    }

    ret = config_get("bupdater_loop_interval",cha);
    if (ret == NULL){
	do_log(debuglogfile, debug, 1, "%s: key bupdater_loop_interval not found using the default:%d\n",argv0,loop_interval);
    } else {
	loop_interval=atoi(ret->value);
    }

    ret = config_get("bupdater_pidfile",cha);
    if (ret == NULL){
	do_log(debuglogfile, debug, 1, "%s: key bupdater_pidfile not found\n",argv0);
    } else {
	pidfile=strdup(ret->value);
	if(pidfile == NULL){
	    sysfatal("strdup failed for pidfile in main: %r");
	}
    }

    ret = config_get("job_registry_use_mmap",cha);
    if (ret == NULL){
        do_log(debuglogfile, debug, 1, "%s: key job_registry_use_mmap not found. Default is NO\n",argv0);
    } else {
        do_log(debuglogfile, debug, 1, "%s: key job_registry_use_mmap is set to %s\n",argv0,ret->value);
    }

    if( !nodmn ) daemonize();

    if( pidfile ){
	writepid(pidfile);
	free(pidfile);
    }

    config_free(cha);
    rha=job_registry_init(reg_file, BY_BATCH_ID);
    if (rha == NULL){
	do_log(debuglogfile, debug, 1, "%s: Error initialising job registry %s\n",argv0,reg_file);
	fprintf(stderr,"%s: Error initialising job registry %s :",argv0,reg_file);
	perror("");
    }
   for(;;){
	/* Purge old entries from registry */
	now=time(0);
	if(now - purge_time > 86400){
	    if(job_registry_purge(reg_file, now-purge_interval,0)<0){
		do_log(debuglogfile, debug, 1, "%s: Error purging job registry %s\n",argv0,reg_file);
		fprintf(stderr,"%s: Error purging job registry %s :",argv0,reg_file);
		perror("");
	    }else{
		purge_time=time(0);
	    }
	}
	
	//IntStateQuery();
	fd = job_registry_open(rha, "r");
	if (fd == NULL)
	{
	    do_log(debuglogfile, debug, 1, "%s: Error opening job registry %s\n",argv0,reg_file);
	    fprintf(stderr,"%s: Error opening job registry %s :",argv0,reg_file);
	    perror("");
	    sleep(loop_interval);
	}
	if (job_registry_rdlock(rha, fd) < 0)
	{
	    do_log(debuglogfile, debug, 1, "%s: Error read locking job registry %s\n",argv0,reg_file);
	    fprintf(stderr,"%s: Error read locking job registry %s :",argv0,reg_file);
	    perror("");
	    sleep(loop_interval);
	}
	job_registry_firstrec(rha,fd);
	fseek(fd,0L,SEEK_SET);

	if((query=calloc(STR_CHARS*2,1)) == 0){
	    sysfatal("can't malloc query %r");
	}
	if((queryStates=calloc(STR_CHARS*2,1)) == 0){
	    sysfatal("can't malloc query %r");
	}
	
	query[0]=' ';
	queryStates[0]=' ';
	while ((en = job_registry_get_next(rha, fd)) != NULL)
	{
	    if(((now - en->mdate) > finalstate_query_interval) && en->status!=3 && en->status!=4)
	    {
		/* create the constraint that will be used in condor_history command in FinalStateQuery*/
		snprintf(constraint, sizeof(constraint), " %s",en->batch_id);
		if (en->status==0) snprintf(constraint2, sizeof(constraint2), " u");
		if (en->status==1) snprintf(constraint2, sizeof(constraint2), " q");
		if (en->status==2) snprintf(constraint2, sizeof(constraint2), " r");
		if (en->status==5) snprintf(constraint2, sizeof(constraint2), " h");
		query=realloc(query,strlen(query)+strlen(constraint)+1);
		queryStates=realloc(queryStates,strlen(queryStates)+strlen(constraint2)+1);
		strcat(query,constraint);
		strcat(queryStates,constraint2);
		runfinal=TRUE;
	    }
	    /* Assign Status=4 and ExitStatus=-1 to all entries that after alldone_interval are still not in a final state(3 or 4) */
	    if((now - en->mdate > alldone_interval) && en->status!=3 && en->status!=4 && !runfinal)
	    {
		time_t now;
		now=time(0);
		snprintf(string_now,sizeof(string_now),"%d",now);
		AssignState(en->batch_id,"4" ,"-1","\0","\0",string_now);
	    }
	   free(en);
	}
	if(runfinal){
	    if((query_err=calloc((int)strlen(query),1)) == 0)
		sysfatal("can't malloc query_err %r");
	    FinalStateQuery(query,queryStates,query_err);
	    free(query_err);
	}
	free(query);
	free(queryStates);
	fclose(fd);
	if (runfinal){
	    runfinal=FALSE;
	}
	sleep (loop_interval);
    } //for

    job_registry_destroy(rha);
    return(0);
}
Exemplo n.º 2
0
int main(int argc, char *argv[]){

	FILE *fd;
	job_registry_entry *en;
	time_t now;
	time_t purge_time=0;
	time_t last_consistency_check=0;
	char *pidfile=NULL;
	char *final_string=NULL;
	char *cp=NULL;
	char *tpath;
	char *tspooldir;
	
	char *first_duplicate=NULL;
	
	struct pollfd *remupd_pollset = NULL;
	int remupd_nfds;
	
	int version=0;
	int first=TRUE;
	int tmptim;
	int finstr_len=0;
	int loop_interval=DEFAULT_LOOP_INTERVAL;
	
	int fsq_ret=0;
	
	int c;				
	
	pthread_t RecUpdNetThd;

	int confirm_time=0;	

        static int help;
        static int short_help;
	
	bact.njobs = 0;
	bact.jobs = NULL;
	
	while (1) {
		static struct option long_options[] =
		{
		{"help",      no_argument,     &help,       1},
		{"usage",     no_argument,     &short_help, 1},
		{"nodaemon",  no_argument,       0, 'o'},
		{"version",   no_argument,       0, 'v'},
		{"prefix",    required_argument, 0, 'p'},
		{0, 0, 0, 0}
		};

		int option_index = 0;
     
		c = getopt_long (argc, argv, "vop:",long_options, &option_index);
     
		if (c == -1){
			break;
		}
     
		switch (c)
		{

		case 0:
		if (long_options[option_index].flag != 0){
			break;
		}
     
		case 'v':
			version=1;
			break;
	       
		case 'o':
			nodmn=1;
			break;

		case 'p':
			break;

		case '?':
			break;
     
		default:
			abort ();
		}
	}
	
	if(help){
		usage();
	}
	 
	if(short_help){
		short_usage();
	}
	
	argv0 = argv[0];

        signal(SIGHUP,sighup);

	if(version) {
		printf("%s Version: %s\n",progname,VERSION);
		exit(EXIT_SUCCESS);
	}   

        /* Checking configuration */
        check_config_file("UPDATER");

	cha = config_read(NULL);
	if (cha == NULL)
	{
		fprintf(stderr,"Error reading config: ");
		perror("");
		return -1;
	}

	ret = config_get("bupdater_child_poll_timeout",cha);
	if (ret != NULL){
		tmptim=atoi(ret->value);
		if (tmptim > 0) bfunctions_poll_timeout = tmptim*1000;
	}

	ret = config_get("bupdater_debug_level",cha);
	if (ret != NULL){
		debug=atoi(ret->value);
	}
	
	ret = config_get("bupdater_debug_logfile",cha);
	if (ret != NULL){
		debuglogname=strdup(ret->value);
                if(debuglogname == NULL){
                        sysfatal("strdup failed for debuglogname in main: %r");
                }
	}
	if(debug <=0){
		debug=0;
	}
    
	if(debuglogname){
		if((debuglogfile = fopen(debuglogname, "a+"))==0){
			debug = 0;
		}
	}else{
		debug = 0;
	}
	
        ret = config_get("pbs_binpath",cha);
        if (ret == NULL){
		do_log(debuglogfile, debug, 1, "%s: key pbs_binpath not found\n",argv0);
        } else {
                pbs_binpath=strdup(ret->value);
                if(pbs_binpath == NULL){
                        sysfatal("strdup failed for pbs_binpath in main: %r");
                }
        }
	
        ret = config_get("pbs_spoolpath",cha);
        if (ret == NULL){
		do_log(debuglogfile, debug, 1, "%s: key pbs_spoolpath not found\n",argv0);
        } else {
                pbs_spoolpath=strdup(ret->value);
                if(pbs_spoolpath == NULL){
                        sysfatal("strdup failed for pbs_spoolpath in main: %r");
                }

		tpath=make_message("%s/server_logs",pbs_spoolpath);
		if (opendir(tpath)==NULL){
			do_log(debuglogfile, debug, 1, "%s: dir %s does not exist or is not readable. Trying now pbs commands\n",argv0,tpath);
                	tspooldir=GetPBSSpoolPath(pbs_binpath);
                	free(tpath);
                	tpath=make_message("%s/server_logs",tspooldir);
                	free(tspooldir);
                	if (opendir(tpath)==NULL){
				do_log(debuglogfile, debug, 1, "dir %s does not exist or is not readable (using pbs commands)",tpath);
				sysfatal("dir %s does not exist or is not readable (using pbs commands): %r",tpath);
                	}
		}
		free(tpath);
        }
	
	ret = config_get("job_registry",cha);
	if (ret == NULL){
		do_log(debuglogfile, debug, 1, "%s: key job_registry not found\n",argv0);
		sysfatal("job_registry not defined. Exiting");
	} else {
		registry_file=strdup(ret->value);
                if(registry_file == NULL){
                        sysfatal("strdup failed for registry_file in main: %r");
                }
	}
	
	ret = config_get("purge_interval",cha);
	if (ret == NULL){
		do_log(debuglogfile, debug, 1, "%s: key purge_interval not found using the default:%d\n",argv0,purge_interval);
	} else {
		purge_interval=atoi(ret->value);
	}
	
	ret = config_get("finalstate_query_interval",cha);
	if (ret == NULL){
		do_log(debuglogfile, debug, 1, "%s: key finalstate_query_interval not found using the default:%d\n",argv0,finalstate_query_interval);
	} else {
		finalstate_query_interval=atoi(ret->value);
	}
	
	ret = config_get("alldone_interval",cha);
	if (ret == NULL){
		do_log(debuglogfile, debug, 1, "%s: key alldone_interval not found using the default:%d\n",argv0,alldone_interval);
	} else {
		alldone_interval=atoi(ret->value);
	}

	ret = config_get("tracejob_logs_to_read",cha);
	if (ret == NULL){
		do_log(debuglogfile, debug, 1, "%s: key tracejob_logs_to_read not found using the default:%d\n",argv0,tracejob_logs_to_read);
	} else {
		tracejob_logs_to_read=atoi(ret->value);
	}
	
	ret = config_get("bupdater_loop_interval",cha);
	if (ret == NULL){
		do_log(debuglogfile, debug, 1, "%s: key bupdater_loop_interval not found using the default:%d\n",argv0,loop_interval);
	} else {
		loop_interval=atoi(ret->value);
	}
	
	ret = config_get("bupdater_consistency_check_interval",cha);
	if (ret == NULL){
		do_log(debuglogfile, debug, 1, "%s: key bupdater_consistency_check_interval not found using the default:%d\n",argv0,bupdater_consistency_check_interval);
	} else {
		bupdater_consistency_check_interval=atoi(ret->value);
	}
	
	ret = config_get("bupdater_pidfile",cha);
	if (ret == NULL){
		do_log(debuglogfile, debug, 1, "%s: key bupdater_pidfile not found\n",argv0);
	} else {
		pidfile=strdup(ret->value);
                if(pidfile == NULL){
                        sysfatal("strdup failed for pidfile in main: %r");
                }
	}
	
	ret = config_get("pbs_batch_caching_enabled",cha);
	if (ret == NULL){
		do_log(debuglogfile, debug, 1, "%s: key pbs_batch_caching_enabled not found using default\n",argv0,pbs_batch_caching_enabled);
	} else {
		pbs_batch_caching_enabled=strdup(ret->value);
                if(pbs_batch_caching_enabled == NULL){
                        sysfatal("strdup failed for pbs_batch_caching_enabled in main: %r");
                }
	}
	
	ret = config_get("batch_command_caching_filter",cha);
	if (ret == NULL){
		do_log(debuglogfile, debug, 1, "%s: key batch_command_caching_filter not found using default\n",argv0,batch_command_caching_filter);
	} else {
		batch_command_caching_filter=strdup(ret->value);
                if(batch_command_caching_filter == NULL){
                        sysfatal("strdup failed for batch_command_caching_filter in main: %r");
                }
	}
	
	batch_command=(strcmp(pbs_batch_caching_enabled,"yes")==0?make_message("%s ",batch_command_caching_filter):make_message(""));

	ret = config_get("job_registry_use_mmap",cha);
	if (ret == NULL){
		do_log(debuglogfile, debug, 1, "%s: key job_registry_use_mmap not found. Default is NO\n",argv0);
	} else {
		do_log(debuglogfile, debug, 1, "%s: key job_registry_use_mmap is set to %s\n",argv0,ret->value);
	}
	
	ret = config_get("tracejob_max_output",cha);
	if (ret == NULL){
		do_log(debuglogfile, debug, 1, "%s: key tracejob_max_output not found using default\n",argv0,tracejob_max_output);
	} else {
		tracejob_max_output==atoi(ret->value);
	}
	
	remupd_conf = config_get("job_registry_add_remote",cha);
	if (remupd_conf == NULL){
		do_log(debuglogfile, debug, 1, "%s: key job_registry_add_remote not found\n",argv0);
	}else{
		if (job_registry_updater_setup_receiver(remupd_conf->values,remupd_conf->n_values,&remupd_head) < 0){
			do_log(debuglogfile, debug, 1, "%s: Cannot set network receiver(s) up for remote update\n",argv0);
			fprintf(stderr,"%s: Cannot set network receiver(s) up for remote update \n",argv0);
       		}
 
		if (remupd_head == NULL){
			do_log(debuglogfile, debug, 1, "%s: Cannot find values for network endpoints in configuration file (attribute 'job_registry_add_remote').\n",argv0);
			fprintf(stderr,"%s: Cannot find values for network endpoints in configuration file (attribute 'job_registry_add_remote').\n", argv0);
		}

		if ((remupd_nfds = job_registry_updater_get_pollfd(remupd_head, &remupd_pollset)) < 0){
			do_log(debuglogfile, debug, 1, "%s: Cannot setup poll set for receiving data.\n",argv0);
    			fprintf(stderr,"%s: Cannot setup poll set for receiving data.\n", argv0);
		}
		if (remupd_pollset == NULL || remupd_nfds == 0){
			do_log(debuglogfile, debug, 1, "%s: No poll set available for receiving data.\n",argv0);
			fprintf(stderr,"%s: No poll set available for receiving data.\n",argv0);
		}
	
	}
	
	if( !nodmn ) daemonize();


	if( pidfile ){
		writepid(pidfile);
		free(pidfile);
	}
	
	rha=job_registry_init(registry_file, BY_BATCH_ID);
	if (rha == NULL){
		do_log(debuglogfile, debug, 1, "%s: Error initialising job registry %s\n",argv0,registry_file);
		fprintf(stderr,"%s: Error initialising job registry %s :",argv0,registry_file);
		perror("");
	}
	
	if (remupd_conf != NULL){
		pthread_create(&RecUpdNetThd, NULL, (void *(*)(void *))ReceiveUpdateFromNetwork, (void *)NULL);
	
		if (job_registry_updater_setup_sender(remupd_conf->values,remupd_conf->n_values,0,&remupd_head_send) < 0){
			do_log(debuglogfile, debug, 1, "%s: Cannot set network sender(s) up for remote update\n",argv0);
			fprintf(stderr,"%s: Cannot set network sender(s) up for remote update \n",argv0);
       		}
		if (remupd_head_send == NULL){
			do_log(debuglogfile, debug, 1, "%s: Cannot find values for network endpoints in configuration file (attribute 'job_registry_add_remote').\n",argv0);
			fprintf(stderr,"%s: Cannot find values for network endpoints in configuration file (attribute 'job_registry_add_remote').\n", argv0);
		}
	}

	config_free(cha);

	for(;;){
		/* Purge old entries from registry */
		now=time(0);
		if(now - purge_time > 86400){
			if(job_registry_purge(registry_file, now-purge_interval,0)<0){
				do_log(debuglogfile, debug, 1, "%s: Error purging job registry %s\n",argv0,registry_file);
                	        fprintf(stderr,"%s: Error purging job registry %s :",argv0,registry_file);
                	        perror("");

			}else{
				purge_time=time(0);
			}
		}
		
		now=time(0);
		if(now - last_consistency_check > bupdater_consistency_check_interval){
			if(job_registry_check_index_key_uniqueness(rha,&first_duplicate)==JOB_REGISTRY_FAIL){
				do_log(debuglogfile, debug, 1, "%s: Found job registry duplicate entry. The first one is:%s\n",argv0,first_duplicate);
               	        	fprintf(stderr,"%s: Found job registry duplicate entry. The first one is:%s",argv0,first_duplicate);
 
			}else{
				last_consistency_check=time(0);
			}
		}
	       
		IntStateQuery();
		
		fd = job_registry_open(rha, "r");
		
		if (fd == NULL){
			do_log(debuglogfile, debug, 1, "%s: Error opening job registry %s\n",argv0,registry_file);
			fprintf(stderr,"%s: Error opening job registry %s :",argv0,registry_file);
			perror("");
			sleep(loop_interval);
			continue;
		}
		if (job_registry_rdlock(rha, fd) < 0){
			do_log(debuglogfile, debug, 1, "%s: Error read locking job registry %s\n",argv0,registry_file);
			fprintf(stderr,"%s: Error read locking job registry %s :",argv0,registry_file);
			perror("");
			sleep(loop_interval);
			continue;
		}
		job_registry_firstrec(rha,fd);
		fseek(fd,0L,SEEK_SET);
		
		first=TRUE;
		
		while ((en = job_registry_get_next(rha, fd)) != NULL){
			if((bupdater_lookup_active_jobs(&bact, en->batch_id) != BUPDATER_ACTIVE_JOBS_SUCCESS) && en->status!=REMOVED && en->status!=COMPLETED){
				
				confirm_time=atoi(en->updater_info);
				if(confirm_time==0){
					confirm_time=en->mdate;
				}
			
				/* Assign Status=4 and ExitStatus=999 to all entries that after alldone_interval are still not in a final state(3 or 4)*/
				if(now-confirm_time>alldone_interval){
					AssignFinalState(en->batch_id);	
					free(en);
					continue;
				}
			
				if((now-confirm_time>finalstate_query_interval) && (now > next_finalstatequery)){
					if((final_string=realloc(final_string,finstr_len + strlen(en->batch_id) + 2)) == 0){
                       	        		sysfatal("can't malloc final_string: %r");
					} else {
						if (finstr_len == 0) final_string[0] = '\000';
					}
 					strcat(final_string,en->batch_id);
					strcat(final_string,":");
					finstr_len=strlen(final_string);
					runfinal=TRUE;
				}
				
			}
			free(en);
		}
		
		if(runfinal){
			if (final_string[finstr_len-1] == ':' && (cp = strrchr (final_string, ':')) != NULL){
				*cp = '\0';
			}
				
			if(fsq_ret != 0){
				fsq_ret=FinalStateQuery(final_string,tracejob_logs_to_read);
			}else{
				fsq_ret=FinalStateQuery(final_string,1);
			}
			
			runfinal=FALSE;
		}
		if (final_string != NULL){
			free(final_string);		
			final_string = NULL;
			finstr_len = 0;
		}
		fclose(fd);		
		sleep(loop_interval);
	}
	
	job_registry_destroy(rha);
		
	return 0;
	
}
Exemplo n.º 3
0
int main(int argc, char *argv[]){

	FILE *fd;
	job_registry_entry *en;
	time_t now;
	time_t purge_time=0;
	time_t last_consistency_check=0;
	char *pidfile=NULL;
	char *first_duplicate=NULL;
	
	struct pollfd *remupd_pollset = NULL;
	int remupd_nfds;
	
	int version=0;
	int first=TRUE;
	int tmptim;
	time_t finalquery_start_date;
	int loop_interval=DEFAULT_LOOP_INTERVAL;
	
	int rc;				
	int c;				
	
	pthread_t RecUpdNetThd;

	int confirm_time=0;	

        static int help;
        static int short_help;
	
	bact.njobs = 0;
	bact.jobs = NULL;
	
	while (1) {
		static struct option long_options[] =
		{
		{"help",      no_argument,     &help,       1},
		{"usage",     no_argument,     &short_help, 1},
		{"nodaemon",  no_argument,       0, 'o'},
		{"version",   no_argument,       0, 'v'},
		{"prefix",    required_argument, 0, 'p'},
		{0, 0, 0, 0}
		};

		int option_index = 0;
     
		c = getopt_long (argc, argv, "vop:",long_options, &option_index);
     
		if (c == -1){
			break;
		}
     
		switch (c)
		{

		case 0:
		if (long_options[option_index].flag != 0){
			break;
		}
     
		case 'v':
			version=1;
			break;
	       
		case 'o':
			nodmn=1;
			break;

		case 'p':
			break;

		case '?':
			break;
     
		default:
			abort ();
		}
	}
	
	if(help){
		usage();
	}
	 
	if(short_help){
		short_usage();
	}
	
	argv0 = argv[0];
	
        signal(SIGHUP,sighup);

	if(version) {
		printf("%s Version: %s\n",progname,VERSION);
		exit(EXIT_SUCCESS);
	}   

        /* Checking configuration */
        check_config_file("UPDATER");

	cha = config_read(NULL);
	if (cha == NULL)
	{
		fprintf(stderr,"Error reading config: ");
		perror("");
		return -1;
	}
        config_setenv(NULL);

	ret = config_get("bupdater_child_poll_timeout",cha);
	if (ret != NULL){
		tmptim=atoi(ret->value);
		if (tmptim > 0) bfunctions_poll_timeout = tmptim*1000;
	}

	ret = config_get("bupdater_debug_level",cha);
	if (ret != NULL){
		debug=atoi(ret->value);
	}
	
	ret = config_get("bupdater_debug_logfile",cha);
	if (ret != NULL){
		debuglogname=strdup(ret->value);
                if(debuglogname == NULL){
                        sysfatal("strdup failed for debuglogname in main: %r");
                }
	}
	if(debug <=0){
		debug=0;
	}
    
	if(debuglogname){
		if((debuglogfile = fopen(debuglogname, "a+"))==0){
			debug = 0;
		}
	} else {
		debug = 0;
	}
	
        ret = config_get("slurm_binpath",cha);
        if (ret == NULL){
                do_log(debuglogfile, debug, 1, "%s: key slurm_binpath not found\n",argv0);
        } else {
                slurm_binpath=strdup(ret->value);
                if(slurm_binpath == NULL){
                        sysfatal("strdup failed for slurm_binpath in main: %r");
                }
        }
	
	ret = config_get("job_registry",cha);
	if (ret == NULL){
		do_log(debuglogfile, debug, 1, "%s: key job_registry not found\n",argv0);
		sysfatal("job_registry not defined. Exiting");
	} else {
		registry_file=strdup(ret->value);
                if(registry_file == NULL){
                        sysfatal("strdup failed for registry_file in main: %r");
                }
	}
	
	ret = config_get("purge_interval",cha);
	if (ret == NULL){
		do_log(debuglogfile, debug, 1, "%s: key purge_interval not found using the default:%d\n",argv0,purge_interval);
	} else {
		purge_interval=atoi(ret->value);
	}
	
	ret = config_get("finalstate_query_interval",cha);
	if (ret == NULL){
		do_log(debuglogfile, debug, 1, "%s: key finalstate_query_interval not found using the default:%d\n",argv0,finalstate_query_interval);
	} else {
		finalstate_query_interval=atoi(ret->value);
	}
	
	ret = config_get("alldone_interval",cha);
	if (ret == NULL){
		do_log(debuglogfile, debug, 1, "%s: key alldone_interval not found using the default:%d\n",argv0,alldone_interval);
	} else {
		alldone_interval=atoi(ret->value);
	}
	
	ret = config_get("bupdater_consistency_check_interval",cha);
	if (ret == NULL){
		do_log(debuglogfile, debug, 1, "%s: key bupdater_consistency_check_interval not found using the default:%d\n",argv0,bupdater_consistency_check_interval);
	} else {
		bupdater_consistency_check_interval=atoi(ret->value);
	}

	ret = config_get("bupdater_pidfile",cha);
	if (ret == NULL){
		do_log(debuglogfile, debug, 1, "%s: key bupdater_pidfile not found\n",argv0);
	} else {
		pidfile=strdup(ret->value);
                if(pidfile == NULL){
                        sysfatal("strdup failed for pidfile in main: %r");
                }
	}

	ret = config_get("bupdater_loop_interval",cha);
	if (ret == NULL){
		do_log(debuglogfile, debug, 1, "%s: key bupdater_loop_interval not found - using the default:%d\n",argv0,loop_interval);
	} else {
		loop_interval=atoi(ret->value);
	}
	
	ret = config_get("job_registry_use_mmap",cha);
	if (ret == NULL){
		do_log(debuglogfile, debug, 1, "%s: key job_registry_use_mmap not found. Default is NO\n",argv0);
	} else {
		do_log(debuglogfile, debug, 1, "%s: key job_registry_use_mmap is set to %s\n",argv0,ret->value);
	}
	
	remupd_conf = config_get("job_registry_add_remote",cha);
	if (remupd_conf == NULL){
		do_log(debuglogfile, debug, 1, "%s: key job_registry_add_remote not found\n",argv0);
	}else{
		if (job_registry_updater_setup_receiver(remupd_conf->values,remupd_conf->n_values,&remupd_head) < 0){
			do_log(debuglogfile, debug, 1, "%s: Cannot set network receiver(s) up for remote update\n",argv0);
			fprintf(stderr,"%s: Cannot set network receiver(s) up for remote update \n",argv0);
       		}
 
		if (remupd_head == NULL){
			do_log(debuglogfile, debug, 1, "%s: Cannot find values for network endpoints in configuration file (attribute 'job_registry_add_remote').\n",argv0);
			fprintf(stderr,"%s: Cannot find values for network endpoints in configuration file (attribute 'job_registry_add_remote').\n", argv0);
		}

		if ((remupd_nfds = job_registry_updater_get_pollfd(remupd_head, &remupd_pollset)) < 0){
			do_log(debuglogfile, debug, 1, "%s: Cannot setup poll set for receiving data.\n",argv0);
    			fprintf(stderr,"%s: Cannot setup poll set for receiving data.\n", argv0);
		}
		if (remupd_pollset == NULL || remupd_nfds == 0){
			do_log(debuglogfile, debug, 1, "%s: No poll set available for receiving data.\n",argv0);
			fprintf(stderr,"%s: No poll set available for receiving data.\n",argv0);
		}
	
	}
	
	if( !nodmn ) daemonize();


	if( pidfile ){
		writepid(pidfile);
		free(pidfile);
	}
	
	rha=job_registry_init(registry_file, BY_BATCH_ID);
	if (rha == NULL){
		do_log(debuglogfile, debug, 1, "%s: Error initialising job registry %s\n",argv0,registry_file);
		fprintf(stderr,"%s: Error initialising job registry %s :",argv0,registry_file);
		perror("");
	}
	
	if (remupd_conf != NULL){
		pthread_create(&RecUpdNetThd, NULL, (void *(*)(void *))ReceiveUpdateFromNetwork, (void *)NULL);
	
		if (job_registry_updater_setup_sender(remupd_conf->values,remupd_conf->n_values,0,&remupd_head_send) < 0){
			do_log(debuglogfile, debug, 1, "%s: Cannot set network sender(s) up for remote update\n",argv0);
			fprintf(stderr,"%s: Cannot set network sender(s) up for remote update \n",argv0);
       		}
		if (remupd_head_send == NULL){
			do_log(debuglogfile, debug, 1, "%s: Cannot find values for network endpoints in configuration file (attribute 'job_registry_add_remote').\n",argv0);
			fprintf(stderr,"%s: Cannot find values for network endpoints in configuration file (attribute 'job_registry_add_remote').\n", argv0);
		}
	}

	config_free(cha);
	
	for(;;){
		/* Purge old entries from registry */
		now=time(0);
		if(now - purge_time > 86400){
			if((rc=job_registry_purge(registry_file, now-purge_interval,0))<0){
				do_log(debuglogfile, debug, 1, "%s: Error purging job registry %s:%d\n",argv0,registry_file,rc);
                	        fprintf(stderr,"%s: Error purging job registry %s :",argv0,registry_file);
                	        perror("");

			}
			purge_time=time(0);
		}
		
		now=time(0);
		if(now - last_consistency_check > bupdater_consistency_check_interval){
			if(job_registry_check_index_key_uniqueness(rha,&first_duplicate)==JOB_REGISTRY_FAIL){
				do_log(debuglogfile, debug, 1, "%s: Found job registry duplicate entry. The first one is:%s.\nJobid should be removed or registry directory should be removed.\n",argv0,first_duplicate);
               	        	fprintf(stderr,"%s: Found job registry duplicate entry. The first one is:%s.\nJobid should be removed or registry directory should be removed.",argv0,first_duplicate);
 
			}
			last_consistency_check=time(0);
		}
		
		IntStateQuery();
		
		fd = job_registry_open(rha, "r");
		if (fd == NULL){
			do_log(debuglogfile, debug, 1, "%s: Error opening job registry %s\n",argv0,registry_file);
			fprintf(stderr,"%s: Error opening job registry %s :",argv0,registry_file);
			perror("");
			sleep(loop_interval);
			continue;
		}
		if (job_registry_rdlock(rha, fd) < 0){
			do_log(debuglogfile, debug, 1, "%s: Error read locking job registry %s\n",argv0,registry_file);
			fprintf(stderr,"%s: Error read locking job registry %s :",argv0,registry_file);
			perror("");
			sleep(loop_interval);
			continue;
		}
		job_registry_firstrec(rha,fd);
		fseek(fd,0L,SEEK_SET);
		
		first=TRUE;
		finalquery_start_date = time(0);
		
		while ((en = job_registry_get_next(rha, fd)) != NULL){

			if((bupdater_lookup_active_jobs(&bact,en->batch_id) != BUPDATER_ACTIVE_JOBS_SUCCESS) && en->status!=REMOVED && en->status!=COMPLETED){
			
				do_log(debuglogfile, debug, 3, "%s: bupdater_lookup_active_jobs returned: %d for jobid: %s\n",argv0,bupdater_lookup_active_jobs(&bact,en->batch_id),en->batch_id);

				confirm_time=atoi(en->updater_info);
				if(confirm_time==0){
					confirm_time=en->mdate;
				}
			
				/* Assign Status=4 and ExitStatus=999 to all entries that after alldone_interval are still not in a final state(3 or 4)*/
				if(now-confirm_time>alldone_interval){
					AssignFinalState(en->batch_id);
					free(en);
					continue;
				}
								
				if(en->status==IDLE && strlen(en->updater_info)>0){
					if (en->mdate < finalquery_start_date){
						finalquery_start_date=en->mdate;
					}
					do_log(debuglogfile, debug, 2, "%s: FinalStateQuery needed for jobid=%s with status=%d\n",argv0,en->batch_id,en->status);
					runfinal=TRUE;
				}else if((now-confirm_time>finalstate_query_interval) && (now > next_finalstatequery)){
					if (en->mdate < finalquery_start_date){
						finalquery_start_date=en->mdate;
					}
					do_log(debuglogfile, debug, 2, "%s: FinalStateQuery needed for jobid=%s with status=%d\n",argv0,en->batch_id,en->status);
					runfinal=TRUE;
				}
				
			
			}
			free(en);
		}
		
		if(runfinal_oldlogs){
			FinalStateQuery(0,1);
			runfinal_oldlogs=FALSE;
			runfinal=FALSE;
		}else if(runfinal){
			FinalStateQuery(finalquery_start_date,1);
			runfinal=FALSE;
		}
		fclose(fd);		
		sleep(loop_interval);
	}
	
	job_registry_destroy(rha);
	
	return 0;
	
}