Ejemplo n.º 1
0
int main(int argc, char **argv) 
{ 
	AllScoreModels model;

	int i;
	char ann_file[256];
	char out_file[256];
	char input_file[256];
	char inspect_results_file[256];
	char list_file[256];
	char model_file[256];
	char initial_model[256];
	char model_dir[256];
	char PTM_string[256];
	char mgf_out_dir[256];
	char neg_spec_list[256];
	char tag_string[64];
	char tag_suffix[64];
	
	bool got_input_file=false,got_model_file=false, got_list_file=false;
	bool got_model_dir=false, got_initial_model=false, got_PTM_string = false, got_neg_spec_list=false;
	bool prm_only=false;
	bool prm_norm=false;
	bool pmcsqs_only = false;
	bool sqs_only = false;
	bool got_filter_spectra = false;
	bool pmcsqs_and_prm = false;
	bool train_flag = false;
	bool correct_pm = false;
	bool use_spectrum_charge = false;
	bool use_spectrum_mz     = false;
	bool perform_filter		 = true;
	bool output_aa_probs	 = false;
	bool output_cumulative_probs = false;
	bool make_inspect_tags   = false;
	bool make_training_fa	 = false;
	bool test_tags			 = false;
	bool got_make_ann_mgf	 = false;
	bool got_make_training_mgf = false;
	bool got_rescore_inspect = false;
	bool got_recalibrate_inspect = false;
	bool got_make_peak_examples  = false;

	int start_train_idx=0;
	int end_train_idx = POS_INF;
	int specific_charge=-1;
	int specific_size=-1;
	int specific_region=-1;

	int specific_idx = -1;
	
	int file_start_idx =0;
	int tag_length = 0;
	int num_solutions = 20;
	int digest_type = TRYPSIN_DIGEST;
	mass_t train_tolerance;
	float min_pmcsqs_prob = -1.0;
	mass_t fragment_tolerance = -1.0;
	mass_t pm_tolerance = -1.0;
	float sqs_filter_thresh = 0.0;
	float min_filter_prob = 0.0;
	int   num_test_cases=-1;
	int	  num_training_spectra=-1;

	seedRandom(112233);
	strcpy(tag_suffix,"tags");

	// read command line arguments
	i=1;
	while (i<argc)
	{

		if (! strcmp(argv[i],"-make_ann_mgf"))
		{
			if (++i == argc)
				print_help("Missing file ann file!");

			strcpy(ann_file,argv[i]);	

			if (++i == argc)
				print_help("Missing file out file!");

			strcpy(out_file,argv[i]);	

			got_make_ann_mgf=true;
		}
		else
		if (! strcmp(argv[i],"-make_training_mgf"))
		{
			if (++i == argc)
				print_help("Missing file out file!");

			strcpy(out_file,argv[i]);	

			if (++i == argc)
				print_help("Missing num training spectra!");

			num_training_spectra = atoi(argv[i]);
			if (num_training_spectra<=0)
				print_help("Error: -make_training_mgf [out_file] [num spectra>0]\n");
			
			got_make_training_mgf=true;
		}
		else if (!strcmp(argv[i],"-file"))
		{
			if (++i == argc)
				print_help("Missing file name!");

			strcpy(input_file,argv[i]);
			got_input_file=true;
		}
		else
		if (!strcmp(argv[i],"-list"))
		{
			if (++i == argc)
				print_help("Missing list name!");

			strcpy(list_file,argv[i]);
			got_list_file=true;
		}
		else if  (!strcmp(argv[i],"-file_start_idx"))
		{
			if (++i == argc)
				print_help("Missing file start idx!");

			file_start_idx = atoi(argv[i]);
		}
		else if (!strcmp(argv[i],"-model")) 
		{
			if (++i == argc)
				print_help("Missing model name!");

			strcpy(model_file,argv[i]);
			got_model_file=true;
		}
		else if (! strcmp(argv[i],"-model_dir"))
		{
			if (++i == argc)
				print_help("Missing model dir name!");

			strcpy(model_dir,argv[i]);
			got_model_dir=true;
		}
		else if (! strcmp(argv[i],"-fragment_tolerance"))
		{
			if (++i == argc)
				print_help("Missing model dir name!");

			fragment_tolerance = atof(argv[i]);
			if (fragment_tolerance<0 || fragment_tolerance>0.75)
				print_help("Error: -fragment_toelerance should be 0-0.75\n");
		}
		else if (! strcmp(argv[i],"-pm_tolerance"))
		{
			if (++i == argc)
				print_help("Missing model dir name!");

			pm_tolerance = atof(argv[i]);
			if (pm_tolerance<0 || pm_tolerance>5.0)
				print_help("Error: -pm_toelerance should be 0-5.0\n");
		}
		else if  (!strcmp(argv[i],"-num_solutions"))
		{
			if (++i == argc)
				print_help("Missing number of solutions!");

			num_solutions = atoi(argv[i]);
			if (num_solutions<=0 || num_solutions> 2000)
				print_help("Error: -num_solutions should be 1-2000\n");
		}
		else if (!strcmp(argv[i],"-tag_length"))
		{
			if (++i == argc)
				print_help("Missing minimum length parameter!");

			tag_length = atoi(argv[i]);
			if (tag_length<3 || tag_length>6)
				print_help("Error: -tag_length value must be 3-6\n");

		}
		else if (!strcmp(argv[i],"-digest"))
		{
			if (++i == argc)
				print_help("Missing digest type parameter : NON_SPECIFIC, TRYPSIN\n");

			if (! strcmp(argv[i],"NON_SPECIFIC"))
			{
				digest_type = NON_SPECIFIC_DIGEST;
			}
			else if (! strcmp(argv[i],"TRYPSIN"))
			{
				digest_type = TRYPSIN_DIGEST;
			}
			else
			{
				printf("Error: bad digest type: %s\n",argv[i]);
				print_help("Supported digest types: NON_SPECIFIC, TRYPSIN.");
			}
		}
		else if (! strcmp(argv[i],"-use_spectrum_charge"))
		{
			use_spectrum_charge = true;
		}
		else if (! strcmp(argv[i],"-use_spectrum_mz"))
		{
			use_spectrum_mz = true;
		}
		else if (! strcmp(argv[i],"-no_quality_filter"))
		{
			perform_filter = false;
		}
		else if (! strcmp(argv[i],"-correct_pm"))
		{
			correct_pm = true;
		}
		else if (! strcmp(argv[i],"-prm")) 
		{
			prm_only = true;
		}
		else if (! strcmp(argv[i],"-prm_norm")) 
		{
			prm_norm = true;
			prm_only = true;
		}
		else if (! strcmp(argv[i],"-output_aa_probs"))
		{
			output_aa_probs=true;
		}
		else if (! strcmp(argv[i],"-output_cumulative_probs"))
		{
			output_cumulative_probs=true;
		}
		else if (! strcmp(argv[i],"-pmcsqs_only"))
		{
			pmcsqs_only = true;
		}
		else if (! strcmp(argv[i],"-sqs_only"))
		{
			sqs_only = true;
		}
		else if (! strcmp(argv[i],"-min_filter_prob"))
		{
			if (++i == argc)
				print_help("Missing minimum probability parmater after -min_filter_prob !\n");

			min_filter_prob = -1.0;
			min_filter_prob = atof(argv[i]);
			if (min_filter_prob<0.0 || min_filter_prob>=1.0 || argv[i][0] != '0')
			{
				print_help("The flag -min_filter_prob should be followed by a minimal probability value [0-1.0]\n");
				exit(1);
			}
		}
		else if ( ! strcmp(argv[i],"-filter_spectra"))
		{
			got_filter_spectra = true;
			if (++i == argc)
				print_help("Missing minimum probability parmater after -filter_spectra !\n");
			
			sqs_filter_thresh=atof(argv[i]);

			if (sqs_filter_thresh <0 || sqs_filter_thresh>1.0)
				print_help("Error: the sqs threshold should be in the range 0-1 (recommended below 0.1)\n");
			
			if (++i == argc)
				print_help("Missing output directory for MGF files (second argument after -filter_spectra)!\n");
		
			strcpy(mgf_out_dir,argv[i]);
		}
		else if (! strcmp(argv[i],"-specific_idx"))
		{
			if (++i == argc)
				print_help("Missing idx!");
			specific_idx=atoi(argv[i]);
		}
		else if (! strcmp(argv[i],"-train_model"))
		{
			train_flag = true;
			if (++i == argc)
				print_help("Missing training tolerance!");

			train_tolerance = atof(argv[i]);
			if (train_tolerance<0.001 || train_tolerance>1.0)
				print_help("Error: training tolerance should be in the range 0.001 - 1.0\n");
		}
		else if (! strcmp(argv[i],"-start_train_idx"))
		{
			if (++i == argc)
				print_help("Missing start_train_idx!");

			start_train_idx = atoi(argv[i]);
		}
		else if (! strcmp(argv[i],"-end_train_idx"))
		{
			if (++i == argc)
				print_help("end_train_idx!");

			end_train_idx = atoi(argv[i]);
		}
		else if (! strcmp(argv[i],"-specific_reigon_model"))
		{
			if (++i == argc)
				print_help("specific_reigon_model!");

			specific_charge = atoi(argv[i++]);
			specific_size	= atoi(argv[i++]);
			specific_region = atoi(argv[i]);

		}
		else if (! strcmp(argv[i],"-specific_charge"))
		{
			if (++i == argc)
				print_help("specific_charge!");

			specific_charge = atoi(argv[i]);
		}
		else if (! strcmp(argv[i],"-specific_size"))
		{
			if (++i == argc)
				print_help("specific_size!");

			specific_size = atoi(argv[i]);
		}
		else if (! strcmp(argv[i],"-initial_model"))
		{
			got_initial_model = true;
			if (++i == argc)
				print_help("Missing initial model name!");
			strcpy(initial_model,argv[i]);
		}
		else if (! strcmp(argv[i],"-neg_spec_list"))
		{
			got_neg_spec_list = true;
			if (++i == argc)
				print_help("Missing neg spec list!");
			strcpy(neg_spec_list,argv[i]);
		}
		else if (! strcmp(argv[i],"-PTMs"))
		{
			got_PTM_string = true;
			if (++i == argc)
				print_help("Missing PTM list!");
			strcpy(PTM_string,argv[i]);
		}
		else if (! strcmp(argv[i],"-inspect_tags"))
		{
			make_inspect_tags=true;
			if (++i == argc)
				print_help("inspect_tags!");

			strcpy(tag_string,argv[i]);
		}
		else if (! strcmp(argv[i],"-rescore_inspect"))
		{
			got_rescore_inspect = true;
			if (++i == argc)
				print_help("Missing results file!");

			strcpy(inspect_results_file,argv[i]);

			if (++i == argc)
				print_help("Missing new results file!");

			strcpy(out_file,argv[i]);
		}
		else if (! strcmp(argv[i],"-recalibrate_inspect"))
		{
			got_recalibrate_inspect = true;
			if (++i == argc)
				print_help("Missing results file!");

			strcpy(inspect_results_file,argv[i]);

			if (++i == argc)
				print_help("Missing new results file!");

			strcpy(out_file,argv[i]); 		
		}
		else if ( ! strcmp(argv[i],"-make_peak_examples"))
		{
			got_make_peak_examples=true;
		}
		else if (! strcmp(argv[i],"-make_training_fa"))
		{
			make_training_fa=true;
		}
		else if (! strcmp(argv[i],"-test_tags"))
		{
			test_tags=true;
			if (++i == argc)
				print_help("test_tags!");

			strcpy(tag_string,argv[i]);
		}
		else if (! strcmp(argv[i],"-num_test_cases"))
		{
			if (++i == argc)
				print_help("num_test_cases!");

			num_test_cases = atoi(argv[i]);
		}
		else if (! strcmp(argv[i],"-tag_suffix"))
		{
			if (++i == argc)
				print_help("tag suffix!");
			strcpy(tag_suffix,argv[i]);
		}
		else
		{
			printf("**********************************************************\n");
			printf("\nError: Unkown command line option: %s\n\n",argv[i]);
			print_help("");
			exit(0); 
		}
		i++;
	}


	if (! got_model_file) 
		print_help("Error: Missing model name!");


	if (!got_input_file && ! got_list_file)
		print_help("Error: missing input file (either -file or -list must be used).");

	Config *config = model.get_config();

	if (got_model_dir)
	{
		config->set_resource_dir(string(model_dir));
	}

	

	//////////////////////////////////////////////////////////////////
	// Model Training
	if (train_flag)
	{	
		if (got_initial_model)
		{
			model.read_model(initial_model);
			if (got_PTM_string)
				config->apply_selected_PTMs(PTM_string);
			model.read_rank_models(initial_model,true);
			model.read_cum_seq_prob_models(initial_model,true);
		}
		else
		{
			config->init_with_defaults();
			config->set_tolerance(train_tolerance);
			config->set_digest_type(digest_type);
			if (got_PTM_string)
				config->apply_selected_PTMs(PTM_string);
		}

		model.set_model_name(string(model_file));
	
		SpectraAggregator sa;
		if (! got_list_file)
		{
			if (got_input_file)
			{
		//		fm.init_from_mgf(config,input_file);
				sa.initializeFromSpectraFilePath(input_file, config);
			}
			else
			{
				printf("Must supply a list of annotated spectra for training!\n");
				exit(0);
			}
		}
		else
		{
		//	fm.init_from_list_file(config,list_file);
			sa.initializeFromTextFile(list_file, config);
		}
		
		
		model.trainModelsInStages(model_file, 
								  sa,
									train_tolerance, 
									start_train_idx, 
									end_train_idx,
									specific_charge, 
									specific_size, 
									specific_region,
									(got_neg_spec_list ? neg_spec_list : NULL));

	

		model.write_model();
		exit(0);
	}
	
	///////////////////////////////////////////////////////////////////
	// Model initializing (running some sort of de novo, need a model)
	// 
	const time_t start_time = time(NULL);

	cout << "PepNovo V3. Build " << build_name << endl;
	cout << "Copyright 2008, The Regents of the University of California. All Rights Reserved." << endl;
	cout << "Created by Ari Frank ([email protected])" << endl << endl;
	cout << "Initializing models (this might take a few seconds)... " << flush;

	// TODO: incorporate PTM line into the model reading and also the other model stuff below
	model.read_model(model_file,true); 
	if (got_PTM_string)
		config->apply_selected_PTMs(PTM_string);
	model.getPeptideCompositionAssigner().init_aa_translations();
	model.read_rank_models(model_file,true);
	model.read_cum_seq_prob_models(model_file,true);

	cout << "Done." << endl;

	config = model.get_config();
	config->set_digest_type(digest_type);

	if (fragment_tolerance>0)
		config->set_tolerance(fragment_tolerance);

	if (pm_tolerance>0)
		config->setPrecursorMassTolerance(pm_tolerance);

	if (correct_pm)
		config->set_need_to_estimate_pm(1);

	if (use_spectrum_mz)
		config->set_use_spectrum_mz(1);

	if (use_spectrum_charge)
		config->set_use_spectrum_charge(1);

	if (! perform_filter)
		config->set_filter_flag(0);

	if (config->get_pm_tolerance()<0.1)
		config->set_need_to_estimate_pm(0);

	cout << setprecision(4) << fixed;
	cout << "Fragment tolerance : " << config->getTolerance() << endl;
	cout << "PM tolernace       : " << config->get_pm_tolerance() << endl;
	cout << "PTMs considered    : " ;
	if (got_PTM_string)
	{
		cout << PTM_string << endl;
	}
	else
	{
		cout << "None" << endl;
	}
	


	///////////////////////////////////////////////////////////////////
	// Training fa
	if (make_training_fa)
	{
		make_denovo_training_fa(model,input_file);
		exit(0);
	}

	///////////////////////////////////////////////////////////////////
	// Inspect tags

	if (make_inspect_tags)
	{
		create_tag_file_for_inspect(model,input_file,tag_string,tag_suffix);
		exit(0);
	}

	if (test_tags)
	{
		benchmark_tags(model,list_file,tag_string,num_test_cases);
		exit(0);
	}


	////////////////////////////////////////////////////////////////////
	// Rescore InsPecT
	if (got_rescore_inspect)
	{
		PeptideRankScorer *db_score = (PeptideRankScorer *)model.get_rank_model_ptr(0);
		db_score->rescore_inspect_results(input_file,inspect_results_file,out_file);
		exit(0);
	}

	if (got_recalibrate_inspect)
	{
		cout << "Recalibrating delta scores in " << input_file << endl;
		PeptideRankScorer *db_score = (PeptideRankScorer *)model.get_rank_model_ptr(0);
		db_score->recalibrate_inspect_delta_scores(input_file,inspect_results_file,out_file);
		exit(0);
	}

	if (got_make_peak_examples)
	{
		cout << "Making peak examples " << input_file << endl;
		PeptideRankScorer *db_score = (PeptideRankScorer *)model.get_rank_model_ptr(0);
		//db_score->make_peak_table_examples(input_file);
		exit(0);
	}



	///////////////////////////////////////////////////////////////////
	// Make input file list
	vector<string> list_vector;
	if (got_list_file)
	{
		readListOfPaths(list_file, list_vector);
	}
	else
		list_vector.push_back(input_file);

	int correct_benchmark =0;
	int total_benchmark =0;
	int counter=0;

	if (got_make_training_mgf)
	{
	//	make_training_mgf(config,list_file,num_training_spectra,out_file);
		exit(0);
	}


	if (sqs_only)
	{
		PMCSQS_Scorer *pmcsqs = (PMCSQS_Scorer *)model.get_pmcsqs_ptr();
		if (! pmcsqs ||  ! pmcsqs->getIndInitializedSqs())
		{
			cout << "Error: no spectrum quality score (SQS) for this model!" << endl;
			exit(1);
		}
	}
	else
	if (got_filter_spectra ||  pmcsqs_only)
	{
		PMCSQS_Scorer *pmcsqs = (PMCSQS_Scorer *)model.get_pmcsqs_ptr();
		if (! pmcsqs || ! pmcsqs->getIndInitializedPmc() || ! pmcsqs->getIndInitializedSqs())
		{
			cout << "Error: no parent mass correction (PMC) and/or quality score (SQS) for this model!" << endl;
			exit(1);
		}
	}




	///////////////////////////////////////////////////////////////////
	// FILTER SPECTRA
	if (got_filter_spectra)
	{
		int num_written =0;
		int num_read = 0;
		PMCSQS_Scorer *pmcsqs = (PMCSQS_Scorer *)model.get_pmcsqs_ptr();

	//	pmcsqs->output_filtered_spectra_to_mgfs(config, list_vector, mgf_out_dir, sqs_filter_thresh, num_written, num_read);
		
		time_t curr_time = time(NULL);
		double elapsed_time = (curr_time - start_time);
		cout << "Processed " << list_vector.size() << " (" << num_read << " spectra)." << endl;
		cout << "Wrote " << num_written << " spectra to mgfs in " << mgf_out_dir << endl;
		cout << "Elapsed time " << fixed << elapsed_time << " seconds." << endl;
		return 0;
	}

	//////////////////////////////////////////////////////////////////
	// PRM
	if (prm_only)
	{
		

		perform_prm_on_list_of_files(model, list_vector, min_filter_prob, file_start_idx, prm_norm);
	//	prm_benchmark(model, list_vector, min_pmcsqs_prob, file_start_idx);

	//	FileManager fm;
	//	fm.init_from_list(config,list_vector);
	//	model.learn_prm_normalizer_values(fm);
	//	model.write_prm_normalizer_values();
		return 0;
	}

	if (fabs(config->get_aa2mass()[Cys]-103.0)<1)
	{
		cout << endl <<"*** Warning: searching with unmodified cystine, usually the PTM C+57 should be included ***" << endl << endl;
	}
	cout << endl;

	//////////////////////////////////////////////////////////////////
	// PMCSQS
	if (pmcsqs_only)
	{
	//	perform_pmcsqs_on_list_of_files(model, list_vector, file_start_idx);
		return 0;
	}
 
	//////////////////////////////////////////////////////////////////
	// SQS
	if (sqs_only)
	{
	//	perform_sqs_on_list_of_files(model, list_vector, file_start_idx);
		return 0;
	}  
	
	//////////////////////////////////////////////////////////////////
	// DENOVO AND TAGS

	if (tag_length<=0)
	{
	//	perform_denovo_on_list_of_files(model, list_vector, file_start_idx, num_solutions, 7, 16, 
	//		false, min_filter_prob, output_aa_probs,  output_cumulative_probs, cout);
		new_perform_denovo_on_list_of_files(model, list_vector, file_start_idx, num_solutions, 7, 16, 
			false, min_filter_prob, output_aa_probs,  output_cumulative_probs, cout);
	}
	else
	{
		perform_tags_on_list_of_files(model,list_vector,file_start_idx,num_solutions,tag_length,
			false, min_filter_prob, output_aa_probs, output_cumulative_probs, cout);	
	}
	

#ifdef WIN32
	system("pause");
#endif

	return 0;
}