Пример #1
0
int main(int argc, char* argv[]) {

	// set srand
	srand(1);
	
	CConsole::Initialize();
	ConfigurationSettings settings;
	
	string commandLine;
	for ( int i = 0; i < argc; ++i )
		commandLine = commandLine + argv[i] + " ";

	printf("------------------------------------------------------------------------------\n");
	printf("Mosaik"); CConsole::Red(); printf("Aligner"); CConsole::Reset();
	printf(" %u.%u.%u                                                %s\n", 
		MOSAIK_MAJOR_VERSION, MOSAIK_MINOR_VERSION, MOSAIK_BUILD_VERSION, MOSAIK_VERSION_DATE);
	printf("Michael Stromberg & Wan-Ping Lee  Marth Lab, Boston College Biology Department\n");
	printf("------------------------------------------------------------------------------\n\n");

	// =================================
	// configure the command line parser
	// =================================

	// set general info about the program
	COptions::SetProgramInfo("MosaikAligner", "pairwise aligns a MOSAIK read file", "-in <filename> -out <filename> -ia <filename>");

	// add the input/output options
	OptionGroup* pIoOpts = COptions::CreateOptionGroup("Input/output: (required)");
	COptions::AddValueOption("-ia",  "MOSAIK reference filename", "the input reference file",  "An input MOSAIK reference file",  settings.HasReferencesFilename, settings.ReferencesFilename, pIoOpts);
	COptions::AddValueOption("-in",  "MOSAIK read filename",      "the input read file",       "An input MOSAIK read file",       settings.HasReadsFilename,      settings.ReadsFilename,      pIoOpts);
	COptions::AddValueOption("-out", "MOSAIK alignment filename", "the output alignment file", "An output MOSAIK alignment file", settings.HasAlignmentsFilename, settings.AlignmentsFilename, pIoOpts);
	COptions::AddValueOption("-ibs", "MOSAIK reference filename", "enables colorspace to basespace conversion using the supplied BASESPACE reference archive",  "",  settings.HasBasespaceReferencesFilename, settings.BasespaceReferencesFilename, pIoOpts);
	COptions::AddValueOption("-annpe", "Neural network filename", "", "", settings.HasPeNeuralNetworkFilename, settings.PeNeuralNetworkFilename, pIoOpts);
	COptions::AddValueOption("-annse", "Neural network filename", "", "", settings.HasSeNeuralNetworkFilename, settings.SeNeuralNetworkFilename, pIoOpts);

	// add the essential options
	OptionGroup* pEssentialOpts = COptions::CreateOptionGroup("Essential parameters");
	COptions::AddValueOption("-a",  "algorithm",  "alignment algorithm: [fast, single, multi, all]",  "", settings.HasAlgorithm, settings.Algorithm, pEssentialOpts, DEFAULT_ALGORITHM);
	COptions::AddValueOption("-m",  "mode",       "alignment mode: [unique, all]",                     "", settings.HasMode,      settings.Mode,      pEssentialOpts, DEFAULT_MODE);
	COptions::AddValueOption("-hs", "hash size",  "hash size [4 - 32]",                                "", settings.HasHashSize,  settings.HashSize,  pEssentialOpts, DEFAULT_HASH_SIZE);

	// add the filtering options
	OptionGroup* pFilterOpts = COptions::CreateOptionGroup("Filtering");
	COptions::AddValueOption("-act",  "threshold",      "the alignment candidate threshold (length)", "", settings.EnableAlignmentCandidateThreshold, settings.AlignmentCandidateThreshold, pFilterOpts);
	//COptions::AddOption("-dh", "require at least two hash hits",                                          settings.EnableDoubleHashHits,                                                    pFilterOpts);
	COptions::AddValueOption("-ls",   "radius",          "enable local alignment search for PE reads", "", settings.HasLocalAlignmentSearchRadius,     settings.LocalAlignmentSearchRadius,  pFilterOpts);
	//COptions::AddValueOption("-lsh",  "mapping quality", "MQ threshold", "", settings.HasLocalAlignmentSearchHighMqThreshold, settings.LocalAlignmentSearchHighMqThreshold, pFilterOpts );
	//COptions::AddValueOption("-lsl",  "mapping quality", "MQ threshold; when the best MQ is higher than -lsh and the second best is lower than -lsl, local alignment search is enabled.", "", settings.HasLocalAlignmentSearchLowMqThreshold, settings.LocalAlignmentSearchLowMqThreshold, pFilterOpts );
	COptions::AddValueOption("-mhp",  "hash positions",  "the maximum # of positions stored per seed",      "", settings.LimitHashPositions,                settings.HashPositionThreshold,       pFilterOpts);
	COptions::AddValueOption("-mhr",  "hash regionss",   "the maximum # of regions for aligning",      "", settings.LimitHashRegions,                settings.HashRegionThreshold,       pFilterOpts);
	COptions::AddValueOption("-min",  "nucleotides",     "the minimum # of aligned nucleotides",      "", settings.CheckMinAlignment,                 settings.MinimumAlignment,            pFilterOpts);
	COptions::AddValueOption("-minp", "percent",         "the minimum alignment percentage [0.0 - 1.0]",                "", settings.CheckMinAlignmentPercent,          settings.MinimumAlignmentPercentage,  pFilterOpts);
	COptions::AddValueOption("-mm",   "mismatches",      "the # of mismatches allowed",                "", settings.CheckNumMismatches,                settings.NumMismatches,               pFilterOpts);
	COptions::AddValueOption("-mmp",  "threshold",       "the percentage of mismatches allowed [0.0 - 1.0]",      "", settings.CheckMismatchPercent,              settings.MismatchPercent,             pFilterOpts);
	COptions::AddOption(     "-ncg",  "not count gaps as mismatches", settings.NotCountGapAsMismatch, pFilterOpts);
	//COptions::AddOption("-mmal", "when enabled, unaligned portions of the read will not count as a mismatch", settings.UseAlignedLengthForMismatches,                                       pFilterOpts);

	// TODO: we need to move the alignment quality calculation up to ApplyReadFilters in order to make this option useable
	//COptions::AddValueOption("-aq",   "threshold",      "enable an alignment quality threshold",      "", settings.CheckAlignmentQuality,             settings.AlignmentQualityThreshold,   pFilterOpts);

	// add the performance options
	OptionGroup* pPerformanceOpts = COptions::CreateOptionGroup("Performance");
	COptions::AddValueOption("-p",  "processors", "uses the specified number of processors", "", settings.HasNumThreads, settings.NumThreads, pPerformanceOpts);
	COptions::AddValueOption("-bw", "bandwidth",  "specifies the Smith-Waterman bandwidth", "",  settings.HasBandwidth,  settings.Bandwidth,  pPerformanceOpts, DEFAULT_BANDWIDTH);
	COptions::AddOption("-lm",                    "enable low-memory functions",              settings.UseLowMemory,                                 pPerformanceOpts);

	// add the jump database options
	OptionGroup* pJumpOpts = COptions::CreateOptionGroup("Jump database");
	COptions::AddValueOption("-j",  "filename stub", "uses the specified jump database",     "", settings.UseJumpDB,                 settings.JumpFilenameStub, pJumpOpts);
	//COptions::AddValueOption("-jc", "# of hashes",   "caches the most recently used hashes", "", settings.HasJumpCacheMemory,        settings.JumpCacheMemory,  pJumpOpts);
	COptions::AddOption("-kd",                       "keeps the keys file on disk",              settings.KeepJumpKeysOnDisk,                                 pJumpOpts);
	COptions::AddOption("-pd",                       "keeps the positions file on disk",         settings.KeepJumpPositionsOnDisk,                            pJumpOpts);
	COptions::AddValueOption("-sref",  "reference prefixes", "specifies the prefixes of special references", "", settings.HasSpecialReferencePrefix, settings.SpecialReferencePrefix, pJumpOpts);
	COptions::AddValueOption("-srefn", "hashes", "the maximum special hashes", "", settings.HasSpecialHashCount, settings.SpecialHashCount, pJumpOpts);
	
	// add the reporting options
	OptionGroup* pReportingOpts = COptions::CreateOptionGroup("Reporting");
	COptions::AddValueOption("-statmq", "threshold", "enable mapping quality threshold for statistical map [0 - 255]", "", settings.HasStatMappingQuality, settings.StatMappingQuality, pReportingOpts);
	COptions::AddOption("-omi",         "output chrmosome ids and positions of multiply mapped alignments in the multiple.bam", settings.OutputMultiplyIncomplete, pReportingOpts);
	COptions::AddOption("-om",          "output complete multiply mapped alignments in the multiple.bam", settings.OutputMultiplyComplete, pReportingOpts);

	COptions::AddOption("-zn",          "output zn tags",settings.EnableZnTag, pReportingOpts);
	//COptions::AddValueOption("-rur", "FASTQ filename", "stores unaligned reads in a FASTQ file", "", settings.RecordUnalignedReads, settings.UnalignedReadsFilename, pReportingOpts);

	// add the pairwise alignment scoring options
	OptionGroup* pPairwiseOpts = COptions::CreateOptionGroup("Pairwise Alignment Scores");
	COptions::AddValueOption("-ms",   "match score",        "the match score",             "", settings.HasMatchScore,                settings.MatchScore,                pPairwiseOpts, CPairwiseUtilities::MatchScore);
	COptions::AddValueOption("-mms",  "mismatch score",     "the mismatch score",          "", settings.HasMismatchScore,             settings.MismatchScore,             pPairwiseOpts, CPairwiseUtilities::MismatchScore);
	COptions::AddValueOption("-gop",  "gap open penalty",   "the gap open penalty",        "", settings.HasGapOpenPenalty,            settings.GapOpenPenalty,            pPairwiseOpts, CPairwiseUtilities::GapOpenPenalty);
	COptions::AddValueOption("-gep",  "gap extend penalty", "the gap extend penalty",      "", settings.HasGapExtendPenalty,          settings.GapExtendPenalty,          pPairwiseOpts, CPairwiseUtilities::GapExtendPenalty);
	COptions::AddValueOption("-hgop", "gap open penalty",   "enables the homopolymer gop", "", settings.HasHomoPolymerGapOpenPenalty, settings.HomoPolymerGapOpenPenalty, pPairwiseOpts, CPairwiseUtilities::HomoPolymerGapOpenPenalty);

	// add interface options
	OptionGroup* pInterface = COptions::CreateOptionGroup("Interface Options");
	COptions::AddOption("-quiet",  "disable progress bars and counters", settings.IsQuietMode, pInterface);

	// parse the current command line
	COptions::Parse(argc, argv);

	// =============================
	// check for missing information
	// =============================

	bool foundError = false;
	ostringstream errorBuilder;
	const string ERROR_SPACER(7, ' ');

	//if(settings.EnableAlignmentCandidateThreshold && settings.EnableDoubleHashHits) {
	//	errorBuilder << ERROR_SPACER << "Please specify either an alignment candidate threshold (-act) or double-hash hits (-dh). Double-hash hits are equivalent to '-act <hash size + 1>." << endl;
	//	foundError = true;
	//}

	if((settings.HasJumpCacheMemory || settings.KeepJumpKeysOnDisk || settings.KeepJumpPositionsOnDisk) && !settings.UseJumpDB) {
		errorBuilder << ERROR_SPACER << "Jump database settings were specified, but the jump database was not explicitly chosen. Please use the -j parameter." << endl;
		foundError = true;
	}

	if(settings.UseJumpDB) {
		string keysFilename      = settings.JumpFilenameStub + "_keys.jmp";
		string metaFilename      = settings.JumpFilenameStub + "_meta.jmp";
		string positionsFilename = settings.JumpFilenameStub + "_positions.jmp";

		CFileUtilities::CheckFile(keysFilename.c_str(), true);
		CFileUtilities::CheckFile(metaFilename.c_str(), true);
		CFileUtilities::CheckFile(positionsFilename.c_str(), true);

		if(!settings.KeepJumpKeysOnDisk && !settings.KeepJumpPositionsOnDisk && settings.HasJumpCacheMemory) 
			settings.HasJumpCacheMemory = false;
	}

	if(!settings.CheckNumMismatches && !settings.CheckMismatchPercent ) {
		//settings.CheckNumMismatches = true;
		settings.CheckMismatchPercent = true;
	}

	// figure out which algorithm to use
	CSequenceUtilities::LowercaseSequence(settings.Algorithm);
	CAlignmentThread::AlignerAlgorithmType algorithmType  = CAlignmentThread::AlignerAlgorithm_ALL;

	if(settings.Algorithm == "fast")        algorithmType = CAlignmentThread::AlignerAlgorithm_FAST;
	else if(settings.Algorithm == "single") algorithmType = CAlignmentThread::AlignerAlgorithm_SINGLE;
	else if(settings.Algorithm == "multi")  algorithmType = CAlignmentThread::AlignerAlgorithm_MULTI;
	else if(settings.Algorithm == "all")    algorithmType = CAlignmentThread::AlignerAlgorithm_ALL;
	else {
		errorBuilder << ERROR_SPACER << "Unknown algorithm type. Please choose between 'fast', 'single', 'multi', or 'all'. The default value is '" << DEFAULT_ALGORITHM << "'." << endl;
		foundError = true;
	}

	// set the hash positions threshold
	if ( ( settings.LimitHashPositions ) && ( settings.HashPositionThreshold == 0 ) )
		settings.LimitHashPositions = false;

	if ( settings.LimitHashPositions ) {

		// make sure we're using the all algorithm
		if ( algorithmType != CAlignmentThread::AlignerAlgorithm_ALL ) {
			errorBuilder << ERROR_SPACER << "Setting the hash positions threshold is only applicable when using the 'all' algorithm. This can be set by using the '-a all' parameter. Or turn off the hash positions threshold by using '-mhp 0'" << endl;
			foundError = true;
		}

		// won't be the case
		//if ( settings.HashPositionThreshold == 0 ) {
		//	errorBuilder << ERROR_SPACER << "The hash position threshold should be larger than 0. Use the -mhp parameter to change the hash position threshold." << endl;
		//	foundError = true;
		//}
	}

	if ( settings.LimitHashRegions ) {
		if ( settings.HashRegionThreshold == 0 ) {
			errorBuilder << ERROR_SPACER << "The hash region threshold should be larger than 0. Use the -mhr parameter to change the hash position threshold." << endl;
			foundError = true;
		}
	}

	// figure out which alignment mode to use
	CSequenceUtilities::LowercaseSequence(settings.Algorithm);
	CAlignmentThread::AlignerModeType modeType = CAlignmentThread::AlignerMode_ALL;

	if(settings.Mode == "unique")     modeType = CAlignmentThread::AlignerMode_UNIQUE;
	else if(settings.Mode == "all")   modeType = CAlignmentThread::AlignerMode_ALL;
	else {
		errorBuilder << ERROR_SPACER << "Unknown mode type. Please choose between 'unique' or 'all'. The default value is '" << DEFAULT_MODE << "'." << endl;
		foundError = true;
	}

	// set the maximum mismatch percentage
	if(settings.CheckMismatchPercent) {

		// make sure our value is within bounds
		if((settings.MismatchPercent < 0.0) || (settings.MismatchPercent > 1.0)) {
			errorBuilder << ERROR_SPACER << "The maximum mismatch percentage should be between 0.0 and 1.0." << endl;
			foundError = true;
		}

		CPairwiseUtilities::MaxMismatchPercent       = settings.MismatchPercent;
		CPairwiseUtilities::UseMismatchPercentFilter = true;
	}

	// set the minimum aligned percentage
	if(settings.CheckMinAlignmentPercent) {

		// make sure our value is within bounds
		if((settings.MinimumAlignmentPercentage < 0.0) || (settings.MinimumAlignmentPercentage > 1.0)) {
			errorBuilder << ERROR_SPACER << "The minimum alignment percentage should be between 0.0 and 1.0." << endl;
			foundError = true;
		}

		// assign the minimum percentage alignment
		CPairwiseUtilities::MinPercentAlignment          = settings.MinimumAlignmentPercentage;
		CPairwiseUtilities::UseMinAlignmentPercentFilter = true;
	}

	// check the minimum alignment quality
	//if(settings.CheckAlignmentQuality && (settings.AlignmentQualityThreshold > 99)) {
	//	errorBuilder << ERROR_SPACER << "The alignment quality threshold should be between 0 and 99." << endl;
	//	foundError = true;
	//}

	// set the hash size
	if(settings.HasHashSize && ((settings.HashSize < MIN_HASH_SIZE) || (settings.HashSize > MAX_HASH_SIZE))) {
		errorBuilder << ERROR_SPACER << "The hash size should be between " << MIN_HASH_SIZE << " and " << MAX_HASH_SIZE << ". The default value is " << DEFAULT_HASH_SIZE << "." << endl;
		foundError = true;
	}

	// set the number of threads
	if(settings.HasNumThreads && (settings.NumThreads < 1)) {
		errorBuilder << ERROR_SPACER << "At least one processor should be specified. Use the -p parameter to change the number of desired processors." << endl;
		foundError = true;
	}

	// test if the specified input files exist and are in the right format
	SequencingTechnologies seqTech;
	ReadStatus readStatus;
	MosaikReadFormat::CReadReader::CheckFile(settings.ReadsFilename, seqTech, readStatus, true);
	MosaikReadFormat::CReferenceSequenceReader::CheckFile(settings.ReferencesFilename, true);
	
	// set defaults of act, ls and bw
	{
		MosaikReadFormat::CReadReader reader;
		reader.Open( settings.ReadsFilename );
		uint64_t nReads, nBases;
		nReads = reader.GetNumReads();
		nBases = reader.GetNumBases();
		// sanity checker
		if ( nBases < nReads )
			errorBuilder << ERROR_SPACER << "The number of reads is smaller than the number of total bases in " << settings.ReadsFilename;

		double readLength = nBases/nReads;
		if ( ( readStatus & RS_PAIRED_END_READ ) != 0 )
			readLength /= 2;
		
		// act
		if ( settings.EnableAlignmentCandidateThreshold ) {
			if ( settings.AlignmentCandidateThreshold == 0 ) {
				settings.EnableAlignmentCandidateThreshold = false;
				settings.AlignmentCandidateThreshold       = 0;
			}
		} else {
			if ( seqTech == ST_454 ) {
				settings.EnableAlignmentCandidateThreshold = true;
				settings.AlignmentCandidateThreshold       = ( readLength > 350.0 ) ? 55 : 26;
			} else {
				settings.EnableAlignmentCandidateThreshold = true;
				settings.AlignmentCandidateThreshold       = (unsigned char)floor( 13 + ( readLength / 5 ) );
			}
		}

		// bw
		if ( settings.HasBandwidth ) {
			if ( settings.Bandwidth == 0 ) {
				settings.HasBandwidth = false;
				settings.Bandwidth    = 0;
			}
		} else {
			if ( settings.CheckNumMismatches ) {
				settings.HasBandwidth = true;
				settings.Bandwidth    = static_cast<unsigned int>(ceil( 2.5 * settings.NumMismatches ));
				if ( ( settings.Bandwidth % 2 ) != 1 ) ++settings.Bandwidth;
			} else if ( settings.CheckMismatchPercent ) {
				settings.HasBandwidth = true;
				settings.Bandwidth    = static_cast<unsigned int>(ceil( 2.5 * settings.MismatchPercent * ceil(readLength) ));
				if ( ( settings.Bandwidth % 2 ) != 1 ) ++settings.Bandwidth;
			} else {
				settings.HasBandwidth = false;
				settings.Bandwidth    = 0;
			}
		}

		// ls
		// only for paired-end data and the -mfl setting in the given archive != 0
		MosaikReadFormat::ReadGroup readGroup = reader.GetReadGroup();
		if ( settings.HasLocalAlignmentSearchRadius ) {
			if ( settings.LocalAlignmentSearchRadius == 0 ) {
				settings.HasLocalAlignmentSearchRadius = false;
				settings.LocalAlignmentSearchRadius    = 0;
			} else {
				if ( ( readStatus & RS_PAIRED_END_READ ) == 0 ) {
					cout << "WARNING: Local alignment search only works for paired-end data." << endl;
					settings.HasLocalAlignmentSearchRadius = false;
					settings.LocalAlignmentSearchRadius    = 0;
				}
			}
		} else {
			if ( ( readStatus & RS_PAIRED_END_READ ) != 0 ) {
				if ( readGroup.MedianFragmentLength == 0 ) {
					cout << "WARNING: Paired-end data is detected, but the median fragment length is not specified." << endl; 
					cout << "         Accordingly, local alignment search is not enabled." << endl;
					cout << "         The median fragment length (-mfl parameter) can be specified in MosaikBuild.\n" << endl;
				} else {
					settings.HasLocalAlignmentSearchRadius = true;
					settings.LocalAlignmentSearchRadius    = readGroup.MedianFragmentLength;
				}
			}
		}

		if ( !settings.HasLocalAlignmentSearchRadius && settings.HasLocalAlignmentSearchHighMqThreshold )
			cout << "WARNING: -lsh is enabled but -ls is not." << endl;

		if ( !settings.HasLocalAlignmentSearchRadius && settings.HasLocalAlignmentSearchLowMqThreshold )
			cout << "WARNING: -lsl is enabled but -ls is not." << endl;

		if ( settings.HasLocalAlignmentSearchHighMqThreshold || settings.HasLocalAlignmentSearchLowMqThreshold ) {
			if ( settings.LocalAlignmentSearchLowMqThreshold > settings.LocalAlignmentSearchHighMqThreshold ) {
				errorBuilder << ERROR_SPACER << "The high MQ threshold (-lsh) must be larger than low MQ threshold (-lsl)." << endl;
				foundError = true;
			}
		}

		// Note: These two statements are always tru
		//if (settings.LocalAlignmentSearchHighMqThreshold > 255) {
		//	errorBuilder << ERROR_SPACER << "The range of high MQ threshold (-lsh) is 0-255." << endl;
		//	foundError = true;
		//}

		//if ( settings.LocalAlignmentSearchLowMqThreshold > 255 ) {
		//	errorBuilder << ERROR_SPACER << "The range of low MQ threshold (-lsl) is 0-255." << endl;
		//	foundError = true;
		//}

		reader.Close();
	}

	// set the Smith-Waterman bandwidth
	if ( settings.HasBandwidth && ( ( settings.Bandwidth % 2 ) != 1  ) ) {
		errorBuilder << ERROR_SPACER << "The bandwidth must be an odd number. Use the -bw parameter to change the bandwidth." << endl;
		foundError = true;
	}

	switch(seqTech) {
		case ST_454:
			if(!settings.HasHomoPolymerGapOpenPenalty) {
				CPairwiseUtilities::UseHomoPolymerGapOpenPenalty = true;
				settings.HasHomoPolymerGapOpenPenalty            = true;
				settings.HomoPolymerGapOpenPenalty               = CPairwiseUtilities::HomoPolymerGapOpenPenalty;
			}
			break;
		case ST_SOLID:
			settings.EnableColorspace = true;
			break;
	}

	// check if we have a supplied basespace reference archive when aligning a SOLiD read archive
	if(seqTech == ST_SOLID) {

		// force -ibs to be basespace and -ia to be colorspace if aligning SOLiD reads
		if(settings.HasBasespaceReferencesFilename && settings.HasReferencesFilename) {
			MosaikReadFormat::CReferenceSequenceReader csRef, bsRef;

			// retrieve the colorspace status
			csRef.Open(settings.ReferencesFilename);
			ReferenceSequenceStatus csStatus = csRef.GetStatus();
			csRef.Close();

			// retrieve the basespace status
			bsRef.Open(settings.BasespaceReferencesFilename);
			ReferenceSequenceStatus bsStatus = bsRef.GetStatus();
			bsRef.Close();

			if(csStatus != REF_COLORSPACE) {
				errorBuilder << ERROR_SPACER << "Expected to find a colorspace reference sequence archive (" << settings.ReferencesFilename << ") with the -ia parameter, but found a basespace reference sequence archive." << endl;
				foundError = true;
			}

			if(bsStatus != REF_UNKNOWN) {
				errorBuilder << ERROR_SPACER << "Expected to find a basespace reference sequence archive (" << settings.BasespaceReferencesFilename << ") with the -ibs parameter, but found a colorspace reference sequence archive." << endl;
				foundError = true;
			}
		} 

		if(!settings.HasBasespaceReferencesFilename)  { 
			errorBuilder << ERROR_SPACER << "When aligning SOLiD read archives, both a colorspace reference archive AND a basespace reference archive are required. Use the -ibs parameter to supply the basespace reference archive filename." << endl;
			foundError = true;
		}

		//if( settings.UseLowMemory ) {
		//	errorBuilder << ERROR_SPACER << "The low-memory algorithm does not support for SOLiD reads yet." << endl;
		//	foundError = true;
		//}
	}

	// files for neural network for mapping quality calculation
	if (!settings.HasPeNeuralNetworkFilename) {
		errorBuilder << ERROR_SPACER << "An input paired-end neural-network file was not specified. Please use the -annpe parameter." << endl
		             << ERROR_SPACER << "   The file is on src/networkFile/2.1.26.pe.100.0065.ann." << endl;
		foundError = true;
	} else { // test the exietence of the file
		// doesn't show error message
		CFileUtilities::CheckFile(settings.PeNeuralNetworkFilename.c_str(), true);
	}
	if (!settings.HasSeNeuralNetworkFilename) {
		errorBuilder << ERROR_SPACER << "An input single-end neural-network file was not specified. Please use the -annse parameter." << endl
		             << ERROR_SPACER << "   The file is on src/networkFile/2.1.26.se.100.005.ann." << endl;
		foundError = true;
	} else { // test the exietence of the file
		CFileUtilities::CheckFile(settings.SeNeuralNetworkFilename.c_str(), true);
	}

	if (settings.OutputMultiplyIncomplete && settings.OutputMultiplyComplete) {
		foundError = true;
		errorBuilder << ERROR_SPACER << "-omi and -om are incompatible." << endl;
	}

	// print the errors if any were found
	if(foundError) {

		CConsole::Red();
		printf("ERROR: Some problems were encountered when parsing the command line options:\n");
		CConsole::Reset();

		printf("%s\n", errorBuilder.str().c_str());
		printf("For a complete list of command line options, type \"%s -h\"\n", argv[0]);
		exit(1);
	}

	// ===================================================
	// Parse configuration strings and set class variables
	// ===================================================

	// set the minimum alignment quality
	//if(settings.CheckAlignmentQuality) {
	//	CPairwiseUtilities::MinAlignmentQuality          = settings.AlignmentQualityThreshold;
	//	CPairwiseUtilities::UseMinAlignmentQualityFilter = true;
	//}

	// set the maximum number of mismatches
	if(settings.CheckNumMismatches) {
		CPairwiseUtilities::MaxNumMismatches  = settings.NumMismatches;
		CPairwiseUtilities::UseMismatchFilter = true;
	}

	// set the minimum number of aligned nucleotides
	if(settings.CheckMinAlignment) {
		CPairwiseUtilities::MinAlignment          = settings.MinimumAlignment;
		CPairwiseUtilities::UseMinAlignmentFilter = true;
	}

	// set the Smith-Waterman scores
	if(settings.HasMatchScore || settings.HasMismatchScore || settings.HasGapOpenPenalty || settings.HasGapExtendPenalty) {
		if(settings.HasMatchScore)       CPairwiseUtilities::MatchScore       = settings.MatchScore;
		if(settings.HasMismatchScore)    CPairwiseUtilities::MismatchScore    = settings.MismatchScore;
		if(settings.HasGapOpenPenalty)   CPairwiseUtilities::GapOpenPenalty   = settings.GapOpenPenalty;
		if(settings.HasGapExtendPenalty) CPairwiseUtilities::GapExtendPenalty = settings.GapExtendPenalty;
	}

	// set the Smith-Waterman h**o-polymer gap open penalty
	if(settings.HasHomoPolymerGapOpenPenalty) {
		CPairwiseUtilities::HomoPolymerGapOpenPenalty    = settings.HomoPolymerGapOpenPenalty;
		CPairwiseUtilities::UseHomoPolymerGapOpenPenalty = true;
	}

	// show warning message about unique alignments
	if(((readStatus & RS_PAIRED_END_READ) != 0) && (modeType != CAlignmentThread::AlignerMode_ALL)) {
		cout << "WARNING: A paired-end read archive was detected and the aligner mode (-m parameter) was not set to 'all'. Paired-end resolution in MosaikSort will be limited to unique vs unique reads.\n" << endl << endl;
	}

	// show warning messages dealing with the local alignment search radius
	if(settings.HasLocalAlignmentSearchRadius) {

		// show the warning message if we have a SE read archive
		if( ( readStatus & RS_SINGLE_END_READ ) != 0 ) {
			cout << "WARNING: A single-end read archive was detected and the local alignment search was enabled. Local alignment search only works with paired-end reads.\n" << endl << endl;
			settings.HasLocalAlignmentSearchRadius = false;
		} else { 

			// show the warning message if we have a PE read archive with no mean fragment length
			MosaikReadFormat::CReadReader in;
			in.Open(settings.ReadsFilename);
			MosaikReadFormat::ReadGroup readGroup = in.GetReadGroup();
			in.Close();

			if( readGroup.MedianFragmentLength == 0 ) {
				cout << "WARNING: Local alignment search only works when the median fragment length (-mfl parameter) has been specified in MosaikBuild.\n" << endl << endl;
				settings.HasLocalAlignmentSearchRadius = false;
			}
		}
	}

	// show warning message about using the local alignment search with SE read archives
	if(((readStatus & RS_SINGLE_END_READ) != 0) && settings.HasLocalAlignmentSearchRadius) {
		cout << "WARNING: A single-end read archive was detected and the local alignment search was enabled. Local alignment search only works with paired-end reads.\n" << endl << endl;
		settings.HasLocalAlignmentSearchRadius = false;
	}

	// start benchmarking
	CBenchmark bench;
	bench.Start();

	// create our aligner
	CMosaikAligner ma(settings.HashSize, algorithmType, modeType, settings.NumThreads, commandLine );

	// ===============
	// enable features
	// ===============

	// set zn tag reporting
	if (settings.EnableZnTag) ma.EnableZnTag();

	// set neural network filename
	if (settings.HasPeNeuralNetworkFilename) ma.SetPeNeuralNetworkFilename(settings.PeNeuralNetworkFilename);
	if (settings.HasSeNeuralNetworkFilename) ma.SetSeNeuralNetworkFilename(settings.SeNeuralNetworkFilename);

	// output multiply mapped alignments
	ma.OutputMultiply(settings.OutputMultiplyIncomplete, settings.OutputMultiplyComplete);
	
	// enable quiet mode
	if (settings.IsQuietMode) ma.SetQuietMode();

	// enable the hash positions threshold
	if(settings.LimitHashPositions) ma.EnableHashPositionThreshold(settings.HashPositionThreshold);

	// enable the hash region threshold
	if(settings.LimitHashRegions) ma.EnableHashRegionThreshold(settings.HashRegionThreshold);

	// enable the alignment candidate threshold
	if(settings.EnableAlignmentCandidateThreshold) ma.EnableAlignmentCandidateThreshold(settings.AlignmentCandidateThreshold);

	// enable unaligned read reporting if specified
	//if(settings.RecordUnalignedReads) ma.EnableUnalignedReadReporting(settings.UnalignedReadsFilename);

	// enable colorspace (SOLiD)
	if(settings.EnableColorspace) ma.EnableColorspace(settings.BasespaceReferencesFilename);

	// enable double-hash hits
	//if(settings.EnableDoubleHashHits) ma.EnableAlignmentCandidateThreshold(settings.HashSize + 1);

	// enable entire read length mismatch checking
	if(settings.UseAlignedLengthForMismatches) ma.UseAlignedReadLengthForMismatchCalculation();

	// enable the jump database
	if(settings.UseJumpDB) ma.EnableJumpDB(settings.JumpFilenameStub, settings.JumpCacheMemory, !settings.KeepJumpKeysOnDisk, !settings.KeepJumpPositionsOnDisk);

	// enable the local alignment search
	if(settings.HasLocalAlignmentSearchRadius) ma.EnableLocalAlignmentSearch(settings.LocalAlignmentSearchRadius);

	// set the Smith-Waterman bandwidth
	if(settings.HasBandwidth) ma.EnableBandedSmithWaterman(settings.Bandwidth);

	// enable low-memory algorithm
	if(settings.UseLowMemory) ma.EnableLowMemory();

	// not count gasp as mismatches
	if(settings.NotCountGapAsMismatch) ma.NotCountGapAsMismatch();

	// enables special references checker
	if(settings.HasSpecialReferencePrefix) {
		ma.EnableSpecialReference(settings.SpecialReferencePrefix);
		if ( settings.HasSpecialHashCount )
			ma.SetSpecialHashCount(settings.SpecialHashCount);
		else
			ma.SetSpecialHashCount(DEFAULT_SPECIAL_HASHES);
	}

	// set the trigger condition of local search
	if ( settings.HasLocalAlignmentSearchHighMqThreshold || settings.HasLocalAlignmentSearchLowMqThreshold ) 
		ma.SetLocalAlignmentSearchMqThreshold ( settings.LocalAlignmentSearchHighMqThreshold, settings.LocalAlignmentSearchLowMqThreshold );

	// set the mapping quality threshold for stat map
	if ( settings.HasStatMappingQuality ) ma.SetStatMappingQuality( settings.StatMappingQuality );



	// =============
	// set filenames
	// =============

	ma.SetFilenames(settings.ReadsFilename, settings.AlignmentsFilename, settings.ReferencesFilename);

	// ====================
	// echo enabled options
	// ====================

	cout << "- Using the following alignment algorithm: ";
	switch(algorithmType) {
		case CAlignmentThread::AlignerAlgorithm_FAST:
			cout << "single position (fast)" << endl;
			break;
		case CAlignmentThread::AlignerAlgorithm_SINGLE:
			cout << "single position" << endl;
			break;
		case CAlignmentThread::AlignerAlgorithm_MULTI:
			cout << "multiple position" << endl;
			break;
		case CAlignmentThread::AlignerAlgorithm_ALL:
			cout << "all positions" << endl;
			break;
		default:
			cout << "ERROR: Unknown alignment algorithm specified." << endl;
			exit(1);
			break;
	}

	cout << "- Using the following alignment mode: ";
	switch(modeType) {
		case CAlignmentThread::AlignerMode_ALL:
			cout << "aligning reads to all possible locations" << endl;
			break;
		case CAlignmentThread::AlignerMode_UNIQUE:
			cout << "only aligning unique reads" << endl;
			break;
		default:
			cout << "ERROR: Unknown alignment mode specified." << endl;
			exit(1);
			break;
	}

	//if(settings.CheckAlignmentQuality)    cout << "- Using an alignment quality threshold of " << CPairwiseUtilities::MinAlignmentQuality << endl;
	if(settings.CheckNumMismatches)       cout << "- Using a maximum mismatch threshold of " << CPairwiseUtilities::MaxNumMismatches << endl;
	if(settings.CheckMismatchPercent)     cout << "- Using a maximum mismatch percent threshold of " << CPairwiseUtilities::MaxMismatchPercent << endl;
	if(settings.CheckMinAlignment)        cout << "- Using a minimum alignment threshold of " << CPairwiseUtilities::MinAlignment << endl;
	if(settings.CheckMinAlignmentPercent) cout << "- Using a minimum percent alignment threshold of " << CPairwiseUtilities::MinPercentAlignment << endl;
	if(settings.HasHashSize)              cout << "- Using a hash size of " << (unsigned int)settings.HashSize << endl;
	//if(settings.EnableDoubleHashHits)     cout << "- Using double-hash hits" << endl;
	if(settings.EnableColorspace)         cout << "- Aligning in colorspace (SOLiD)" << endl;
	if(settings.HasNumThreads)            cout << "- Using " << (short)settings.NumThreads << (settings.NumThreads > 1 ? " processors" : " processor") << endl;
	if(settings.HasBandwidth)             cout << "- Using a Smith-Waterman bandwidth of " << settings.Bandwidth << endl;

	if(settings.EnableAlignmentCandidateThreshold) 
		cout << "- Using an alignment candidate threshold of " << (unsigned short)settings.AlignmentCandidateThreshold << "bp." << endl;

	if(settings.HasLocalAlignmentSearchRadius)
		cout << "- Using a local alignment search radius of " << settings.LocalAlignmentSearchRadius << "bp." << endl;

	if(settings.LimitHashPositions) cout << "- Setting hash position threshold to " << settings.HashPositionThreshold << endl;

	if(settings.UseJumpDB) {
		cout << "- Using a jump database for hashing";
		if(settings.HasJumpCacheMemory) cout << " with a " << settings.JumpCacheMemory << " element cache";
		cout << ".";

		if(!settings.KeepJumpKeysOnDisk && !settings.KeepJumpPositionsOnDisk)     cout << " Storing keys & positions in memory.";
		else if(!settings.KeepJumpKeysOnDisk && settings.KeepJumpPositionsOnDisk) cout << " Storing keys in memory.";
		else if(settings.KeepJumpKeysOnDisk && !settings.KeepJumpPositionsOnDisk) cout << " Storing positions in memory.";
		cout << endl;
	}

	//if(settings.RecordUnalignedReads)
	//	cout << "- Reporting all unaligned reads to " << settings.UnalignedReadsFilename << "." << endl;

	if(settings.HasHomoPolymerGapOpenPenalty) 
		cout << "- Using a h**o-polymer gap open penalty of " << CPairwiseUtilities::HomoPolymerGapOpenPenalty << endl;

	if(settings.HasMatchScore || settings.HasMismatchScore || settings.HasGapOpenPenalty || settings.HasGapExtendPenalty) 
		cout << "- Updating Smith-Waterman scoring scheme (match, mismatch, gap open, gap extend): (" <<
		CPairwiseUtilities::MatchScore << ", " << CPairwiseUtilities::MismatchScore << ", " <<
		CPairwiseUtilities::GapOpenPenalty << ", " << CPairwiseUtilities::GapExtendPenalty << ")" << endl;

	// ==============
	// Start aligning
	// ==============

	ma.AlignReadArchiveLowMemory();
	//ma.AlignReadArchive();

	// ==================
	// Show total runtime
	// ==================

	// stop benchmarking
	bench.Stop();

	// show the benchmarking results
	cout << endl;
	bench.DisplayTime("MosaikAligner");

	return 0;
}
Пример #2
0
// aligns the read archive
void CMosaikAligner::AlignReadArchive(MosaikReadFormat::CReadReader& in, MosaikReadFormat::CAlignmentWriter& out, unsigned int* pRefBegin, unsigned int* pRefEnd, char** pBsRefSeqs) {

	// ==============
	// initialization
	// ==============

	// retrieve the concatenated reference sequence length
	/*
	vector<ReferenceSequence> referenceSequences;

	MosaikReadFormat::CReferenceSequenceReader refseq;
	refseq.Open(mSettings.ReferenceFilename);
	refseq.GetReferenceSequences(referenceSequences);
	mReferenceLength = refseq.GetReferenceSequenceLength();
	const unsigned int numRefSeqs = refseq.GetNumReferenceSequences();

	// retrieve the basespace reference filenames
	char** pBsRefSeqs = NULL;
	if(mFlags.EnableColorspace) {

		cout << "- loading basespace reference sequences... ";
		cout.flush();

		MosaikReadFormat::CReferenceSequenceReader bsRefSeq;
		bsRefSeq.Open(mSettings.BasespaceReferenceFilename);

		if(!bsRefSeq.HasSameReferenceSequences(referenceSequences)) {
			printf("ERROR: The basespace and colorspace reference sequence archives do not seem to represent the same FASTA file.\n"); 
			exit(1);
		}

		bsRefSeq.CopyReferenceSequences(pBsRefSeqs);
		bsRefSeq.Close();

		cout << "finished." << endl;
	}

	// initialize our hash tables
	InitializeHashTables(CalculateHashTableSize(mReferenceLength, mSettings.HashSize));

	// hash the concatenated reference sequence
	if(!mFlags.IsUsingJumpDB) HashReferenceSequence(refseq);

	cout << "- loading reference sequence... ";
	cout.flush();
	refseq.LoadConcatenatedSequence(mReference);
	cout << "finished." << endl;

	refseq.Close();

	// create our reference sequence LUTs
	unsigned int* pRefBegin = new unsigned int[numRefSeqs];
	unsigned int* pRefEnd   = new unsigned int[numRefSeqs];

	for(unsigned int j = 0; j < numRefSeqs; j++) {
		pRefBegin[j] = referenceSequences[j].Begin;
		pRefEnd[j]   = referenceSequences[j].End;
	}


	// set the hash positions threshold
	if(mFlags.IsUsingHashPositionThreshold && (mAlgorithm == CAlignmentThread::AlignerAlgorithm_ALL)) 
		mpDNAHash->RandomizeAndTrimHashPositions(mSettings.HashPositionThreshold);

	// localize the read archive filenames
	string inputReadArchiveFilename  = mSettings.InputReadArchiveFilename;
	string outputReadArchiveFilename = mSettings.OutputReadArchiveFilename;

	// define our read format reader and writer
	MosaikReadFormat::CReadReader in;
	in.Open(inputReadArchiveFilename);
	MosaikReadFormat::ReadGroup readGroup = in.GetReadGroup();
	*/

	ReadStatus readStatus          = in.GetStatus();
	
	/*
	mSettings.SequencingTechnology = readGroup.SequencingTechnology;
	mSettings.MedianFragmentLength = readGroup.MedianFragmentLength;
	*/

	
	const bool isPairedEnd = (readStatus == RS_PAIRED_END_READ ? true : false);

	/*
	vector<MosaikReadFormat::ReadGroup> readGroups;
	readGroups.push_back(readGroup);

	// set the alignment status flags
	AlignmentStatus alignmentStatus = AS_UNSORTED_READ | readStatus;
	if(mMode == CAlignmentThread::AlignerMode_ALL) alignmentStatus |= AS_ALL_MODE;
	else alignmentStatus |= AS_UNIQUE_MODE;

	MosaikReadFormat::CAlignmentWriter out;
	out.Open(mSettings.OutputReadArchiveFilename.c_str(), referenceSequences, readGroups, alignmentStatus);
	*/

	// open the unaligned read report file
	FILE* unalignedStream = NULL;
	if(mFlags.IsReportingUnalignedReads) {
		if(fopen_s(&unalignedStream, mSettings.UnalignedReadReportFilename.c_str(), "wb") != 0) {
			cout << "ERROR: Unable to open the unaligned read FASTQ file for output." << endl;
			exit(1);
		}
	}
	


	// localize our read and reference counts. Initialize our statistical counters
	uint64_t numReadArchiveReads = in.GetNumReads();
	uint64_t readCounter = 0;

	// initialize our threads
	pthread_t* activeThreads = new pthread_t[mSettings.NumThreads];

	CAlignmentThread::ThreadData td;
	td.Algorithm           = mAlgorithm;
	td.ReferenceLen        = mReferenceLength;
	td.Filters             = mFilters;
        td.SplitFilters        = mSplitFilters;
	td.Flags               = mFlags;
	td.Mode                = mMode;
	td.pReference          = mReference;
	td.pCounters           = &mStatisticsCounters;
	td.pDnaHash            = mpDNAHash;
	td.pIn                 = &in;
	td.pOut                = &out;
	td.pUnalignedStream    = unalignedStream;
	td.pRefBegin           = pRefBegin;
	td.pRefEnd             = pRefEnd;
	td.Settings            = mSettings;
	td.pReadCounter        = &readCounter;
	td.IsPairedEnd         = isPairedEnd;
	td.pBsRefSeqs          = pBsRefSeqs;

	// unenable EnableColorspace flag for low-memory algorithm, deal with the SOLiD convertion when sorting the aligned archives
	//if ( mFlags.UseLowMemory )
	//	td.Flags.EnableColorspace = false;

	pthread_attr_t attr;
	pthread_attr_init(&attr);
	pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);

	pthread_mutex_init(&CAlignmentThread::mGetReadMutex,              NULL);
	pthread_mutex_init(&CAlignmentThread::mReportUnalignedMate1Mutex, NULL);
	pthread_mutex_init(&CAlignmentThread::mReportUnalignedMate2Mutex, NULL);
	pthread_mutex_init(&CAlignmentThread::mSaveReadMutex,             NULL);
	pthread_mutex_init(&CAlignmentThread::mStatisticsMutex,           NULL);
	pthread_mutex_init(&CAbstractDnaHash::mJumpCacheMutex,            NULL);
	pthread_mutex_init(&CAbstractDnaHash::mJumpKeyMutex,              NULL);
	pthread_mutex_init(&CAbstractDnaHash::mJumpPositionMutex,         NULL);

	// ===========================
	// start our alignment threads
	// ===========================

	// initialize our progress bar
	
	if ( !mFlags.UseLowMemory ) {
		CConsole::Heading();
		cout << endl;
	}
	cout << "Aligning read library (" << numReadArchiveReads << "):" << endl;
	if ( !mFlags.UseLowMemory )
	CConsole::Reset();

	CProgressBar<uint64_t>::StartThread(&readCounter, 0, numReadArchiveReads, "reads");

	// create our threads
	for(unsigned int i = 0; i < mSettings.NumThreads; i++)
		pthread_create(&activeThreads[i], &attr, CAlignmentThread::StartThread, (void*)&td);

	pthread_attr_destroy(&attr);

	CBenchmark alignmentBench;
	alignmentBench.Start();

	// wait for the threads to complete
	void* status = NULL;
	for(unsigned int i = 0; i < mSettings.NumThreads; i++) 
		pthread_join(activeThreads[i], &status);

	// wait for the progress bar to finish
	CProgressBar<uint64_t>::WaitThread();

	alignmentBench.Stop();

	// free up some memory
	//delete [] mReference;
	delete [] activeThreads;
	activeThreads = NULL;
	//if(pRefBegin) delete [] pRefBegin;
	//if(pRefEnd)   delete [] pRefEnd;

	//if(pBsRefSeqs) {
	//	for(unsigned int i = 0; i < numRefSeqs; ++i) delete [] pBsRefSeqs[i];
	//	delete [] pBsRefSeqs;
	//}

	// close open file streams
	//in.Close();
	
	// solid references should be one-base longer after converting back to basespace
	//if(mFlags.EnableColorspace) out.AdjustSolidReferenceBases();
	//out.Close();

	if(mFlags.IsReportingUnalignedReads) fclose(unalignedStream);
	//if(mFlags.IsUsingJumpDB) mpDNAHash->FreeMemory();

	// ====================
	// print our statistics
	// ====================
	/*
	const uint64_t totalMates = mStatisticsCounters.ShortMates +
		mStatisticsCounters.FailedHashMates +
		mStatisticsCounters.UniqueMates +
		mStatisticsCounters.NonUniqueMates +
		mStatisticsCounters.FilteredOutMates;

	const uint64_t totalAlignedMates = mStatisticsCounters.UniqueMates + mStatisticsCounters.NonUniqueMates;
	const uint64_t totalAlignedReads = mStatisticsCounters.AlignedReads;

	// print our alignment statistics (mates) if don't enable low-memory algorithm
	if ( !mFlags.UseLowMemory ) {

	printf("\n");
	CConsole::Heading(); printf("Alignment statistics (mates):\n"); CConsole::Reset();
	printf("===================================\n");

	if(mStatisticsCounters.ShortMates > 0)
		printf("# too short:    %9llu (%5.1f %%)\n", (unsigned long long)mStatisticsCounters.ShortMates,       (mStatisticsCounters.ShortMates       / (double)totalMates) * 100.0);

	if(mStatisticsCounters.FailedHashMates > 0)
		printf("# failed hash:  %9llu (%5.1f %%)\n", (unsigned long long)mStatisticsCounters.FailedHashMates,  (mStatisticsCounters.FailedHashMates  / (double)totalMates) * 100.0);

	if(mStatisticsCounters.FilteredOutMates > 0)
		printf("# filtered out: %9llu (%5.1f %%)\n", (unsigned long long)mStatisticsCounters.FilteredOutMates, (mStatisticsCounters.FilteredOutMates / (double)totalMates) * 100.0);

	if(mStatisticsCounters.UniqueMates > 0)
		printf("# unique:       %9llu (%5.1f %%)\n", (unsigned long long)mStatisticsCounters.UniqueMates,      (mStatisticsCounters.UniqueMates      / (double)totalMates) * 100.0);

	if(mStatisticsCounters.NonUniqueMates > 0)
		printf("# non-unique:   %9llu (%5.1f %%)\n", (unsigned long long)mStatisticsCounters.NonUniqueMates,   (mStatisticsCounters.NonUniqueMates   / (double)totalMates) * 100.0);

	printf("-----------------------------------\n");
	printf("total:          %9llu\n", (unsigned long long)totalMates);
	printf("total aligned:  ");
	CConsole::Bold(); printf("%9llu", (unsigned long long)totalAlignedMates); CConsole::Reset();
	printf(" (");
	CConsole::Bold(); printf("%5.1f %%", (totalAlignedMates / (double)totalMates) * 100.0); CConsole::Reset();
	printf(")\n");

	// print our local alignment search statistics
	if(mFlags.UseLocalAlignmentSearch) {
		printf("\n");
		CConsole::Heading(); printf("Local alignment search statistics:\n"); CConsole::Reset();
		printf("===================================\n");

		double rescuedAlignmentsPercent = mStatisticsCounters.AdditionalLocalMates / (double)totalMates * 100.0;
		printf("rescued mates:  %9llu (%5.1f %%)\n", (unsigned long long)mStatisticsCounters.AdditionalLocalMates, rescuedAlignmentsPercent);
	}

	// print our alignment statistics (reads)
	if(isPairedEnd) {
		printf("\n");
		CConsole::Heading(); printf("Alignment statistics (reads):\n"); CConsole::Reset();
		printf("============================================\n");
		printf("# unaligned:             %9llu (%5.1f %%)\n", (unsigned long long)mStatisticsCounters.UnalignedReads,     (mStatisticsCounters.UnalignedReads     / (double)numReadArchiveReads) * 100.0);
		printf("# orphaned:              %9llu (%5.1f %%)\n", (unsigned long long)mStatisticsCounters.OrphanedReads,      (mStatisticsCounters.OrphanedReads      / (double)numReadArchiveReads) * 100.0);
		printf("# both mates unique:     %9llu (%5.1f %%)\n", (unsigned long long)mStatisticsCounters.BothUniqueReads,    (mStatisticsCounters.BothUniqueReads    / (double)numReadArchiveReads) * 100.0);
		printf("# one mate non-unique:   %9llu (%5.1f %%)\n", (unsigned long long)mStatisticsCounters.OneNonUniqueReads,  (mStatisticsCounters.OneNonUniqueReads  / (double)numReadArchiveReads) * 100.0);
		printf("# both mates non-unique: %9llu (%5.1f %%)\n", (unsigned long long)mStatisticsCounters.BothNonUniqueReads, (mStatisticsCounters.BothNonUniqueReads / (double)numReadArchiveReads) * 100.0);
		printf("--------------------------------------------\n");
		printf("total reads:             ");
		CConsole::Bold(); printf("%9llu", (unsigned long long)numReadArchiveReads); CConsole::Reset();
		printf("\n");
		printf("total reads aligned:     ");
		CConsole::Bold(); printf("%9llu", (unsigned long long)totalAlignedReads); CConsole::Reset();
		printf(" (");
		CConsole::Bold(); printf("%5.1f %%", (totalAlignedReads / (double)numReadArchiveReads) * 100.0); CConsole::Reset();
		printf(")\n");
	}

	// print our jump cache statistics
	if(mFlags.IsUsingJumpDB && (mSettings.NumCachedHashes > 0)) {
		printf("\n");
		CConsole::Heading(); printf("Jump database cache statistics:\n"); CConsole::Reset();
		printf("====================================\n");

		uint64_t cacheHits = 0, cacheMisses = 0, cacheTotal = 0;
		CJumpDnaHash* pJump = (CJumpDnaHash*)mpDNAHash;
		pJump->GetCacheStatistics(cacheHits, cacheMisses);

		cacheTotal = cacheHits + cacheMisses;
		double cacheHitsPercent = cacheHits / (double)cacheTotal * 100.0;

		printf("cache hits:   %10llu (%5.1f %%)\n", (unsigned long long)cacheHits, cacheHitsPercent);
		printf("cache misses: %10llu\n", (unsigned long long)cacheMisses);
	}

	printf("\n");
	CConsole::Heading(); printf("Miscellaneous statistics:\n"); CConsole::Reset();
	printf("==================================\n");
	printf("aligned mate bp:        %10llu\n", (unsigned long long)mStatisticsCounters.MateBasesAligned);
	printf("alignment candidates/s: %10.1f\n", mStatisticsCounters.AlignmentCandidates / alignmentBench.GetElapsedWallTime());
	}
	*/
	
}
Пример #3
0
// print our statistics
void CMosaikAligner::PrintStatistics () {

	MosaikReadFormat::CReadReader in;
        string inputReadArchiveFilename  = mSettings.InputReadArchiveFilename;
	in.Open(inputReadArchiveFilename);

	const uint64_t numReadArchiveReads = in.GetNumReads();
	ReadStatus readStatus        = in.GetStatus();
	const bool isPairedEnd = (readStatus == RS_PAIRED_END_READ ? true : false);
	const uint64_t totalMates = isPairedEnd ? numReadArchiveReads * 2 : numReadArchiveReads;
	
	if ( mFlags.UseLowMemory ) {
		mStatisticsCounters.ShortMates       = 0;
		mStatisticsCounters.FailedHashMates  = 0;
		mStatisticsCounters.UnalignedReads   = numReadArchiveReads - mStatisticsCounters.AlignedReads;
		mStatisticsCounters.FilteredOutMates = mStatisticsCounters.UnalignedReads;
	}

	const uint64_t totalAlignedMates = mStatisticsCounters.UniqueMates + mStatisticsCounters.NonUniqueMates;
	const uint64_t totalAlignedReads = mStatisticsCounters.AlignedReads;

	// print our alignment statistics (mates) if don't enable low-memory algorithm

	printf("\n");
	CConsole::Heading(); printf("Alignment statistics (mates):\n"); CConsole::Reset();
	printf("===================================\n");

	if(mStatisticsCounters.ShortMates > 0)
		printf("# too short:    %9llu (%5.1f %%)\n", (unsigned long long)mStatisticsCounters.ShortMates,       (mStatisticsCounters.ShortMates       / (double)totalMates) * 100.0);

	if(mStatisticsCounters.FailedHashMates > 0)
		printf("# failed hash:  %9llu (%5.1f %%)\n", (unsigned long long)mStatisticsCounters.FailedHashMates,  (mStatisticsCounters.FailedHashMates  / (double)totalMates) * 100.0);

	if(mStatisticsCounters.FilteredOutMates > 0)
		printf("# filtered out: %9llu (%5.1f %%)\n", (unsigned long long)mStatisticsCounters.FilteredOutMates, (mStatisticsCounters.FilteredOutMates / (double)totalMates) * 100.0);

	if(mStatisticsCounters.UniqueMates > 0)
		printf("# unique:       %9llu (%5.1f %%)\n", (unsigned long long)mStatisticsCounters.UniqueMates,      (mStatisticsCounters.UniqueMates      / (double)totalMates) * 100.0);

	if(mStatisticsCounters.NonUniqueMates > 0)
		printf("# non-unique:   %9llu (%5.1f %%)\n", (unsigned long long)mStatisticsCounters.NonUniqueMates,   (mStatisticsCounters.NonUniqueMates   / (double)totalMates) * 100.0);

	printf("-----------------------------------\n");
	printf("total:          %9llu\n", (unsigned long long)totalMates);
	printf("total aligned:  ");
	CConsole::Bold(); printf("%9llu", (unsigned long long)totalAlignedMates); CConsole::Reset();
	printf(" (");
	CConsole::Bold(); printf("%5.1f %%", (totalAlignedMates / (double)totalMates) * 100.0); CConsole::Reset();
	printf(")\n");

	// print our local alignment search statistics
	// we don't print out local alignment information when the low-memory approach is enabled.
	if( !mFlags.UseLowMemory && mFlags.UseLocalAlignmentSearch ) {
		printf("\n");
		CConsole::Heading(); printf("Local alignment search statistics:\n"); CConsole::Reset();
		printf("===================================\n");

		double rescuedAlignmentsPercent = mStatisticsCounters.AdditionalLocalMates / (double)totalMates * 100.0;
		printf("rescued mates:  %9llu (%5.1f %%)\n", (unsigned long long)mStatisticsCounters.AdditionalLocalMates, rescuedAlignmentsPercent);
	}

	// print our alignment statistics (reads)
	if(isPairedEnd) {
		printf("\n");
		CConsole::Heading(); printf("Alignment statistics (reads):\n"); CConsole::Reset();
		printf("============================================\n");
		printf("# unaligned:             %9llu (%5.1f %%)\n", (unsigned long long)mStatisticsCounters.UnalignedReads,     (mStatisticsCounters.UnalignedReads     / (double)numReadArchiveReads) * 100.0);
		printf("# orphaned:              %9llu (%5.1f %%)\n", (unsigned long long)mStatisticsCounters.OrphanedReads,      (mStatisticsCounters.OrphanedReads      / (double)numReadArchiveReads) * 100.0);
		printf("# both mates unique:     %9llu (%5.1f %%)\n", (unsigned long long)mStatisticsCounters.BothUniqueReads,    (mStatisticsCounters.BothUniqueReads    / (double)numReadArchiveReads) * 100.0);
		printf("# one mate non-unique:   %9llu (%5.1f %%)\n", (unsigned long long)mStatisticsCounters.OneNonUniqueReads,  (mStatisticsCounters.OneNonUniqueReads  / (double)numReadArchiveReads) * 100.0);
		printf("# both mates non-unique: %9llu (%5.1f %%)\n", (unsigned long long)mStatisticsCounters.BothNonUniqueReads, (mStatisticsCounters.BothNonUniqueReads / (double)numReadArchiveReads) * 100.0);
		printf("--------------------------------------------\n");
		printf("total reads:             ");
		CConsole::Bold(); printf("%9llu", (unsigned long long)numReadArchiveReads); CConsole::Reset();
		printf("\n");
		printf("total reads aligned:     ");
		CConsole::Bold(); printf("%9llu", (unsigned long long)totalAlignedReads); CConsole::Reset();
		printf(" (");
		CConsole::Bold(); printf("%5.1f %%", (totalAlignedReads / (double)numReadArchiveReads) * 100.0); CConsole::Reset();
		printf(")\n");
	}

	// print our jump cache statistics
	if( !mFlags.UseLowMemory && mFlags.IsUsingJumpDB && (mSettings.NumCachedHashes > 0)) {
		printf("\n");
		CConsole::Heading(); printf("Jump database cache statistics:\n"); CConsole::Reset();
		printf("====================================\n");

		uint64_t cacheHits = 0, cacheMisses = 0, cacheTotal = 0;
		CJumpDnaHash* pJump = (CJumpDnaHash*)mpDNAHash;
		pJump->GetCacheStatistics(cacheHits, cacheMisses);

		cacheTotal = cacheHits + cacheMisses;
		double cacheHitsPercent = cacheHits / (double)cacheTotal * 100.0;

		printf("cache hits:   %10llu (%5.1f %%)\n", (unsigned long long)cacheHits, cacheHitsPercent);
		printf("cache misses: %10llu\n", (unsigned long long)cacheMisses);
	}

	//if ( !mFlags.UseLowMemory ) {
	//	printf("\n");
	//	CConsole::Heading(); printf("Miscellaneous statistics:\n"); CConsole::Reset();
	//	printf("==================================\n");
	//	printf("aligned mate bp:        %10llu\n", (unsigned long long)mStatisticsCounters.MateBasesAligned);
	//	printf("alignment candidates/s: %10.1f\n", mStatisticsCounters.AlignmentCandidates / alignmentBench.GetElapsedWallTime());
	//}
	
	
	
}
Пример #4
0
void CMosaikAligner::AlignReadArchiveLowMemory(void) {

	// ==============
	// initialization
	// ==============

	// retrieve the concatenated reference sequence length
	// vector<ReferenceSequence> referenceSequences;

	MosaikReadFormat::CReferenceSequenceReader refseq;
	refseq.Open(mSettings.ReferenceFilename);
	refseq.GetReferenceSequences(referenceSequences);
	mReferenceLength = refseq.GetReferenceSequenceLength();
	const unsigned int numRefSeqs = refseq.GetNumReferenceSequences();
	refseq.Close();

	// retrieve the basespace reference filenames
	//char** pBsRefSeqs = NULL;
	if(mFlags.EnableColorspace) {
		
		MosaikReadFormat::CReferenceSequenceReader bsRefSeq;
		bsRefSeq.Open(mSettings.BasespaceReferenceFilename);

		if(!bsRefSeq.HasSameReferenceSequences(referenceSequences)) {
			printf("ERROR: The basespace and colorspace reference sequence archives do not seem to represent the same FASTA file.\n"); 
			exit(1);
		}

		bsRefSeq.Close();
	}

	// initialize our hash tables
	//InitializeHashTables(CalculateHashTableSize(mReferenceLength, mSettings.HashSize));

	// hash the concatenated reference sequence
	//if(!mFlags.IsUsingJumpDB) {
	//	InitializeHashTables(CalculateHashTableSize(mReferenceLength, mSettings.HashSize), 0, 0, 0);
	//	HashReferenceSequence(refseq);
	//}

	//cout << "- loading reference sequence... ";
	//cout.flush();
	//refseq.LoadConcatenatedSequence(mReference);
	//cout << "finished." << endl;

	// create our reference sequence LUTs
	//unsigned int* pRefBegin = new unsigned int[numRefSeqs];
	//unsigned int* pRefEnd   = new unsigned int[numRefSeqs];
	//
	//for(unsigned int j = 0; j < numRefSeqs; j++) {
	//	pRefBegin[j] = referenceSequences[j].Begin;
	//	pRefEnd[j]   = referenceSequences[j].End;
	//}

	string inputReadArchiveFilename  = mSettings.InputReadArchiveFilename;
	

	if ( !mFlags.UseLowMemory ) {
		
		// prepare BS reference sequence for SOLiD data
		char** pBsRefSeqs = NULL;
		if(mFlags.EnableColorspace) {

			cout << "- loading basespace reference sequences... ";
			cout.flush();

			MosaikReadFormat::CReferenceSequenceReader bsRefSeq;
			bsRefSeq.Open(mSettings.BasespaceReferenceFilename);


			bsRefSeq.CopyReferenceSequences(pBsRefSeqs);
			bsRefSeq.Close();

			cout << "finished." << endl;
		}

		// prepare reference sequence
		refseq.Open(mSettings.ReferenceFilename);
		cout << "- loading reference sequence... ";
		cout.flush();
		refseq.LoadConcatenatedSequence(mReference);
		cout << "finished." << endl;
		refseq.Close();
		
		unsigned int* pRefBegin = new unsigned int[numRefSeqs];
		unsigned int* pRefEnd   = new unsigned int[numRefSeqs];
		for(unsigned int j = 0; j < numRefSeqs; j++) {
			pRefBegin[j] = referenceSequences[j].Begin;
			pRefEnd[j]   = referenceSequences[j].End;
		}
		
		// initialize our hash tables
		if(!mFlags.IsUsingJumpDB) {
			InitializeHashTables(CalculateHashTableSize(mReferenceLength, mSettings.HashSize), 0, 0, 0, mFlags.UseLowMemory, 0);
			HashReferenceSequence(refseq);
		}
		else {
			InitializeHashTables(CalculateHashTableSize(mReferenceLength, mSettings.HashSize), pRefBegin[0], pRefEnd[numRefSeqs - 1], 0, mFlags.UseLowMemory, 0);
			mpDNAHash->LoadKeysNPositions();
		}

		// set the hash positions threshold
		if(mFlags.IsUsingHashPositionThreshold && (mAlgorithm == CAlignmentThread::AlignerAlgorithm_ALL))
			mpDNAHash->RandomizeAndTrimHashPositions(mSettings.HashPositionThreshold);

		// localize the read archive filenames
		string outputReadArchiveFilename = mSettings.OutputReadArchiveFilename;

		// define our read format reader and writer
		MosaikReadFormat::CReadReader in;
		in.Open(inputReadArchiveFilename);
		MosaikReadFormat::ReadGroup readGroup = in.GetReadGroup();
		ReadStatus readStatus          = in.GetStatus();
		mSettings.SequencingTechnology = readGroup.SequencingTechnology;
		mSettings.MedianFragmentLength = readGroup.MedianFragmentLength;

		vector<MosaikReadFormat::ReadGroup> readGroups;
		readGroups.push_back(readGroup);

		// set the alignment status flags
		AlignmentStatus alignmentStatus = AS_UNSORTED_READ | readStatus;
		if(mMode == CAlignmentThread::AlignerMode_ALL) alignmentStatus |= AS_ALL_MODE;
		else alignmentStatus |= AS_UNIQUE_MODE;

		MosaikReadFormat::CAlignmentWriter out;
		out.Open(mSettings.OutputReadArchiveFilename.c_str(), referenceSequences, readGroups, alignmentStatus, ALIGNER_SIGNATURE);

		AlignReadArchive(in, out, pRefBegin, pRefEnd, pBsRefSeqs);

		// close open file streams
		in.Close();

		// solid references should be one-base longer after converting back to basespace
		if(mFlags.EnableColorspace) out.AdjustSolidReferenceBases();
		out.Close();

		// free memory
		if(mFlags.IsUsingJumpDB) mpDNAHash->FreeMemory();
		if(pRefBegin)  delete [] pRefBegin;
		if(pRefEnd)    delete [] pRefEnd;
		if(mReference) delete [] mReference;
		if(pBsRefSeqs) {
			for(unsigned int i = 0; i < numRefSeqs; ++i) delete [] pBsRefSeqs[i];
			delete [] pBsRefSeqs;
		}
		pRefBegin  = NULL;
		pRefEnd    = NULL;
		mReference = NULL;
		pBsRefSeqs = NULL;
	}
	else {
		// grouping reference and store information in referenceGroups vector
		// vector< pair <unsigned int, unsigned int> > referenceGroups;
		GroupReferences();
		
		// get hash statistics for adjusting mhp for each reference group and reserve memory
		vector< unsigned int > nHashs;             // the numbers of hash positions in each reference group
		vector< unsigned int > expectedMemories;   // the numbers of hashs in each reference group
		uint64_t nTotalHash;
		GetHashStatistics( nHashs, expectedMemories, nTotalHash );
		
		// align reads again per chromosome group
		for ( unsigned int i = 0; i < referenceGroups.size(); i++) {

	        	unsigned int startRef = referenceGroups[i].first;
			unsigned int endRef   = referenceGroups[i].first + referenceGroups[i].second - 1;
			CConsole::Heading();
		        if ( referenceGroups[i].second > 1 )
				cout << endl << "Aligning chromosome " << startRef + 1 << "-" << endRef + 1 << " (of " << numRefSeqs << "):" << endl;
			else
				cout << endl << "Aligning chromosome " << startRef + 1 << " (of " << numRefSeqs << "):" << endl;
		        CConsole::Reset();

			// initialize our hash tables
			// calculate expected memories for jump data
			unsigned int expectedMemory = nHashs[i] + expectedMemories[i];
			// reserve 3% more memory for unexpected usage
			expectedMemory =  expectedMemory * 1.03;

			InitializeHashTables(0, referenceSequences[startRef].Begin, referenceSequences[endRef].End, referenceSequences[startRef].Begin, mFlags.UseLowMemory, expectedMemory);

			// set the hash positions threshold
			if(mFlags.IsUsingHashPositionThreshold && (mAlgorithm == CAlignmentThread::AlignerAlgorithm_ALL)) { 
				double ratio = nHashs[i] / (double)nTotalHash;
				unsigned int positionThreshold = ceil(ratio * (double)mSettings.HashPositionThreshold);
				//cout << positionThreshold << endl;
				mpDNAHash->RandomizeAndTrimHashPositions(positionThreshold);
			}

			// load jump data
			mpDNAHash->LoadKeysNPositions();

			// set reference information
			unsigned int* pRefBegin = new unsigned int[referenceGroups[i].second];
			unsigned int* pRefEnd   = new unsigned int[referenceGroups[i].second];
			for ( unsigned int j = 0; j < referenceGroups[i].second; j++ ){
				pRefBegin[j] = referenceSequences[startRef+j].Begin - referenceSequences[startRef].Begin;
				pRefEnd[j]   = referenceSequences[startRef+j].End   - referenceSequences[startRef].Begin;
			}

			// prepare BS reference sequence for SOLiD data
			char** pBsRefSeqs = NULL;
			if(mFlags.EnableColorspace) {
	
				cout << "- loading basespace reference sequences... ";
				cout.flush();

				MosaikReadFormat::CReferenceSequenceReader bsRefSeq;
				bsRefSeq.Open(mSettings.BasespaceReferenceFilename);


				bsRefSeq.CopyReferenceSequences(pBsRefSeqs, startRef, referenceGroups[i].second);
				bsRefSeq.Close();

				cout << "finished." << endl;
			}

			// prepare reference sequence
			refseq.Open(mSettings.ReferenceFilename);
			cout << "- loading reference sequence... ";
			cout.flush();
			//refseq.LoadConcatenatedSequence(mReference);
			refseq.LoadConcatenatedSequence(mReference, startRef, referenceGroups[i].second);
			refseq.Close();

			// trim reference sequence
			//unsigned int chrLength = referenceSequences[endRef].End - referenceSequences[startRef].Begin + 1;
			//char* chrReference  = new char[ chrLength + 1 ];
			//char* mReferencePtr = mReference + referenceSequences[startRef].Begin;
			//memcpy( chrReference, mReferencePtr, chrLength);
			//chrReference[chrLength] = 0;
			//delete [] mReference;
			//mReference = chrReference;
			cout << "finished." << endl;
			
			
			// localize the read archive filenames
			// get a temporary file name
			string tempFilename;
			CFileUtilities::GetTempFilename(tempFilename);
			outputFilenames.push_back(tempFilename);

			// define our read format reader and writer
			MosaikReadFormat::CReadReader in;
			in.Open(inputReadArchiveFilename);
			MosaikReadFormat::ReadGroup readGroup = in.GetReadGroup();
			ReadStatus readStatus          = in.GetStatus();
			mSettings.SequencingTechnology = readGroup.SequencingTechnology;
			mSettings.MedianFragmentLength = readGroup.MedianFragmentLength;

			vector<MosaikReadFormat::ReadGroup> readGroups;
			readGroups.push_back(readGroup);

			// set the alignment status flags
			AlignmentStatus alignmentStatus = AS_UNSORTED_READ | readStatus;
			if(mMode == CAlignmentThread::AlignerMode_ALL) alignmentStatus |= AS_ALL_MODE;
			else alignmentStatus |= AS_UNIQUE_MODE;

			// prepare a new vector for the current chromosome for opening out archive
			vector<ReferenceSequence> smallReferenceSequences;
			for ( unsigned int j = 0; j < referenceGroups[i].second; j++ ){
				smallReferenceSequences.push_back(referenceSequences[startRef+j]);
			}

			MosaikReadFormat::CAlignmentWriter out;
			out.Open(tempFilename.c_str(), smallReferenceSequences, readGroups, alignmentStatus, ALIGNER_SIGNATURE);
			out.AdjustPartitionSize(20000/referenceGroups.size());


			AlignReadArchive(in, out, pRefBegin, pRefEnd, pBsRefSeqs);

			// close open file streams
			in.Close();

			// solid references should be one-base longer after converting back to basespace
			if(mFlags.EnableColorspace) out.AdjustSolidReferenceBases();
			out.Close();

			// free memory
			if(mFlags.IsUsingJumpDB) mpDNAHash->FreeMemory();
			if(pRefBegin)  delete [] pRefBegin;
			if(pRefEnd)    delete [] pRefEnd;
			if(mReference) delete [] mReference;
			if(pBsRefSeqs) {
				for(unsigned int j = 0; j < referenceGroups[i].second; j++)
					delete [] pBsRefSeqs[j];
				
				delete [] pBsRefSeqs;
			}
			pRefBegin  = NULL;
			pRefEnd    = NULL;
			mReference = NULL;
			pBsRefSeqs = NULL;
		}
	}

	if ( mFlags.UseLowMemory )
		MergeArchives();

	PrintStatistics();
}