Ejemplo n.º 1
0
// Constructs a merger over a set of alignment archives.
//
// The constructor reads the read groups and alignment status from the first
// input archive, gathers the reference sequences of every input archive, and
// prepares the two BAM headers (_sHeader and _rHeader) plus the mapping
// quality calculator.
//
// Parameters:
//   inputFilenames             - archives to merge; when hasSpecial is set, the
//                                last archive is the one containing alignments
//                                located at special references
//   outputFilename             - stored in _outputFilename
//   readNo                     - stored in _readNo (presumably a progress
//                                counter shared with the caller -- TODO confirm)
//   isSolid                    - stored in _isSolid
//   commandLine                - recorded as the CL field of both headers
//   paired_end_ann_file,
//   single_end_ann_file        - passed to _mqCalculator.Open()
//   fragmentLength             - stored in _expectedFragmentLength
//   localAlignmentSearchRadius - stored in _localAlignmentSearchRadius
//   hasSpecial                 - whether the last archive holds special references
//   statMappingQuality         - stored in _statMappingQuality
//
// An empty inputFilenames list, or a first archive without read groups, leaves
// the corresponding members in their value-initialized state instead of
// indexing past the end of an empty vector.
CArchiveMerge::CArchiveMerge ( 
	const vector <string>& inputFilenames, 
	const string& outputFilename, 
	uint64_t           *readNo,
	const bool&          isSolid,
	const string&        commandLine,
	const string&        paired_end_ann_file,
	const string&        single_end_ann_file,
	const unsigned int&  fragmentLength,
	const unsigned int&  localAlignmentSearchRadius,
	const bool&          hasSpecial,
	const unsigned char& statMappingQuality)
	
	: _inputFilenames(inputFilenames)
	, _outputFilename(outputFilename)
	, _readNo                    (readNo)
	, _isSolid                   (isSolid)
	, _expectedFragmentLength    (fragmentLength)
	, _localAlignmentSearchRadius(localAlignmentSearchRadius)
	, _refIndex()
	, _referenceSequences()
	, _referenceSequencesWoSpecial()
	, _readGroups()
	, _alignmentStatus(0)
	, _isPairedEnd(false)
	, _hasSpecial                (hasSpecial)
	, _statMappingQuality        (statMappingQuality)
	, _sequencingTechnologies()
	, _readGroupsMap()
	, _specialArchiveName()
	, _specialCode1()
	, _specialCode2()
	, _specialReader()
	, _specialAl()
	, _special_owner(0)
	, _specialArchiveEmpty(true)
	, _specialReferenceSequences()
	, _counters()
	, _statisticsMaps()
	, _sHeader()
	, _rHeader()
	, _sBam()
	, _rBam()
	, za1()
	, za2()
	, _entropy()
	, _mqCalculator()
{
	MosaikReadFormat::CAlignmentReader reader;

	// the first archive supplies the read groups and the alignment status;
	// skip it when there is no archive at all rather than indexing an empty vector
	if ( !_inputFilenames.empty() ) {
		reader.Open( _inputFilenames[0] );
		reader.GetReadGroups(_readGroups);
		_alignmentStatus = reader.GetStatus();
		reader.Close();
	}

	_isPairedEnd = ( _alignmentStatus & AS_PAIRED_END_READ ) != 0;

	_refIndex.resize(_inputFilenames.size(), 0);

	// assign each read group its code and index the groups by that code
	for ( vector<MosaikReadFormat::ReadGroup>::iterator ite = _readGroups.begin(); ite != _readGroups.end(); ++ite ) {
		ite->ReadGroupCode = ite->GetCode( *ite );
		_readGroupsMap[ ite->ReadGroupCode ] = *ite ;
	}

	// the sequencing technology is taken from the first read group, if any
	if ( !_readGroups.empty() )
		_sequencingTechnologies = _readGroups[0].SequencingTechnology;

	// the last archive is the one containing alignments located at special references
	for ( unsigned int i = 0; i < _inputFilenames.size(); i++ ) {
		reader.Open( _inputFilenames[i] );
		
		// grab reference info from the archive
		vector<ReferenceSequence> referenceSequences;
		reader.GetReferenceSequences(referenceSequences);

		CopyReferenceString( referenceSequences );
		
		_referenceSequences.insert( _referenceSequences.end(), referenceSequences.begin(), referenceSequences.end() );

		// _refIndex[i] is the running total of references up to and including archive i
		_refIndex[i] = ( i == 0 ) ? referenceSequences.size() : referenceSequences.size() + _refIndex[i-1];

		if ( _hasSpecial ) {
			const bool isLastArchive = ( i == ( _inputFilenames.size() - 1 ) );
			if ( isLastArchive ) {
				// the special references live in the last archive only
				_specialReferenceSequences.insert( _specialReferenceSequences.end(), referenceSequences.begin(), referenceSequences.end() );
			} else {
				// every other archive contributes to the list without special references
				_referenceSequencesWoSpecial.insert( _referenceSequencesWoSpecial.end(), referenceSequences.begin(), referenceSequences.end() );
			}
		}

		reader.Close();
	}

	// without special references both lists are identical
	if (!_hasSpecial)
		_referenceSequencesWoSpecial = _referenceSequences;

	_sHeader.SortOrder = SORTORDER_UNSORTED;
	_rHeader.SortOrder = SORTORDER_UNSORTED;

	// _sHeader lists every reference, _rHeader omits the special ones
	_sHeader.pReferenceSequences = &_referenceSequences;
	_rHeader.pReferenceSequences = &_referenceSequencesWoSpecial;

	_sHeader.pReadGroups = &_readGroups;
	_rHeader.pReadGroups = &_readGroups;

	// program group: "<major>.<minor>.<build>" version string, built once
	stringstream ss;
	ss << static_cast<int>(MOSAIK_MAJOR_VERSION) << "." << static_cast<int>(MOSAIK_MINOR_VERSION) << "." << static_cast<int>(MOSAIK_BUILD_VERSION);
	const string version = ss.str();

	_sHeader.pg.ID = "MosaikAligner";
	_rHeader.pg.ID = "MosaikAligner";
	_sHeader.pg.VN = version;
	_rHeader.pg.VN = version;
	_sHeader.pg.CL = commandLine;
	_rHeader.pg.CL = commandLine;

	_mqCalculator.Open(paired_end_ann_file, single_end_ann_file);

}
Ejemplo n.º 2
0
void CMosaikAligner::MergeArchives(void) {
	
	// set active threads
	unsigned int nThread = ( mSettings.NumThreads < outputFilenames.size() ) ? mSettings.NumThreads : outputFilenames.size();
	
	vector< string > temporaryFiles;
	temporaryFiles.resize( outputFilenames.size() );
	for ( unsigned int i = 0; i < outputFilenames.size(); i++ ) {
		string tempFilename;
		CFileUtilities::GetTempFilename(tempFilename);
		temporaryFiles[i] = tempFilename;
	}

        // calculate total # of reads
        unsigned int nReads = 0;
        for ( unsigned int i = 0 ; i < outputFilenames.size(); i++ ) {
	        MosaikReadFormat::CAlignmentReader reader;
                reader.Open( outputFilenames[i] );
                nReads += reader.GetNumReads();
                reader.Close();
        }


	// if nThread is too large, it'll open too many files at the same time.
	// Then, we'll get an error since system doesn't allow us to open any file.
	if ( nThread > 7 )
		nThread = 7;

	// prepare reference offset vector for SOLiD
	//vector<unsigned int> refOffsets;
	//refOffsets.resize(referenceGroups.size());
	//for ( unsigned int i = 0; i < referenceGroups.size(); i++ ) {
	//	unsigned int startRef = referenceGroups[i].first;
	//	refOffsets[i] = referenceSequences[startRef].Begin;
	//}

	CConsole::Heading();
	cout << endl << "Sorting alignment archive:" << endl;
	CConsole::Reset();
	SortThread sThread ( outputFilenames, temporaryFiles, nThread, nReads, mSettings.MedianFragmentLength );
	sThread.Start();

	CConsole::Heading();
	cout << "Merging alignment archive:" << endl;
	CConsole::Reset();

        unsigned int readNo        = 0;
	unsigned int nMaxAlignment = 1000;
        CProgressBar<unsigned int>::StartThread(&readNo, 0, nReads, "reads");
        CArchiveMerge merger( temporaryFiles, mSettings.OutputReadArchiveFilename, nMaxAlignment, &readNo );
        merger.Merge();
        CProgressBar<unsigned int>::WaitThread();

	for ( unsigned int i = 0; i < outputFilenames.size(); i++ )
		rm(outputFilenames[i].c_str());
	
	for ( unsigned int i = 0; i < temporaryFiles.size(); i++ )
		rm(temporaryFiles[i].c_str());

	// get statistics information
	CArchiveMerge::StatisticsCounters mergeCounters;
	merger.GetStatisticsCounters( mergeCounters );

	mStatisticsCounters.AlignedReads       = mergeCounters.AlignedReads;
	mStatisticsCounters.BothNonUniqueReads = mergeCounters.BothNonUniqueReads;
	mStatisticsCounters.BothUniqueReads    = mergeCounters.BothUniqueReads;
	mStatisticsCounters.OneNonUniqueReads  = mergeCounters.OneNonUniqueReads;
	mStatisticsCounters.OrphanedReads      = mergeCounters.OrphanedReads;
	mStatisticsCounters.FilteredOutMates   = mergeCounters.FilteredOutMates;
	mStatisticsCounters.NonUniqueMates     = mergeCounters.NonUniqueMates;
	mStatisticsCounters.UniqueMates        = mergeCounters.UniqueMates;

}