Ejemplo n.º 1
0
/**
 * Main work method.  Reads the BAM file once and collects sorted information about
 * the 5' ends of both ends of each read (or just one end in the case of pairs).
 * Then makes a pass through those determining duplicates before re-reading the
 * input file and writing it out with duplication flags set correctly.
 */
int MarkDuplicates::runInternal() {
    
    ogeNameThread("am_MarkDuplicates");

    if(verbose)
        cerr << "Reading input file and constructing read end information." << endl;
    buildSortedReadEndLists();
    
    generateDuplicateIndexes();
    
    if(verbose)
        cerr << "Marking " << numDuplicateIndices << " records as duplicates." << endl;
    
    BamReader in;

    in.Open(getBufferFileName());

    // Now copy over the file while marking all the necessary indexes as duplicates
    long recordInFileIndex = 0;
    
    long written = 0;
    while (true) {
        BamAlignment * prec = in.GetNextAlignment();
        if(!prec)
            break;
        
        if (prec->IsPrimaryAlignment()) {
            if (duplicateIndexes.count(recordInFileIndex) == 1)
                prec->SetIsDuplicate(true);
            else
                prec->SetIsDuplicate(false);
        }
        recordInFileIndex++;
        
        if (removeDuplicates && prec->IsDuplicate()) {
            // do nothing
        }
        else {
            putOutputAlignment(prec);
            if (verbose && read_count && ++written % 100000 == 0) {
                cerr << "\rWritten " << written << " records (" << written * 100 / read_count <<"%)." << std::flush;
            }
        }
    }

    if (verbose && read_count)
        cerr << "\rWritten " << written << " records (" << written * 100 / read_count <<"%)." << endl;

    in.Close();

    remove(getBufferFileName().c_str());
    
    return 0;
}
Ejemplo n.º 2
0
int DataStatisticsTool::Execute()
{
    // iterate over reads in BAM file(s)
    BamAlignment alignObj;
    while(bamReader.GetNextAlignment(alignObj))
    {
        if (alignObj.IsDuplicate()) continue;
        if (alignObj.IsFailedQC()) continue;
        if (!alignObj.IsMapped()) continue;
        if (!alignObj.IsPrimaryAlignment()) continue;
        if (alignObj.IsPaired() && !alignObj.IsProperPair()) continue;
        if (alignObj.IsPaired() && !alignObj.IsMateMapped()) continue;
        if (!alignObj.HasTag("MD")) continue;

//        // debug
//        GenericBamAlignmentTools::printBamAlignmentCigar(alignObj);
//        GenericBamAlignmentTools::printBamAlignmentMD(alignObj);

        // shift InDel
        GenericBamAlignmentTools::leftShiftInDel(alignObj);

//        // debug
//        GenericBamAlignmentTools::printBamAlignmentCigar(alignObj);
//        GenericBamAlignmentTools::printBamAlignmentMD(alignObj);

        // get the alignment sequences
        string alignRead;
        string alignGenome;
        GenericBamAlignmentTools::getAlignmentSequences(alignObj, alignRead, alignGenome);

        // update the statistics
        statistics.update(alignRead, alignGenome);
    }


    // print to screen
    cout << statistics << endl;
//    statistics.printMatchMismatch();

    // close BAM reader
    bamReader.Close();

    // close Fasta
    genomeFasta.Close();

    return 1;
}
Ejemplo n.º 3
0
 bool check(const PropertyFilter& filter, const BamAlignment& al) {
   
     bool keepAlignment = true;
     const PropertyMap& properties = filter.Properties;
     PropertyMap::const_iterator propertyIter = properties.begin();
     PropertyMap::const_iterator propertyEnd  = properties.end();
     for ( ; propertyIter != propertyEnd; ++propertyIter ) {
       
         // check alignment data field depending on propertyName
         const string& propertyName = (*propertyIter).first;
         const PropertyFilterValue& valueFilter = (*propertyIter).second;
         
         if      ( propertyName == ALIGNMENTFLAG_PROPERTY )  keepAlignment &= valueFilter.check(al.AlignmentFlag);
         else if ( propertyName == CIGAR_PROPERTY ) {
             stringstream cigarSs;
             const vector<CigarOp>& cigarData = al.CigarData;
             if ( !cigarData.empty() ) {
                 vector<CigarOp>::const_iterator cigarBegin = cigarData.begin();
                 vector<CigarOp>::const_iterator cigarIter = cigarBegin;
                 vector<CigarOp>::const_iterator cigarEnd  = cigarData.end();
                 for ( ; cigarIter != cigarEnd; ++cigarIter ) {
                     const CigarOp& op = (*cigarIter);
                     cigarSs << op.Length << op.Type;
                 }
                 keepAlignment &= valueFilter.check(cigarSs.str());
             }
         }
         else if ( propertyName == INSERTSIZE_PROPERTY )           keepAlignment &= valueFilter.check(al.InsertSize);
         else if ( propertyName == ISDUPLICATE_PROPERTY )          keepAlignment &= valueFilter.check(al.IsDuplicate());
         else if ( propertyName == ISFAILEDQC_PROPERTY )           keepAlignment &= valueFilter.check(al.IsFailedQC());
         else if ( propertyName == ISFIRSTMATE_PROPERTY )          keepAlignment &= valueFilter.check(al.IsFirstMate());
         else if ( propertyName == ISMAPPED_PROPERTY )             keepAlignment &= valueFilter.check(al.IsMapped());
         else if ( propertyName == ISMATEMAPPED_PROPERTY )         keepAlignment &= valueFilter.check(al.IsMateMapped());
         else if ( propertyName == ISMATEREVERSESTRAND_PROPERTY )  keepAlignment &= valueFilter.check(al.IsMateReverseStrand());
         else if ( propertyName == ISPAIRED_PROPERTY )             keepAlignment &= valueFilter.check(al.IsPaired());
         else if ( propertyName == ISPRIMARYALIGNMENT_PROPERTY )   keepAlignment &= valueFilter.check(al.IsPrimaryAlignment());
         else if ( propertyName == ISPROPERPAIR_PROPERTY )         keepAlignment &= valueFilter.check(al.IsProperPair());
         else if ( propertyName == ISREVERSESTRAND_PROPERTY )      keepAlignment &= valueFilter.check(al.IsReverseStrand());
         else if ( propertyName == ISSECONDMATE_PROPERTY )         keepAlignment &= valueFilter.check(al.IsSecondMate());
         else if ( propertyName == ISSINGLETON_PROPERTY ) {
             const bool isSingleton = al.IsPaired() && al.IsMapped() && !al.IsMateMapped();
             keepAlignment &= valueFilter.check(isSingleton);
         }
         else if ( propertyName == MAPQUALITY_PROPERTY )           keepAlignment &= valueFilter.check(al.MapQuality);
         else if ( propertyName == MATEPOSITION_PROPERTY )         keepAlignment &= ( al.IsPaired() && al.IsMateMapped() && valueFilter.check(al.MateRefID) );
         else if ( propertyName == MATEREFERENCE_PROPERTY ) {
             if ( !al.IsPaired() || !al.IsMateMapped() ) return false;
             BAMTOOLS_ASSERT_MESSAGE( (al.MateRefID>=0 && (al.MateRefID<(int)filterToolReferences.size())), "Invalid MateRefID");
             const string& refName = filterToolReferences.at(al.MateRefID).RefName;
             keepAlignment &= valueFilter.check(refName);
         }
         else if ( propertyName == NAME_PROPERTY )                 keepAlignment &= valueFilter.check(al.Name);
         else if ( propertyName == POSITION_PROPERTY )             keepAlignment &= valueFilter.check(al.Position);
         else if ( propertyName == QUERYBASES_PROPERTY )           keepAlignment &= valueFilter.check(al.QueryBases);
         else if ( propertyName == REFERENCE_PROPERTY ) {
             BAMTOOLS_ASSERT_MESSAGE( (al.RefID>=0 && (al.RefID<(int)filterToolReferences.size())), "Invalid RefID");
             const string& refName = filterToolReferences.at(al.RefID).RefName;
             keepAlignment &= valueFilter.check(refName);
         }
         else if ( propertyName == TAG_PROPERTY ) keepAlignment &= checkAlignmentTag(valueFilter, al);
         else BAMTOOLS_ASSERT_UNREACHABLE;
         
         // if alignment fails at ANY point, just quit and return false
         if ( !keepAlignment ) return false;
     }
   
     BAMTOOLS_ASSERT_MESSAGE( keepAlignment, "Error in BamAlignmentChecker... keepAlignment should be true here");
     return keepAlignment;
 }
Ejemplo n.º 4
0
	virtual void main()
	{
		//init
		QTextStream out(stdout);
		BamReader reader;
		NGSHelper::openBAM(reader, getInfile("in"));

		FastqOutfileStream out1(getOutfile("out1"), false);
		FastqOutfileStream out2(getOutfile("out2"), false);

		long long c_unpaired = 0;
		long long c_paired = 0;
		int max_cached = 0;

		//iterate through reads
		BamAlignment al;
		QHash<QByteArray, BamAlignment> al_cache;
		while (reader.GetNextAlignment(al))
		{
			//skip secondary alinments
			if(!al.IsPrimaryAlignment()) continue;

			//skip unpaired
			if(!al.IsPaired())
			{
				++c_unpaired;
				continue;
			}

			QByteArray name(al.Name.data()); //TODO use QByteArray::fromStdString (when upgraded to Qt5.4)

			//store cached read when we encounter the mate
			if (al_cache.contains(name))
			{
				BamAlignment mate = al_cache.take(name);
				//out << name << " [AL] First: " << al.IsFirstMate() << " Reverse: " << al.IsReverseStrand() << " Seq: " << al.QueryBases.data() << endl;
				//out << name << " [MA] First: " << mate.IsFirstMate() << " Reverse: " << mate.IsReverseStrand() << " Seq: " << mate.QueryBases.data() << endl;
				if (al.IsFirstMate())
				{
					write(out1, al, al.IsReverseStrand());
					write(out2, mate, mate.IsReverseStrand());
				}
				else
				{
					write(out1, mate, mate.IsReverseStrand());
					write(out2, al, al.IsReverseStrand());
				}
				++c_paired;
			}
			//cache read for later retrieval
			else
			{
				al_cache.insert(name, al);
			}

			max_cached = std::max(max_cached, al_cache.size());
		}
		reader.Close();
		out1.close();
		out2.close();

		//write debug output
		out << "Pair reads (written)            : " << c_paired << endl;
		out << "Unpaired reads (skipped)        : " << c_unpaired << endl;
		out << "Unmatched paired reads (skipped): " << al_cache.size() << endl;
		out << endl;
		out << "Maximum cached reads            : " << max_cached << endl;
	}