Ejemplo n.º 1
0
// Soft clip the record from the front and/or the back.
SamFilter::FilterStatus SamFilter::softClip(SamRecord& record, 
                                            int32_t numFrontClips,
                                            int32_t numBackClips)
{
    //////////////////////////////////////////////////////////
    Cigar* cigar = record.getCigarInfo();
    FilterStatus status = NONE;
    int32_t startPos = record.get0BasedPosition();
    CigarRoller updatedCigar;

    status = softClip(*cigar, numFrontClips, numBackClips, 
                      startPos, updatedCigar);

    if(status == FILTERED)
    {
        /////////////////////////////
        // The entire read is clipped, so rather than clipping it,
        // filter it out.
        filterRead(record);
        return(FILTERED);
    }
    else if(status == CLIPPED)
    {
        // Part of the read was clipped, and now that we have
        // an updated cigar, update the read.
        record.setCigar(updatedCigar);

        // Update the starting position.
        record.set0BasedPosition(startPos);
    }
    return(status);
}
Ejemplo n.º 2
0
bool SamFile::checkRecordInSection(SamRecord& record)
{
    bool recordFound = true;
    if(myRefID == BamIndex::REF_ID_ALL)
    {
        return(true);
    }
    // Check to see if it is in the correct reference/position.
    if(record.getReferenceID() != myRefID)
    {
        // Incorrect reference ID, return no more records.
        myStatus = SamStatus::NO_MORE_RECS;
        return(false);
    }
   
    // Found a record.
    recordFound = true;

    // If start/end position are set, verify that the alignment falls
    // within those.
    // If the alignment start is greater than the end of the region,
    // return NO_MORE_RECS.
    // Since myEndPos is Exclusive 0-based, anything >= myEndPos is outside
    // of the region.
    if((myEndPos != -1) && (record.get0BasedPosition() >= myEndPos))
    {
        myStatus = SamStatus::NO_MORE_RECS;
        return(false);
    }
        
    // We know the start is less than the end position, so the alignment
    // overlaps the region if the alignment end position is greater than the
    // start of the region.
    if((myStartPos != -1) && (record.get0BasedAlignmentEnd() < myStartPos))
    {
        // If it does not overlap the region, so go to the next
        // record...set recordFound back to false.
        recordFound = false;
    }

    if(!myOverlapSection)
    {
        // Needs to be fully contained.  Not fully contained if
        // 1) the record start position is < the region start position.
        // or
        // 2) the end position is specified and the record end position
        //    is greater than or equal to the region end position.
        //    (equal to since the region is exclusive.
        if((record.get0BasedPosition() < myStartPos) ||
           ((myEndPos != -1) && 
            (record.get0BasedAlignmentEnd() >= myEndPos)))
        {
            // This record is not fully contained, so move on to the next
            // record.
            recordFound = false;
        }
    }

    return(recordFound);
}
Ejemplo n.º 3
0
void Demux::match(){
	BamReader bamReader(bamFilePath);
	SamRecord samRecord;
	while (	bamReader.getNextRecord(samRecord)) {
		string recordName(samRecord.getReadName());
		recordName = cHandler.decrypt(recordName);
		//clean record name
		int len=recordName.find("$");
		recordName=recordName.substr(0,len);
		//clean ended
		if (!isDecoy(recordName)){
			string outputFile = generateFileName(recordName);
			printf("%s\n",outputFile.c_str());
			if (writers.find(outputFile) == writers.end()) //if the BamWriter is not initialized
				writers[outputFile] = new BamWriter(outputFile, bamReader.getHeader());
			BamWriter *writer = writers[outputFile];
			writer->writeRecord(samRecord);
		}
    }
    for(auto it=writers.begin();it!=writers.end();it++){
    	BamWriter *writer = it->second;
    	writer->close();
    	delete writer;
    }
}
Ejemplo n.º 4
0
void Bam2FastQ::handlePairedRN(SamRecord& samRec)
{
    static SamRecord* prevRec = NULL;
    static std::string prevRN = "";

    if(prevRec == NULL)
    {
        prevRec = &samRec;
    }
    else
    {
        if(strcmp(prevRec->getReadName(), samRec.getReadName()) != 0)
        {
            // Read Name does not match, error, did not find pair.
            std::cerr << "Paired Read, " << prevRec->getReadName()
                      << " but couldn't find mate, so writing as "
                      << "unpaired (single-ended)\n";
            ++myNumMateFailures;
            writeFastQ(*prevRec, myUnpairedFile,
                       myUnpairedFileNameExt);
            // Save this record to check against the next one.
            prevRec = &samRec;
        }
        else
        {
            // Matching ReadNames.
            // Found the mate.
            ++myNumPairs;
            // Check which is the first in the pair.
            if(SamFlag::isFirstFragment(samRec.getFlag()))
            {
                if(SamFlag::isFirstFragment(prevRec->getFlag()))
                {
                    std::cerr << "Both reads of " << samRec.getReadName()
                              << " are first fragment, so "
                              << "splitting one to be in the 2nd fastq.\n";
                }
                writeFastQ(samRec, myFirstFile, myFirstFileNameExt,
                           myFirstRNExt.c_str());
                writeFastQ(*prevRec, mySecondFile, mySecondFileNameExt,
                           mySecondRNExt.c_str());
            }
            else
            {
                if(!SamFlag::isFirstFragment(prevRec->getFlag()))
                {
                    std::cerr << "Neither read of " << samRec.getReadName()
                              << " are first fragment, so "
                              << "splitting one to be in the 2nd fastq.\n";
                }
                writeFastQ(*prevRec, myFirstFile, myFirstFileNameExt, myFirstRNExt.c_str());
                writeFastQ(samRec, mySecondFile, mySecondFileNameExt, mySecondRNExt.c_str());
            }
            // No previous record.
            prevRec = NULL;
        }
    }
}
Ejemplo n.º 5
0
// make a 64-bit genomic coordinate [24bit-chr][32bit-pos][8bit-orientation]
uint64_t getGenomicCoordinate(SamRecord& r) {
  // 64bit string consisting of
  // 24bit refID, 32bit pos, 8bit orientation
  if ( ( r.getReferenceID() < 0 ) || ( r.get0BasedPosition() < 0 ) ) {
    return UNMAPPED_GENOMIC_COORDINATE;
  }
  else {
    return ( ( static_cast<uint64_t>(r.getReferenceID()) << 40 ) | ( static_cast<uint64_t>(r.get0BasedPosition()) << 8 ) | static_cast<uint64_t>( r.getFlag() & 0x0010 ) );
  }
}
Ejemplo n.º 6
0
// Returns the number of bases in the passed in read that overlap the
// region that is currently set.
uint32_t SamFile::GetNumOverlaps(SamRecord& samRecord)
{
    if(myRefPtr != NULL)
    {
        samRecord.setReference(myRefPtr);
    }
    samRecord.setSequenceTranslation(myReadTranslation);

    // Get the overlaps in the sam record for the region currently set
    // for this file.
    return(samRecord.getNumOverlaps(myStartPos, myEndPos));
}
Ejemplo n.º 7
0
void BamInterface::readRecord(IFILE filePtr, SamFileHeader& header,
                              SamRecord& record,
                              SamStatus& samStatus)
{
    // TODO - need to validate there are @SQ lines in both sam/bam - MAYBE!

    // SetBufferFromFile will reset the record prior to reading a new one.
    if(record.setBufferFromFile(filePtr, header) != SamStatus::SUCCESS)
    {
        // Failed, so add the error message.
        samStatus.addError(record.getStatus());
    }
}
Ejemplo n.º 8
0
void ClipOverlap::cleanupMateMap(MateMapByCoord& mateMap,
                                 SamCoordOutput* outputBufferPtr,
                                 int32_t chrom, int32_t position)
{
    // Cleanup any reads in the mateMap whose mates are prior to the position
    // currently being processed in the file.  It means the mate was not found 
    // as expected.  Stop cleaning up once one is found that is not passed.
    uint64_t chromPos = 0;
    if((chrom != -1) && (position != -1))

    {
        chromPos = SamHelper::combineChromPos(chrom, position);
    }
    else
    {
        chrom = -1;
    }
    
    // Stop after the first read is found whose mate has not yet been reached.
    SamRecord* firstRec = mateMap.first();
    while(firstRec != NULL)
    {
        uint64_t firstMateChromPos = 
            SamHelper::combineChromPos(firstRec->getMateReferenceID(),
                                       firstRec->get0BasedMatePosition());
        if((firstMateChromPos < chromPos) || (chrom == -1))
        {
            // Already past the mate's position, so note this read and
            // write it.
            ++myNumMateFailures;
            if((outputBufferPtr != NULL) && !myOverlapsOnly)
            {
                outputBufferPtr->add(firstRec);
            }
            else
            {
                myPool.releaseRecord(firstRec);
            }
            // Remove this record.
            mateMap.popFirst();
            // Get the next record to check.
            firstRec = mateMap.first();
        }
        else
        {
            // The first record's mate position has not yet been passed, so
            // stop cleaning up the buffer.
            break;
        }
    }
}
Ejemplo n.º 9
0
// Finds the total base quality of a read
int Dedup_LowMem::getBaseQuality(SamRecord & record) {
    const char* baseQualities = record.getQuality();
    int readLength = record.getReadLength();
    int quality = 0.;
    if(strcmp(baseQualities, "*") == 0)
    {
        return(0);
    }
    for(int i=0; i < readLength; ++i) {
        int q = static_cast<int>(baseQualities[i])-33;
        if ( q >= myMinQual ) quality += q;
    }
    return quality;
}
Ejemplo n.º 10
0
void SamFilter::filterRead(SamRecord& record)
{
    // Filter the read by marking it as unmapped.
    uint16_t flag = record.getFlag(); 
    SamFlag::setUnmapped(flag);
    // Clear N/A flags.
    flag &= ~SamFlag::PROPER_PAIR;
    flag &= ~SamFlag::SECONDARY_ALIGNMENT;
    flag &= ~SamFlag::SUPPLEMENTARY_ALIGNMENT;
    record.setFlag(flag);
    // Clear Cigar
    record.setCigar("*");
    // Clear mapping quality
    record.setMapQuality(0);
}
Ejemplo n.º 11
0
// determine whether the record's position is different from the previous record
bool Dedup_LowMem::hasPositionChanged(SamRecord& record)
{
    if (lastReference != record.getReferenceID() ||
            lastCoordinate < record.get0BasedPosition())
    {
        if (lastReference != record.getReferenceID())
        {
            lastReference = record.getReferenceID();
            Logger::gLogger->writeLog("Reading ReferenceID %d\n", lastReference);
        }
        lastCoordinate = record.get0BasedPosition();
        return true;
    }
    return false;
}
Ejemplo n.º 12
0
void Bam2FastQ::writeFastQ(SamRecord& samRec, IFILE filePtr,
                             const char* readNameExt)
{
    static int16_t flag;
    static std::string sequence;
    static String quality;

    if(filePtr == NULL)
    {
        return;
    }

    flag = samRec.getFlag();
    const char* readName = samRec.getReadName();
    sequence = samRec.getSequence();
    quality = samRec.getQuality();
    
    if(SamFlag::isReverse(flag) && myReverseComp)
    {
        // It is reverse, so reverse compliment the sequence
        BaseUtilities::reverseComplement(sequence);
        // Reverse the quality.
        quality.Reverse();
    }
    else
    {
        // Ensure it is all capitalized.
        int seqLen = sequence.size();
        for (int i = 0; i < seqLen; i++)
        {
            sequence[i] = (char)toupper(sequence[i]);
        }
    }
    
    if(myRNPlus)
    {

        ifprintf(filePtr, "@%s%s\n%s\n+%s%s\n%s\n", readName, readNameExt,
                 sequence.c_str(), readName, readNameExt, quality.c_str());
    }
    else
    {
        ifprintf(filePtr, "@%s%s\n%s\n+\n%s\n", readName, readNameExt,
                 sequence.c_str(), quality.c_str());
    }
    // Release the record.
    myPool.releaseRecord(&samRec);
}
Ejemplo n.º 13
0
void testRead(SamFile &inSam)
{
    // Read the SAM Header.
    SamFileHeader samHeader;
    assert(inSam.ReadHeader(samHeader));

    validateHeader(samHeader);

    testCopyHeader(samHeader);    

    testModHeader(samHeader);

    SamRecord samRecord;
    assert(inSam.ReadRecord(samHeader, samRecord) == true);
    validateRead1(samRecord);

    // Set a new quality and get the buffer.
    samRecord.setQuality("ABCDE");
    validateRead1ModQuality(samRecord);
    //   void* buffer = samRecord.getRecordBuffer();

    assert(inSam.ReadRecord(samHeader, samRecord) == true);
    validateRead2(samRecord);

    assert(inSam.ReadRecord(samHeader, samRecord) == true);
    validateRead3(samRecord);

    assert(inSam.ReadRecord(samHeader, samRecord) == true);
    validateRead4(samRecord);

    assert(inSam.ReadRecord(samHeader, samRecord) == true);
    validateRead5(samRecord);

    assert(inSam.ReadRecord(samHeader, samRecord) == true);
    validateRead6(samRecord);

    assert(inSam.ReadRecord(samHeader, samRecord) == true);
    validateRead7(samRecord);

    assert(inSam.ReadRecord(samHeader, samRecord) == true);
    validateRead8(samRecord);

    assert(inSam.ReadRecord(samHeader, samRecord) == true);
    validateRead9(samRecord);

    assert(inSam.ReadRecord(samHeader, samRecord) == true);
    validateRead10(samRecord);
}
Ejemplo n.º 14
0
// NOTE: Only positions where the reference and read both have bases that
//       are different and not 'N' are considered mismatches.
uint32_t SamFilter::sumMismatchQuality(SamRecord& record, 
                                       GenomeSequence& refSequence,
                                       uint8_t defaultQualityInt)
{
    // Track the mismatch info.
    int mismatchQual = 0;
    int numMismatch = 0;

    SamQuerySeqWithRefIter sequenceIter(record, refSequence);

    SamSingleBaseMatchInfo baseMatchInfo;
    while(sequenceIter.getNextMatchMismatch(baseMatchInfo))
    {
        if(baseMatchInfo.getType() == SamSingleBaseMatchInfo::MISMATCH)
        {
            // Got a mismatch, get the associated quality.
            char readQualityChar = 
                record.getQuality(baseMatchInfo.getQueryIndex());
            uint8_t readQualityInt = 
                BaseUtilities::getPhredBaseQuality(readQualityChar);
            
            if(readQualityInt == BaseUtilities::UNKNOWN_QUALITY_INT)
            {
                // Quality was not specified, so use the configured setting.
                readQualityInt = defaultQualityInt;
            }
            mismatchQual += readQualityInt;
            ++numMismatch;
        }
    }

    return(mismatchQual);
}
Ejemplo n.º 15
0
SamStatus::Status BamInterface::writeRecord(IFILE filePtr,
        SamFileHeader& header,
        SamRecord& record,
        SamRecord::SequenceTranslation translation)
{
    // Write the file, returning the status.
    return(record.writeRecordBuffer(filePtr, translation));
}
Ejemplo n.º 16
0
void testAddHeaderAndTagToFile(const char* inputName, const char* outputName)
{
    SamFile inSam, outSam;
    assert(inSam.OpenForRead(inputName));
    assert(outSam.OpenForWrite(outputName));

    // Read the SAM Header.
    SamFileHeader samHeader;
    assert(inSam.ReadHeader(samHeader));

    // Add a header line.
    assert(samHeader.addHeaderLine("@RG\tID:myID\tSM:mySM") == false);
    assert(samHeader.addHeaderLine("@RG\tID:myID3\tSM:mySM") == true);

    // Write Header
    assert(outSam.WriteHeader(samHeader));

    SamRecord samRecord;
    assert(inSam.ReadRecord(samHeader, samRecord));
    //   validateRead1(samRecord);
    // Add two tags.
    assert(samRecord.addIntTag("XA", 123));
    assert(samRecord.addIntTag("XA", 456));
    assert(samRecord.addTag("RR", 'Z', "myID1"));
    assert(samRecord.addTag("RR", 'Z', "myID2"));

    // Write as Sam.
    assert(outSam.WriteRecord(samHeader, samRecord));

    // TODO, add test to verify it was written correctly.

    // Read a couple of records to make sure it properly can read them even
    // if they are bigger than the original.
    assert(inSam.ReadRecord(samHeader, samRecord));
    assert(inSam.ReadRecord(samHeader, samRecord));

    //  Check the MD tag, which requires the reference.
    GenomeSequence reference("testFiles/chr1_partial.fa");
    assert(SamTags::isMDTagCorrect(samRecord, reference) == false);
    String newMDTag;
    SamTags::createMDTag(newMDTag, samRecord, reference);
    assert(newMDTag == "2T1N0");
    assert(SamTags::updateMDTag(samRecord, reference));
    // Write as Sam.
    assert(outSam.WriteRecord(samHeader, samRecord));
}
Ejemplo n.º 17
0
bool ClipOverlap::flushOutputBuffer(MateMapByCoord& mateMap,
                                    SamCoordOutput& outputBuffer,
                                    int32_t prevChrom,
                                    int32_t prevPos)
{
    // We will flush the output buffer up to the first record left in the
    // mateMap.  If there are no records left in the mate map, then we
    // flush everything up to the previous chrom/pos that was processed since
    // any new records will have a higher coordinate.
    SamRecord* firstRec = mateMap.first();
    if(firstRec != NULL)
    {
        return(outputBuffer.flush(firstRec->getReferenceID(), 
                                  firstRec->get0BasedPosition()));
    }
    // Otherwise, flush based on the previous 
    return(outputBuffer.flush(prevChrom, prevPos));
}
Ejemplo n.º 18
0
// given mapping start position
// if anchor on left
//		expected_breakp = position + avr_ins_size / 2;
// if anchor on right
//		expected_breakp = position + read_length - avr_ins_size / 2;
// key = round( expected_breakp / WIN )
int Sites::getEstimatedBreakPoint( SamRecord & rec )
{
	int ep;
	int clen = GetMaxClipLen(rec);

	if ( !rec.getFlag() & 0x10 ) { // left anchor
		if (clen < -MIN_CLIP/2) // end clip of anchor decides position
			ep = rec.get1BasedAlignmentEnd();
		else
			ep = rec.get1BasedPosition() + avr_ins_size / 3;
	}
	else { // right anchor
		if (clen > MIN_CLIP/2)
			ep = rec.get1BasedPosition();
		else
			ep = rec.get1BasedPosition() + avr_read_length - avr_ins_size / 3;
	}
	return ep;
}
Ejemplo n.º 19
0
bool Revert::updateQual(SamRecord& samRecord)
{
        // Get the OQ tag, which is a string.
    const String* oldQual = samRecord.getStringTag(SamTags::ORIG_QUAL_TAG);

    bool status = true;
    if(oldQual != NULL)
    {
        // The old quality was found, so set it in the record.
        status &= samRecord.setQuality((*oldQual).c_str());

        if(!myKeepTags)
        {
            // Remove the tag.
            samRecord.rmTag(SamTags::ORIG_QUAL_TAG, SamTags::ORIG_QUAL_TAG_TYPE);
        }
    }
    return(status);
}
Ejemplo n.º 20
0
int getMaxClipLen( SamRecord & sam_rec )
{
	Cigar * myCigar = sam_rec.getCigarInfo();
	int begin_clip = myCigar->getNumBeginClips();
	int end_clip = myCigar->getNumEndClips();
	if (begin_clip >= end_clip)
		return begin_clip;
	else
		return -end_clip;
}
Ejemplo n.º 21
0
bool Revert::updateCigar(SamRecord& samRecord)
{
    // Get the OC tag, which is a string.
    const String* oldCigar = samRecord.getStringTag(SamTags::ORIG_CIGAR_TAG);
    // Get the OP tag, which is an integer.
    int* oldPos = samRecord.getIntegerTag(SamTags::ORIG_POS_TAG);

    bool status = true;
    if(oldCigar != NULL)
    {
        // The old cigar was found, so set it in the record.
        status &= samRecord.setCigar((*oldCigar).c_str());

        if(!myKeepTags)
        {
            // Remove the tag.
            status &= samRecord.rmTag(SamTags::ORIG_CIGAR_TAG, SamTags::ORIG_CIGAR_TAG_TYPE);
        }
    }
    if(oldPos != NULL)
    {
        // The old position was found, so set it in the record.
        status &= samRecord.set1BasedPosition(*oldPos);

        if(!myKeepTags)
        {
            // Remove the tag.
            status &= samRecord.rmTag(SamTags::ORIG_POS_TAG, SamTags::ORIG_POS_TAG_TYPE);
        }
    }

    return(status);
}
Ejemplo n.º 22
0
// get the libraryID of a record
uint32_t Dedup_LowMem::getLibraryID(SamRecord& record, bool checkTags) {
    if ( ( checkTags == false ) && ( numLibraries <= 1 ) ) {
        return 0;
    } else {
        char tag[3];
        char vtype;
        void* value;
        std::string rgID;
        record.resetTagIter();
        while( record.getNextSamTag(tag,vtype,&value) != false ) {
            if ( ( tag[0] == 'R' ) && ( tag[1] == 'G' ) && ( vtype == 'Z' ) ) {
                if ( !rgID.empty() ) {
                    Logger::gLogger->error("Multiple RG tag found in one record. ReadName is %s",record.getReadName());
                }
                else if ( record.isStringType(vtype) ) {
                    String s = (String)*(String*)value;
                    rgID = s.c_str();
                }
                else {
                    Logger::gLogger->error("vtype is not string (Z) for RG tag");
                }
            }
        }
        if ( rgID.empty() ) {
            Logger::gLogger->error("No RG tag is found in read %s",record.getReadName());
            return 0;
        }
        else {
            std::map<std::string,uint32_t>::iterator it = rgidLibMap.find(rgID);
            if ( it != rgidLibMap.end() ) {
                return it->second;
            }
            else {
                Logger::gLogger->warning("RG tag %s does not exist in the header",rgID.c_str());
                return 0; // cannot be reached
            }
        }
    }
}
Ejemplo n.º 23
0
void GenomeRegionSeqStats::CalcClusters(String &bamFile, int minMapQuality)
{
  SamFile sam;
  SamRecord samRecord;
  SamFileHeader samHeader;

  if(!sam.OpenForRead(bamFile.c_str()))
    error("Open BAM file %s failed!\n", bamFile.c_str());

  if(!sam.ReadHeader(samHeader)) {
      error("Read BAM file header %s failed!\n", bamFile.c_str());
  }
  
  if(depth.size()==0) depth.resize(referencegenome.sequenceLength());
  
  String contigLabel;
  uint32_t start;
  uint32_t gstart;
  Reset();
  while(sam.ReadRecord(samHeader, samRecord))
    {
      nReads++;
      if(samRecord.getFlag() & SamFlag::UNMAPPED) nUnMapped++;

      if(samRecord.getMapQuality() < minMapQuality) continue;

      CigarRoller cigar(samRecord.getCigar());

      int nonClipSequence = 0;

      if(cigar.size()!=0 && cigar[0].operation==Cigar::softClip)
          nonClipSequence = cigar[0].count;

      contigLabel = samRecord.getReferenceName();
      start = nonClipSequence + samRecord.get0BasedPosition();  // start is 0-based

      gstart = referencegenome.getGenomePosition(contigLabel.c_str(), start);

      if(IsInRegions(contigLabel, start, start+samRecord.getReadLength())) continue;

      for(uint32_t i=gstart; i<gstart+samRecord.getReadLength(); i++)
       if(depth[i]<MAXDP)
        depth[i]++;
      nMappedOutTargets++;
    }
}
Ejemplo n.º 24
0
// Write a record to the currently opened file.
bool SamFile::WriteRecord(SamFileHeader& header, 
                          SamRecord& record)
{
    if(myIsOpenForWrite == false)
    {
        // File is not open for writing
        myStatus.setStatus(SamStatus::FAIL_ORDER, 
                           "Cannot write record since the file is not open for writing");
        return(false);
    }

    if(myHasHeader == false)
    {
        // The header has not yet been written.
        myStatus.setStatus(SamStatus::FAIL_ORDER, 
                           "Cannot write record since the header has not been written");
        return(false);
    }

    // Before trying to write the record, validate the sort order.
    if(!validateSortOrder(record, header))
    {
        // Not sorted like it is supposed to be, do not write the record
        myStatus.setStatus(SamStatus::INVALID_SORT, 
                           "Cannot write the record since the file is not properly sorted.");
        return(false);
    }

    if(myRefPtr != NULL)
    {
        record.setReference(myRefPtr);
    }

    // File is open for writing and the header has been written, so write the
    // record.
    myStatus = myInterfacePtr->writeRecord(myFilePtr, header, record,
                                           myWriteTranslation);

    if(myStatus == SamStatus::SUCCESS)
    {
        // A record was successfully written, so increment the record count.
        myRecordCount++;
        return(true);
    }
    return(false);
}
Ejemplo n.º 25
0
bool Demux::verify(){ //at least one decoy should be present
	BamReader bamReader(bamFilePath);
	SamRecord samRecord;
	int verifiedDecoys = 0;
	while (	bamReader.getNextRecord(samRecord)) {
		string recordName(samRecord.getReadName());
		recordName = cHandler.decrypt(recordName);
		//clean record name
		int len=recordName.find("$");
		recordName=recordName.substr(0,len);
		//clean ended
		if (isDecoy(recordName)){
			char decoyChromosome[2][20];
			int decoyLocation[2];
			int decoyJobID;

			sscanf(	recordName.c_str(),"DECOY.%[^.].%d_%[^.].%d.%d",
					decoyChromosome[0], decoyLocation, 
					decoyChromosome[1], decoyLocation + 1,  &decoyJobID);

			int pair = 	!SamFlag::isFirstFragment(samRecord.getFlag());
			if (strcmp(decoyChromosome[pair],samRecord.getReferenceName())){
				cout << "Chromosome mismatch\n";
				cout << "Expected: " << decoyChromosome[pair] << endl;
				cout << "Got: " << samRecord.getReferenceName() << endl;
				return false;
			}
			if ( decoyLocation[pair] != samRecord.get0BasedPosition()){
				cout << "Position mismatch\n";
				cout << "Expected: " << decoyLocation[pair] << endl;
				cout << "Got: " << samRecord.get0BasedPosition() << endl;
				return false;
			}
			if (decoyJobID != jobID){
				cout << "Job ID mismatch\n";
				cout << "Expected: " << decoyJobID << endl;
				cout << "Got: " << jobID << endl;
				return false;
			}
			verifiedDecoys++;
		}
    }
    if (verifiedDecoys)
		return true;
	else
		return false;
}
Ejemplo n.º 26
0
void Sites::addToCurrentCluster( vector<bool> & is_in_coord, SingleSite & new_site, SamRecord & rec )
{
	if (is_in_coord.size() != NMEI)
		morphError("[Sites::setNewCluster] is_in_coord size error");

	// update breakpoint
	int old_evi = new_site.evidence;
	float a1 = (float)1 / float(old_evi+1);
	int ep = getEstimatedBreakPoint(rec);
	new_site.breakp = round( a1 * (float)ep + (float)new_site.breakp * (1-a1));
	new_site.evidence++; 

	// update position
	if (rec.get1BasedPosition() < new_site.start)
		new_site.start = rec.get1BasedPosition();
	else if (rec.get1BasedAlignmentEnd() > new_site.end)
		new_site.end = rec.get1BasedAlignmentEnd();

	// update info
	if (rec.getFlag() & 0x10) {
		if (new_site.right_clip_only) {
			Cigar * myCigar = rec.getCigarInfo();
			int begin_clip = myCigar->getNumBeginClips();
			if ( begin_clip < MIN_CLIP/2)
				new_site.right_clip_only = 0;	
		}
		for(int m=0; m<NMEI; m++) {
			if (is_in_coord[m])
				new_site.right[m]++;
		}	
	}
	else {
		if (new_site.left_clip_only) {
			Cigar * myCigar = rec.getCigarInfo();
			int end_clip = myCigar->getNumEndClips();
			if (end_clip < MIN_CLIP/2)
				new_site.left_clip_only = 0;			
		}
		for( int m=0; m<NMEI; m++) {
			if (is_in_coord[m])
				new_site.left[m]++;
		}
	}
}
Ejemplo n.º 27
0
void Sites::setNewCluster( vector<bool> & is_in_coord, SingleSite & new_site, SamRecord & rec )
{
	if (is_in_coord.size() != NMEI)
		morphError("[Sites::setNewCluster] is_in_coord size error");

	// set info
	new_site.breakp = getEstimatedBreakPoint(rec);
	new_site.rcount = 1;
	new_site.evidence = 1;
	for(int m=0; m<NMEI; m++) {
		new_site.left[m] = 0;
		new_site.right[m] = 0;
	}
	new_site.left_clip_only = 1;
	new_site.right_clip_only = 1;
	new_site.depth = current_depth;
	new_site.depth_add = 1;

	// set position & mtype
	if ( rec.getFlag() & 0x10 )  { // right anchor
		new_site.start = rec.get1BasedPosition();
		new_site.end = rec.get1BasedAlignmentEnd();
		Cigar * myCigar = rec.getCigarInfo();
		int begin_clip = myCigar->getNumBeginClips();
		if ( begin_clip < MIN_CLIP/2)
			new_site.right_clip_only = 0;
		for(int m=0; m<NMEI; m++) {
			if (is_in_coord[m])
				new_site.right[m] = 1;
		}
	}
	else {
		new_site.start = rec.get1BasedPosition();
		new_site.end = rec.get1BasedAlignmentEnd();
		Cigar * myCigar = rec.getCigarInfo();
		int end_clip = myCigar->getNumEndClips();
		if (end_clip < MIN_CLIP/2)
			new_site.left_clip_only = 0;
		for(int m=0; m<NMEI; m++) {
			if (is_in_coord[m])
				new_site.left[m] = 1;
		}
	}	
}
Ejemplo n.º 28
0
int Likelihood::getFlankingCount( string & chr, int pos )
{
	bool status = bam.SetReadSection( chr.c_str(), pos - avr_read_length * 2, pos + avr_read_length/2 );
	if (!status)
		return 0;

	int n = 0;
	SamRecord rec;
	while(bam.ReadRecord(bam_header, rec)) {
		if ( !(rec.getFlag() & 0x2) )
			continue;
		if (IsSupplementary(rec.getFlag()))
			continue;
		if (rec.getReadLength() < avr_read_length / 3)
			continue;
		if (rec.getMapQuality() < MIN_QUALITY)
			continue;
		if (rec.get1BasedPosition() + MIN_CLIP/2 <= pos && rec.get1BasedAlignmentEnd() - MIN_CLIP/2 >= pos )
			n++;
	}

	return n;
}
Ejemplo n.º 29
0
void GenomeRegionSeqStats::CalcRegionStats(String &bamFile)
{
  SamFile sam;
  SamRecord samRecord;
  SamFileHeader samHeader;

  if(!sam.OpenForRead(bamFile.c_str()))
    error("Open BAM file %s failed!\n", bamFile.c_str());

  if(!sam.ReadHeader(samHeader)) {
      error("Read BAM file header %s failed!\n", bamFile.c_str());
  }
  
  String contigLabel;
  int start, end;
  Reset();
  while(sam.ReadRecord(samHeader, samRecord))
    {
      nReads++;
      if(samRecord.getFlag() & SamFlag::UNMAPPED) nUnMapped++;

      if(contigFinishedCnt>=contigs.size()) continue;

      CigarRoller cigar(samRecord.getCigar());

      int nonClipSequence = 0;

      if(cigar.size()!=0 && cigar[0].operation==Cigar::softClip)
          nonClipSequence = cigar[0].count;

      contigLabel = samRecord.getReferenceName();
      start = nonClipSequence + samRecord.get0BasedPosition();  // start is 0-based
      end = start + samRecord.getReadLength() - 1;
      if(UpdateRegionStats(contigLabel, start, end)) nMapped2Targets++;
    }
    CalcRegionReadCountInGCBins();
    CalcGroupReadCountInGCBins();
    std::cout << "Total reads : " << nReads << std::endl;
}
Ejemplo n.º 30
0
int Dedup_LowMem::execute(int argc, char** argv)
{
    /* --------------------------------
     * process the arguments
     * -------------------------------*/
    String inFile, outFile, logFile;
    myDoRecab = false;
    bool removeFlag = false;
    bool verboseFlag = false;
    myForceFlag = false;
    myNumMissingMate = 0;
    myMinQual = DEFAULT_MIN_QUAL;
    String excludeFlags = "0xB04";
    uint16_t intExcludeFlags = 0;
    bool noeof = false;
    bool params = false;

    LongParamContainer parameters;
    parameters.addGroup("Required Parameters");
    parameters.addString("in", &inFile);
    parameters.addString("out", &outFile);
    parameters.addGroup("Optional Parameters");
    parameters.addInt("minQual", & myMinQual);
    parameters.addString("log", &logFile);
    parameters.addBool("oneChrom", &myOneChrom);
    parameters.addBool("recab", &myDoRecab);
    parameters.addBool("rmDups", &removeFlag);
    parameters.addBool("force", &myForceFlag);
    parameters.addString("excludeFlags", &excludeFlags);
    parameters.addBool("verbose", &verboseFlag);
    parameters.addBool("noeof", &noeof);
    parameters.addBool("params", &params);
    parameters.addPhoneHome(VERSION);
    myRecab.addRecabSpecificParameters(parameters);

    ParameterList inputParameters;
    inputParameters.Add(new LongParameters ("Input Parameters",
                                            parameters.getLongParameterList()));

    // parameters start at index 2 rather than 1.
    inputParameters.Read(argc, argv, 2);

    // If no eof block is required for a bgzf file, set the bgzf file type to
    // not look for it.
    if(noeof)
    {
        // Set that the eof block is not required.
        BgzfFileType::setRequireEofBlock(false);
    }

    if(inFile.IsEmpty())
    {
        printUsage(std::cerr);
        inputParameters.Status();
        std::cerr << "Specify an input file" << std::endl;
        return EXIT_FAILURE;
    }

    if(outFile.IsEmpty())
    {
        printUsage(std::cerr);
        inputParameters.Status();
        std::cerr << "Specify an output file" << std::endl;
        return EXIT_FAILURE;
    }

    intExcludeFlags = excludeFlags.AsInteger();

    if(myForceFlag && SamFlag::isDuplicate(intExcludeFlags))
    {
        printUsage(std::cerr);
        inputParameters.Status();
        std::cerr << "Cannot specify --force and Duplicate in the excludeFlags.  Since --force indicates to override"
                  << " previous duplicate setting and the excludeFlags says to skip those, you can't do both.\n";
        return EXIT_FAILURE;
    }

    if(!SamFlag::isSecondary(intExcludeFlags))
    {
        printUsage(std::cerr);
        inputParameters.Status();
        std::cerr << "ERROR: Secondary reads must be excluded, edit --excludeFlags to include 0x0100\n";
        return EXIT_FAILURE;
    }

    if(!(intExcludeFlags & SamFlag::SUPPLEMENTARY_ALIGNMENT))
    {
        printUsage(std::cerr);
        inputParameters.Status();
        std::cerr << "ERROR: Supplementary reads must be excluded, edit --excludeFlags to include 0x0800\n";
        return EXIT_FAILURE;
    }

    if(logFile.IsEmpty())
    {
        logFile = outFile + ".log";
    }

    if(myDoRecab)
    {
        int status = myRecab.processRecabParam();
        if(status != 0)
        {
            inputParameters.Status();
            return(status);
        }
    }

    if(params)
    {
        inputParameters.Status();
    }

    Logger::gLogger = new Logger(logFile.c_str(), verboseFlag);

    /* -------------------------------------------------------------------
     * The arguments are processed.  Prepare the input BAM file,
     * instantiate dedup_LowMem, and construct the read group library map
     * ------------------------------------------------------------------*/

    SamFile samIn;

    samIn.OpenForRead(inFile.c_str());
    // If the file isn't sorted it will throw an exception.
    samIn.setSortedValidation(SamFile::COORDINATE);

    SamFileHeader header;
    samIn.ReadHeader(header);

    buildReadGroupLibraryMap(header);

    lastReference = -1;
    lastCoordinate = -1;

    // for keeping some basic statistics
    uint32_t recordCount = 0;
    uint32_t pairedCount = 0;
    uint32_t properPairCount = 0;
    uint32_t unmappedCount = 0;
    uint32_t reverseCount = 0;
    uint32_t qualCheckFailCount = 0;
    uint32_t secondaryCount = 0;
    uint32_t supplementaryCount = 0;
    uint32_t excludedCount = 0;

    // Now we start reading records
    SamRecord* recordPtr;
    SamStatus::Status returnStatus = SamStatus::SUCCESS;
    while(returnStatus == SamStatus::SUCCESS)
    {
        recordPtr = mySamPool.getRecord();
        if(recordPtr == NULL)
        {
            std::cerr << "Failed to allocate enough records\n";
            return(-1);
        }
        if(!samIn.ReadRecord(header, *recordPtr))
        {
            returnStatus = samIn.GetStatus();
            continue;
        }
        // Take note of properties of this record
        int flag = recordPtr->getFlag();
        if(SamFlag::isPaired(flag))     ++pairedCount;
        if(SamFlag::isProperPair(flag)) ++properPairCount;
        if(SamFlag::isReverse(flag))    ++reverseCount;
        if(SamFlag::isQCFailure(flag))  ++qualCheckFailCount;
        if(SamFlag::isSecondary(flag))  ++secondaryCount;
        if(flag & SamFlag::SUPPLEMENTARY_ALIGNMENT)  ++supplementaryCount;
        if(!SamFlag::isMapped(flag))    ++unmappedCount;

        // put the record in the appropriate maps:
        //   single reads go in myFragmentMap
        //   paired reads go in myPairedMap
        recordCount = samIn.GetCurrentRecordCount();

        // if we have moved to a new position, look back at previous reads for duplicates
        if (hasPositionChanged(*recordPtr))
        {
            cleanupPriorReads(recordPtr);
        }

        // Determine if this read should be checked for duplicates.
        if((!SamFlag::isMapped(flag)) || ((flag & intExcludeFlags) != 0))
        {
            ++excludedCount;

            // No deduping done on this record, but still build the recab table.
            if(myDoRecab)
            {
                myRecab.processReadBuildTable(*recordPtr);
            }
            // Nothing more to do with this record, so
            // release the pointer.
            mySamPool.releaseRecord(recordPtr);
        }
        else
        {
            if(SamFlag::isDuplicate(flag) && !myForceFlag)
            {
                // Error: Marked duplicates, and duplicates aren't excluded.
                Logger::gLogger->error("There are records already duplicate marked.");
                Logger::gLogger->error("Use -f to clear the duplicate flag and start the dedup_LowMem procedure over");
            }

            checkDups(*recordPtr, recordCount);
            mySamPool.releaseRecord(recordPtr);
        }
        // let the user know we're not napping
        if (verboseFlag && (recordCount % 100000 == 0))
        {
            Logger::gLogger->writeLog("recordCount=%u singleKeyMap=%u pairedKeyMap=%u, dictSize=%u",
                                      recordCount, myFragmentMap.size(),
                                      myPairedMap.size(),
                                      myMateMap.size());
        }
    }

    // we're finished reading record so clean up the duplicate search and
    //  close the input file
    cleanupPriorReads(NULL);
    samIn.Close();

    // print some statistics
    Logger::gLogger->writeLog("--------------------------------------------------------------------------");
    Logger::gLogger->writeLog("SUMMARY STATISTICS OF THE READS");
    Logger::gLogger->writeLog("Total number of reads: %u",recordCount);
    Logger::gLogger->writeLog("Total number of paired-end reads: %u",
                              pairedCount);
    Logger::gLogger->writeLog("Total number of properly paired reads: %u",
                              properPairCount);
    Logger::gLogger->writeLog("Total number of unmapped reads: %u",
                              unmappedCount);
    Logger::gLogger->writeLog("Total number of reverse strand mapped reads: %u",
                              reverseCount);
    Logger::gLogger->writeLog("Total number of QC-failed reads: %u",
                              qualCheckFailCount);
    Logger::gLogger->writeLog("Total number of secondary reads: %u",
                              secondaryCount);
    Logger::gLogger->writeLog("Total number of supplementary reads: %u",
                              supplementaryCount);
    Logger::gLogger->writeLog("Size of singleKeyMap (must be zero): %u",
                              myFragmentMap.size());
    Logger::gLogger->writeLog("Size of pairedKeyMap (must be zero): %u",
                              myPairedMap.size());
    Logger::gLogger->writeLog("Total number of missing mates: %u",
                              myNumMissingMate);
    Logger::gLogger->writeLog("Total number of reads excluded from duplicate checking: %u",
                              excludedCount);
    Logger::gLogger->writeLog("--------------------------------------------------------------------------");
    Logger::gLogger->writeLog("Sorting the indices of %d duplicated records",
                              myDupList.size());

    // sort the indices of duplicate records
    std::sort(myDupList.begin(), myDupList.end(),
              std::less<uint32_t> ());

    // get ready to write the output file by making a second pass
    // through the input file
    samIn.OpenForRead(inFile.c_str());
    samIn.ReadHeader(header);

    SamFile samOut;
    samOut.OpenForWrite(outFile.c_str());
    samOut.WriteHeader(header);

    // If we are recalibrating, output the model information.
    if(myDoRecab)
    {
        myRecab.modelFitPrediction(outFile);
    }

    // an iterator to run through the duplicate indices
    int currentDupIndex = 0;
    bool moreDups = !myDupList.empty();

    // let the user know what we're doing
    Logger::gLogger->writeLog("\nWriting %s", outFile.c_str());

    // count the duplicate records as a check
    uint32_t singleDuplicates(0), pairedDuplicates(0);

    // start reading records and writing them out
    SamRecord record;
    while(samIn.ReadRecord(header, record))
    {
        uint32_t currentIndex = samIn.GetCurrentRecordCount();

        bool foundDup = moreDups &&
                        (currentIndex == myDupList[currentDupIndex]);

        // modify the duplicate flag and write out the record,
        // if it's appropriate
        int flag = record.getFlag();
        if (foundDup)
        {
            // this record is a duplicate, so mark it.
            record.setFlag( flag | 0x400 );
            currentDupIndex++;
            // increment duplicate counters to verify we found them all
            if ( ( ( flag & 0x0001 ) == 0 ) || ( flag & 0x0008 ) )
            {   // unpaired or mate unmapped
                singleDuplicates++;
            }
            else
            {
                pairedDuplicates++;
            }
            // recalibrate if necessary.
            if(myDoRecab)
            {
                myRecab.processReadApplyTable(record);
            }

            // write the record if we are not removing duplicates
            if (!removeFlag ) samOut.WriteRecord(header, record);
        }
        else
        {
            if(myForceFlag)
            {
                // this is not a duplicate we've identified but we want to
                // remove any duplicate marking
                record.setFlag( flag & 0xfffffbff ); // unmark duplicate
            }
            // Not a duplicate, so recalibrate if necessary.
            if(myDoRecab)
            {
                myRecab.processReadApplyTable(record);
            }
            samOut.WriteRecord(header, record);
        }

        // Let the user know we're still here
        if (verboseFlag && (currentIndex % 100000 == 0)) {
            Logger::gLogger->writeLog("recordCount=%u", currentIndex);
        }
    }

    // We're done.  Close the files and print triumphant messages.
    samIn.Close();
    samOut.Close();

    Logger::gLogger->writeLog("Successfully %s %u unpaired and %u paired duplicate reads",
                              removeFlag ? "removed" : "marked" ,
                              singleDuplicates,
                              pairedDuplicates/2);
    Logger::gLogger->writeLog("\nDedup_LowMem complete!");
    return 0;
}