// Soft clip the record from the front and/or the back. SamFilter::FilterStatus SamFilter::softClip(SamRecord& record, int32_t numFrontClips, int32_t numBackClips) { ////////////////////////////////////////////////////////// Cigar* cigar = record.getCigarInfo(); FilterStatus status = NONE; int32_t startPos = record.get0BasedPosition(); CigarRoller updatedCigar; status = softClip(*cigar, numFrontClips, numBackClips, startPos, updatedCigar); if(status == FILTERED) { ///////////////////////////// // The entire read is clipped, so rather than clipping it, // filter it out. filterRead(record); return(FILTERED); } else if(status == CLIPPED) { // Part of the read was clipped, and now that we have // an updated cigar, update the read. record.setCigar(updatedCigar); // Update the starting position. record.set0BasedPosition(startPos); } return(status); }
// Decide whether a record belongs to the section currently selected on
// this file.
// Returns true if the record should be reported.  Returns false either
// because the record is outside the section (status untouched, caller
// moves to the next record) or because no later record can match, in
// which case myStatus is set to SamStatus::NO_MORE_RECS.
bool SamFile::checkRecordInSection(SamRecord& record)
{
    // Every reference is selected, so every record qualifies.
    if(myRefID == BamIndex::REF_ID_ALL)
    {
        return(true);
    }

    // A different reference ID means there are no more records to return.
    if(record.getReferenceID() != myRefID)
    {
        myStatus = SamStatus::NO_MORE_RECS;
        return(false);
    }

    // myEndPos is exclusive and 0-based: a record starting at or after it
    // lies beyond the region.
    const bool hasEnd = (myEndPos != -1);
    if(hasEnd && (record.get0BasedPosition() >= myEndPos))
    {
        myStatus = SamStatus::NO_MORE_RECS;
        return(false);
    }

    // The start is known to be before the region end, so the record
    // overlaps unless it ends before the region start.
    bool inSection = !((myStartPos != -1) &&
                       (record.get0BasedAlignmentEnd() < myStartPos));

    if(!myOverlapSection)
    {
        // Full containment is required: the record may not start before
        // the region start nor, when an end is set, end at/after it.
        const bool startsBefore = (record.get0BasedPosition() < myStartPos);
        if(startsBefore ||
           (hasEnd && (record.get0BasedAlignmentEnd() >= myEndPos)))
        {
            inSection = false;
        }
    }
    return(inSection);
}
void Demux::match(){ BamReader bamReader(bamFilePath); SamRecord samRecord; while ( bamReader.getNextRecord(samRecord)) { string recordName(samRecord.getReadName()); recordName = cHandler.decrypt(recordName); //clean record name int len=recordName.find("$"); recordName=recordName.substr(0,len); //clean ended if (!isDecoy(recordName)){ string outputFile = generateFileName(recordName); printf("%s\n",outputFile.c_str()); if (writers.find(outputFile) == writers.end()) //if the BamWriter is not initialized writers[outputFile] = new BamWriter(outputFile, bamReader.getHeader()); BamWriter *writer = writers[outputFile]; writer->writeRecord(samRecord); } } for(auto it=writers.begin();it!=writers.end();it++){ BamWriter *writer = it->second; writer->close(); delete writer; } }
// Pair consecutive records by read name and write them out as FASTQ.
// The first record of a potential pair is remembered (by address, in a
// function-static pointer) until the next call, so the caller must keep
// that record alive until then.
//  - Names match: the pair is counted and written to the first/second
//    files, the first-fragment flag of samRec deciding which is which.
//  - Names differ: the remembered record is written as unpaired, a mate
//    failure is counted, and samRec becomes the remembered record.
void Bam2FastQ::handlePairedRN(SamRecord& samRec)
{
    // Record still waiting for its mate; persists across calls.
    static SamRecord* prevRec = NULL;

    if(prevRec == NULL)
    {
        prevRec = &samRec;
        return;
    }

    if(strcmp(prevRec->getReadName(), samRec.getReadName()) != 0)
    {
        // Read Name does not match, error, did not find pair.
        std::cerr << "Paired Read, " << prevRec->getReadName()
                  << " but couldn't find mate, so writing as "
                  << "unpaired (single-ended)\n";
        ++myNumMateFailures;
        writeFastQ(*prevRec, myUnpairedFile, myUnpairedFileNameExt);
        // Save this record to check against the next one.
        prevRec = &samRec;
        return;
    }

    // Matching read names: found the mate.
    ++myNumPairs;

    const bool curIsFirst = SamFlag::isFirstFragment(samRec.getFlag());
    const bool prevIsFirst = SamFlag::isFirstFragment(prevRec->getFlag());

    if(curIsFirst)
    {
        if(prevIsFirst)
        {
            std::cerr << "Both reads of " << samRec.getReadName()
                      << " are first fragment, so "
                      << "splitting one to be in the 2nd fastq.\n";
        }
        writeFastQ(samRec, myFirstFile, myFirstFileNameExt,
                   myFirstRNExt.c_str());
        writeFastQ(*prevRec, mySecondFile, mySecondFileNameExt,
                   mySecondRNExt.c_str());
    }
    else
    {
        if(!prevIsFirst)
        {
            std::cerr << "Neither read of " << samRec.getReadName()
                      << " are first fragment, so "
                      << "splitting one to be in the 2nd fastq.\n";
        }
        writeFastQ(*prevRec, myFirstFile, myFirstFileNameExt,
                   myFirstRNExt.c_str());
        writeFastQ(samRec, mySecondFile, mySecondFileNameExt,
                   mySecondRNExt.c_str());
    }

    // The pair is complete, so nothing is pending any more.
    prevRec = NULL;
}
// make a 64-bit genomic coordinate [24bit-chr][32bit-pos][8bit-orientation] uint64_t getGenomicCoordinate(SamRecord& r) { // 64bit string consisting of // 24bit refID, 32bit pos, 8bit orientation if ( ( r.getReferenceID() < 0 ) || ( r.get0BasedPosition() < 0 ) ) { return UNMAPPED_GENOMIC_COORDINATE; } else { return ( ( static_cast<uint64_t>(r.getReferenceID()) << 40 ) | ( static_cast<uint64_t>(r.get0BasedPosition()) << 8 ) | static_cast<uint64_t>( r.getFlag() & 0x0010 ) ); } }
// Returns the number of bases in the passed in read that overlap the // region that is currently set. uint32_t SamFile::GetNumOverlaps(SamRecord& samRecord) { if(myRefPtr != NULL) { samRecord.setReference(myRefPtr); } samRecord.setSequenceTranslation(myReadTranslation); // Get the overlaps in the sam record for the region currently set // for this file. return(samRecord.getNumOverlaps(myStartPos, myEndPos)); }
// Read the next record from filePtr into `record`.
// On failure the record's own status is appended to samStatus; on success
// samStatus is left untouched.
void BamInterface::readRecord(IFILE filePtr, SamFileHeader& header,
                              SamRecord& record, SamStatus& samStatus)
{
    // TODO - need to validate there are @SQ lines in both sam/bam - MAYBE!

    // setBufferFromFile resets the record before reading the new one.
    const bool readOk =
        (record.setBufferFromFile(filePtr, header) == SamStatus::SUCCESS);
    if(!readOk)
    {
        samStatus.addError(record.getStatus());
    }
}
void ClipOverlap::cleanupMateMap(MateMapByCoord& mateMap, SamCoordOutput* outputBufferPtr, int32_t chrom, int32_t position) { // Cleanup any reads in the mateMap whose mates are prior to the position // currently being processed in the file. It means the mate was not found // as expected. Stop cleaning up once one is found that is not passed. uint64_t chromPos = 0; if((chrom != -1) && (position != -1)) { chromPos = SamHelper::combineChromPos(chrom, position); } else { chrom = -1; } // Stop after the first read is found whose mate has not yet been reached. SamRecord* firstRec = mateMap.first(); while(firstRec != NULL) { uint64_t firstMateChromPos = SamHelper::combineChromPos(firstRec->getMateReferenceID(), firstRec->get0BasedMatePosition()); if((firstMateChromPos < chromPos) || (chrom == -1)) { // Already past the mate's position, so note this read and // write it. ++myNumMateFailures; if((outputBufferPtr != NULL) && !myOverlapsOnly) { outputBufferPtr->add(firstRec); } else { myPool.releaseRecord(firstRec); } // Remove this record. mateMap.popFirst(); // Get the next record to check. firstRec = mateMap.first(); } else { // The first record's mate position has not yet been passed, so // stop cleaning up the buffer. break; } } }
// Finds the total base quality of a read int Dedup_LowMem::getBaseQuality(SamRecord & record) { const char* baseQualities = record.getQuality(); int readLength = record.getReadLength(); int quality = 0.; if(strcmp(baseQualities, "*") == 0) { return(0); } for(int i=0; i < readLength; ++i) { int q = static_cast<int>(baseQualities[i])-33; if ( q >= myMinQual ) quality += q; } return quality; }
// Filter a read by marking it as unmapped: the unmapped flag is set, the
// proper-pair / secondary / supplementary flags are cleared, the cigar is
// set to "*" and the mapping quality to 0.
void SamFilter::filterRead(SamRecord& record)
{
    uint16_t updatedFlag = record.getFlag();
    SamFlag::setUnmapped(updatedFlag);

    // These flags do not apply to an unmapped read.
    updatedFlag &= ~(SamFlag::PROPER_PAIR |
                     SamFlag::SECONDARY_ALIGNMENT |
                     SamFlag::SUPPLEMENTARY_ALIGNMENT);
    record.setFlag(updatedFlag);

    // No alignment remains, so clear the cigar and the mapping quality.
    record.setCigar("*");
    record.setMapQuality(0);
}
// determine whether the record's position is different from the previous record bool Dedup_LowMem::hasPositionChanged(SamRecord& record) { if (lastReference != record.getReferenceID() || lastCoordinate < record.get0BasedPosition()) { if (lastReference != record.getReferenceID()) { lastReference = record.getReferenceID(); Logger::gLogger->writeLog("Reading ReferenceID %d\n", lastReference); } lastCoordinate = record.get0BasedPosition(); return true; } return false; }
// Write one record to filePtr in FASTQ format and release the record back
// to the pool.  Nothing is written (and the record is not released) when
// filePtr is NULL.
//   readNameExt - text appended to the read name in the output.
// Reverse-strand reads are reverse complemented (with the quality string
// reversed) when myReverseComp is set; otherwise the sequence is upper
// cased.  myRNPlus controls whether the read name is repeated on the '+'
// line.
void Bam2FastQ::writeFastQ(SamRecord& samRec, IFILE filePtr,
                           const char* readNameExt)
{
    // Function-static scratch variables, kept from call to call.
    static int16_t recFlag;
    static std::string bases;
    static String quals;

    if(filePtr == NULL)
    {
        return;
    }

    recFlag = samRec.getFlag();
    const char* name = samRec.getReadName();
    bases = samRec.getSequence();
    quals = samRec.getQuality();

    if(!(SamFlag::isReverse(recFlag) && myReverseComp))
    {
        // Ensure it is all capitalized.
        for(std::string::iterator it = bases.begin(); it != bases.end(); ++it)
        {
            *it = (char)toupper(*it);
        }
    }
    else
    {
        // Reverse strand: reverse complement the sequence and reverse the
        // quality to match.
        BaseUtilities::reverseComplement(bases);
        quals.Reverse();
    }

    if(!myRNPlus)
    {
        ifprintf(filePtr, "@%s%s\n%s\n+\n%s\n", name, readNameExt,
                 bases.c_str(), quals.c_str());
    }
    else
    {
        ifprintf(filePtr, "@%s%s\n%s\n+%s%s\n%s\n", name, readNameExt,
                 bases.c_str(), name, readNameExt, quals.c_str());
    }

    // Release the record.
    myPool.releaseRecord(&samRec);
}
// Read the header and the first ten records from an already opened file
// and validate each against the expected test data.
// Every call with side effects is made outside of assert(), because the
// assert expression is not evaluated when NDEBUG is defined; otherwise no
// record would be read in such a build and the validators would run on an
// empty record.
void testRead(SamFile &inSam)
{
    // Read the SAM Header.
    SamFileHeader samHeader;
    bool ok = inSam.ReadHeader(samHeader);
    assert(ok);

    validateHeader(samHeader);

    testCopyHeader(samHeader);

    testModHeader(samHeader);

    SamRecord samRecord;
    ok = inSam.ReadRecord(samHeader, samRecord);
    assert(ok == true);
    validateRead1(samRecord);

    // Set a new quality and get the buffer.
    samRecord.setQuality("ABCDE");
    validateRead1ModQuality(samRecord);
    //   void* buffer = samRecord.getRecordBuffer();

    ok = inSam.ReadRecord(samHeader, samRecord);
    assert(ok == true);
    validateRead2(samRecord);

    ok = inSam.ReadRecord(samHeader, samRecord);
    assert(ok == true);
    validateRead3(samRecord);

    ok = inSam.ReadRecord(samHeader, samRecord);
    assert(ok == true);
    validateRead4(samRecord);

    ok = inSam.ReadRecord(samHeader, samRecord);
    assert(ok == true);
    validateRead5(samRecord);

    ok = inSam.ReadRecord(samHeader, samRecord);
    assert(ok == true);
    validateRead6(samRecord);

    ok = inSam.ReadRecord(samHeader, samRecord);
    assert(ok == true);
    validateRead7(samRecord);

    ok = inSam.ReadRecord(samHeader, samRecord);
    assert(ok == true);
    validateRead8(samRecord);

    ok = inSam.ReadRecord(samHeader, samRecord);
    assert(ok == true);
    validateRead9(samRecord);

    ok = inSam.ReadRecord(samHeader, samRecord);
    assert(ok == true);
    validateRead10(samRecord);

    // Keeps NDEBUG builds free of a set-but-unused warning.
    (void)ok;
}
// NOTE: Only positions where the reference and read both have bases that // are different and not 'N' are considered mismatches. uint32_t SamFilter::sumMismatchQuality(SamRecord& record, GenomeSequence& refSequence, uint8_t defaultQualityInt) { // Track the mismatch info. int mismatchQual = 0; int numMismatch = 0; SamQuerySeqWithRefIter sequenceIter(record, refSequence); SamSingleBaseMatchInfo baseMatchInfo; while(sequenceIter.getNextMatchMismatch(baseMatchInfo)) { if(baseMatchInfo.getType() == SamSingleBaseMatchInfo::MISMATCH) { // Got a mismatch, get the associated quality. char readQualityChar = record.getQuality(baseMatchInfo.getQueryIndex()); uint8_t readQualityInt = BaseUtilities::getPhredBaseQuality(readQualityChar); if(readQualityInt == BaseUtilities::UNKNOWN_QUALITY_INT) { // Quality was not specified, so use the configured setting. readQualityInt = defaultQualityInt; } mismatchQual += readQualityInt; ++numMismatch; } } return(mismatchQual); }
// Write `record` to filePtr using the requested sequence translation and
// return the status reported by the record.  `header` is not used here.
SamStatus::Status BamInterface::writeRecord(IFILE filePtr,
                                            SamFileHeader& header,
                                            SamRecord& record,
                                            SamRecord::SequenceTranslation translation)
{
    const SamStatus::Status writeStatus =
        record.writeRecordBuffer(filePtr, translation);
    return(writeStatus);
}
void testAddHeaderAndTagToFile(const char* inputName, const char* outputName) { SamFile inSam, outSam; assert(inSam.OpenForRead(inputName)); assert(outSam.OpenForWrite(outputName)); // Read the SAM Header. SamFileHeader samHeader; assert(inSam.ReadHeader(samHeader)); // Add a header line. assert(samHeader.addHeaderLine("@RG\tID:myID\tSM:mySM") == false); assert(samHeader.addHeaderLine("@RG\tID:myID3\tSM:mySM") == true); // Write Header assert(outSam.WriteHeader(samHeader)); SamRecord samRecord; assert(inSam.ReadRecord(samHeader, samRecord)); // validateRead1(samRecord); // Add two tags. assert(samRecord.addIntTag("XA", 123)); assert(samRecord.addIntTag("XA", 456)); assert(samRecord.addTag("RR", 'Z', "myID1")); assert(samRecord.addTag("RR", 'Z', "myID2")); // Write as Sam. assert(outSam.WriteRecord(samHeader, samRecord)); // TODO, add test to verify it was written correctly. // Read a couple of records to make sure it properly can read them even // if they are bigger than the original. assert(inSam.ReadRecord(samHeader, samRecord)); assert(inSam.ReadRecord(samHeader, samRecord)); // Check the MD tag, which requires the reference. GenomeSequence reference("testFiles/chr1_partial.fa"); assert(SamTags::isMDTagCorrect(samRecord, reference) == false); String newMDTag; SamTags::createMDTag(newMDTag, samRecord, reference); assert(newMDTag == "2T1N0"); assert(SamTags::updateMDTag(samRecord, reference)); // Write as Sam. assert(outSam.WriteRecord(samHeader, samRecord)); }
// Flush the output buffer up to a chosen chrom/position and return the
// result of the flush.  The limit is the position of the first record
// still waiting in mateMap; when the map is empty it is the previously
// processed chrom/position, since any new record will have a higher
// coordinate.
bool ClipOverlap::flushOutputBuffer(MateMapByCoord& mateMap,
                                    SamCoordOutput& outputBuffer,
                                    int32_t prevChrom,
                                    int32_t prevPos)
{
    // Default to the previously processed location.
    int32_t flushChrom = prevChrom;
    int32_t flushPos = prevPos;

    SamRecord* pending = mateMap.first();
    if(pending != NULL)
    {
        flushChrom = pending->getReferenceID();
        flushPos = pending->get0BasedPosition();
    }
    return(outputBuffer.flush(flushChrom, flushPos));
}
// given mapping start position // if anchor on left // expected_breakp = position + avr_ins_size / 2; // if anchor on right // expected_breakp = position + read_length - avr_ins_size / 2; // key = round( expected_breakp / WIN ) int Sites::getEstimatedBreakPoint( SamRecord & rec ) { int ep; int clen = GetMaxClipLen(rec); if ( !rec.getFlag() & 0x10 ) { // left anchor if (clen < -MIN_CLIP/2) // end clip of anchor decides position ep = rec.get1BasedAlignmentEnd(); else ep = rec.get1BasedPosition() + avr_ins_size / 3; } else { // right anchor if (clen > MIN_CLIP/2) ep = rec.get1BasedPosition(); else ep = rec.get1BasedPosition() + avr_read_length - avr_ins_size / 3; } return ep; }
// Restore the record's quality string from its original-quality tag
// (SamTags::ORIG_QUAL_TAG), if that tag is present.  Unless myKeepTags is
// set, the tag is removed afterwards.
// Returns false if setting the quality or removing the tag fails; a
// record without the tag is left alone and reported as success.
bool Revert::updateQual(SamRecord& samRecord)
{
    // Get the OQ tag, which is a string.
    const String* oldQual = samRecord.getStringTag(SamTags::ORIG_QUAL_TAG);

    bool status = true;

    if(oldQual != NULL)
    {
        // The old quality was found, so set it in the record.
        status &= samRecord.setQuality((*oldQual).c_str());

        if(!myKeepTags)
        {
            // Remove the tag; a failure is reported to the caller, in the
            // same way updateCigar reports its tag removals.
            status &= samRecord.rmTag(SamTags::ORIG_QUAL_TAG,
                                      SamTags::ORIG_QUAL_TAG_TYPE);
        }
    }
    return(status);
}
// Return the larger of the record's begin and end clip counts, with the
// sign telling which end it came from: the begin clip count is returned as
// is (also on a tie), the end clip count is returned negated.
int getMaxClipLen( SamRecord & sam_rec )
{
    Cigar * cigar = sam_rec.getCigarInfo();
    const int front_clip = cigar->getNumBeginClips();
    const int back_clip = cigar->getNumEndClips();
    return (front_clip >= back_clip) ? front_clip : -back_clip;
}
// Restore the record's cigar and position from the original-cigar
// (SamTags::ORIG_CIGAR_TAG) and original-position (SamTags::ORIG_POS_TAG)
// tags, whichever of them are present.  Unless myKeepTags is set, each tag
// that was applied is removed afterwards.
// Returns false if any of the set or remove operations fails.
bool Revert::updateCigar(SamRecord& samRecord)
{
    // Both tags are looked up before the record is modified: the OC tag is
    // a string, the OP tag an integer.
    const String* origCigar = samRecord.getStringTag(SamTags::ORIG_CIGAR_TAG);
    int* origPos = samRecord.getIntegerTag(SamTags::ORIG_POS_TAG);

    const bool dropTags = !myKeepTags;
    bool allOk = true;

    if(origCigar != NULL)
    {
        // The old cigar was found, so put it back.
        allOk &= samRecord.setCigar(origCigar->c_str());
        if(dropTags)
        {
            allOk &= samRecord.rmTag(SamTags::ORIG_CIGAR_TAG,
                                     SamTags::ORIG_CIGAR_TAG_TYPE);
        }
    }

    if(origPos != NULL)
    {
        // The old position was found, so put it back.
        allOk &= samRecord.set1BasedPosition(*origPos);
        if(dropTags)
        {
            allOk &= samRecord.rmTag(SamTags::ORIG_POS_TAG,
                                     SamTags::ORIG_POS_TAG_TYPE);
        }
    }

    return(allOk);
}
// get the libraryID of a record uint32_t Dedup_LowMem::getLibraryID(SamRecord& record, bool checkTags) { if ( ( checkTags == false ) && ( numLibraries <= 1 ) ) { return 0; } else { char tag[3]; char vtype; void* value; std::string rgID; record.resetTagIter(); while( record.getNextSamTag(tag,vtype,&value) != false ) { if ( ( tag[0] == 'R' ) && ( tag[1] == 'G' ) && ( vtype == 'Z' ) ) { if ( !rgID.empty() ) { Logger::gLogger->error("Multiple RG tag found in one record. ReadName is %s",record.getReadName()); } else if ( record.isStringType(vtype) ) { String s = (String)*(String*)value; rgID = s.c_str(); } else { Logger::gLogger->error("vtype is not string (Z) for RG tag"); } } } if ( rgID.empty() ) { Logger::gLogger->error("No RG tag is found in read %s",record.getReadName()); return 0; } else { std::map<std::string,uint32_t>::iterator it = rgidLibMap.find(rgID); if ( it != rgidLibMap.end() ) { return it->second; } else { Logger::gLogger->warning("RG tag %s does not exist in the header",rgID.c_str()); return 0; // cannot be reached } } } }
// Accumulate per-base depth from the reads of a BAM file that do not fall
// in the configured regions.
//   bamFile       - path of the BAM file to read.
//   minMapQuality - reads with a lower mapping quality are skipped.
// Every read is counted in nReads (and nUnMapped when flagged unmapped).
// For a read that passes the quality filter and is not reported by
// IsInRegions(), depth[] is incremented (capped at MAXDP) over read-length
// bases from the read's genome position, and nMappedOutTargets is counted.
void GenomeRegionSeqStats::CalcClusters(String &bamFile, int minMapQuality)
{
    SamFile sam;
    SamRecord samRecord;
    SamFileHeader samHeader;

    if(!sam.OpenForRead(bamFile.c_str()))
        error("Open BAM file %s failed!\n", bamFile.c_str());

    if(!sam.ReadHeader(samHeader)) {
        error("Read BAM file header %s failed!\n", bamFile.c_str());
    }

    if(depth.size()==0) depth.resize(referencegenome.sequenceLength());

    String contigLabel;
    Reset();

    while(sam.ReadRecord(samHeader, samRecord))
    {
        nReads++;
        if(samRecord.getFlag() & SamFlag::UNMAPPED) nUnMapped++;

        if(samRecord.getMapQuality() < minMapQuality) continue;

        // Length of a leading soft clip, if the cigar starts with one.
        CigarRoller cigar(samRecord.getCigar());
        int leadingSoftClip = 0;
        if(cigar.size()!=0 && cigar[0].operation==Cigar::softClip)
            leadingSoftClip = cigar[0].count;

        contigLabel = samRecord.getReferenceName();
        uint32_t start = leadingSoftClip + samRecord.get0BasedPosition(); // start is 0-based
        uint32_t readLength = samRecord.getReadLength();
        uint32_t gstart = referencegenome.getGenomePosition(contigLabel.c_str(), start);

        if(IsInRegions(contigLabel, start, start+readLength)) continue;

        // Work in 64 bits and clamp to the depth array, so a read running
        // past the end of the reference (or a genome position outside the
        // array) can never index depth[] out of range.
        uint64_t gend = static_cast<uint64_t>(gstart) + readLength;
        if(gend > depth.size()) gend = depth.size();
        for(uint64_t i=gstart; i<gend; i++)
            if(depth[i]<MAXDP) depth[i]++;

        nMappedOutTargets++;
    }
}
// Write a record to the currently opened file. bool SamFile::WriteRecord(SamFileHeader& header, SamRecord& record) { if(myIsOpenForWrite == false) { // File is not open for writing myStatus.setStatus(SamStatus::FAIL_ORDER, "Cannot write record since the file is not open for writing"); return(false); } if(myHasHeader == false) { // The header has not yet been written. myStatus.setStatus(SamStatus::FAIL_ORDER, "Cannot write record since the header has not been written"); return(false); } // Before trying to write the record, validate the sort order. if(!validateSortOrder(record, header)) { // Not sorted like it is supposed to be, do not write the record myStatus.setStatus(SamStatus::INVALID_SORT, "Cannot write the record since the file is not properly sorted."); return(false); } if(myRefPtr != NULL) { record.setReference(myRefPtr); } // File is open for writing and the header has been written, so write the // record. myStatus = myInterfacePtr->writeRecord(myFilePtr, header, record, myWriteTranslation); if(myStatus == SamStatus::SUCCESS) { // A record was successfully written, so increment the record count. myRecordCount++; return(true); } return(false); }
bool Demux::verify(){ //at least one decoy should be present BamReader bamReader(bamFilePath); SamRecord samRecord; int verifiedDecoys = 0; while ( bamReader.getNextRecord(samRecord)) { string recordName(samRecord.getReadName()); recordName = cHandler.decrypt(recordName); //clean record name int len=recordName.find("$"); recordName=recordName.substr(0,len); //clean ended if (isDecoy(recordName)){ char decoyChromosome[2][20]; int decoyLocation[2]; int decoyJobID; sscanf( recordName.c_str(),"DECOY.%[^.].%d_%[^.].%d.%d", decoyChromosome[0], decoyLocation, decoyChromosome[1], decoyLocation + 1, &decoyJobID); int pair = !SamFlag::isFirstFragment(samRecord.getFlag()); if (strcmp(decoyChromosome[pair],samRecord.getReferenceName())){ cout << "Chromosome mismatch\n"; cout << "Expected: " << decoyChromosome[pair] << endl; cout << "Got: " << samRecord.getReferenceName() << endl; return false; } if ( decoyLocation[pair] != samRecord.get0BasedPosition()){ cout << "Position mismatch\n"; cout << "Expected: " << decoyLocation[pair] << endl; cout << "Got: " << samRecord.get0BasedPosition() << endl; return false; } if (decoyJobID != jobID){ cout << "Job ID mismatch\n"; cout << "Expected: " << decoyJobID << endl; cout << "Got: " << jobID << endl; return false; } verifiedDecoys++; } } if (verifiedDecoys) return true; else return false; }
void Sites::addToCurrentCluster( vector<bool> & is_in_coord, SingleSite & new_site, SamRecord & rec ) { if (is_in_coord.size() != NMEI) morphError("[Sites::setNewCluster] is_in_coord size error"); // update breakpoint int old_evi = new_site.evidence; float a1 = (float)1 / float(old_evi+1); int ep = getEstimatedBreakPoint(rec); new_site.breakp = round( a1 * (float)ep + (float)new_site.breakp * (1-a1)); new_site.evidence++; // update position if (rec.get1BasedPosition() < new_site.start) new_site.start = rec.get1BasedPosition(); else if (rec.get1BasedAlignmentEnd() > new_site.end) new_site.end = rec.get1BasedAlignmentEnd(); // update info if (rec.getFlag() & 0x10) { if (new_site.right_clip_only) { Cigar * myCigar = rec.getCigarInfo(); int begin_clip = myCigar->getNumBeginClips(); if ( begin_clip < MIN_CLIP/2) new_site.right_clip_only = 0; } for(int m=0; m<NMEI; m++) { if (is_in_coord[m]) new_site.right[m]++; } } else { if (new_site.left_clip_only) { Cigar * myCigar = rec.getCigarInfo(); int end_clip = myCigar->getNumEndClips(); if (end_clip < MIN_CLIP/2) new_site.left_clip_only = 0; } for( int m=0; m<NMEI; m++) { if (is_in_coord[m]) new_site.left[m]++; } } }
void Sites::setNewCluster( vector<bool> & is_in_coord, SingleSite & new_site, SamRecord & rec ) { if (is_in_coord.size() != NMEI) morphError("[Sites::setNewCluster] is_in_coord size error"); // set info new_site.breakp = getEstimatedBreakPoint(rec); new_site.rcount = 1; new_site.evidence = 1; for(int m=0; m<NMEI; m++) { new_site.left[m] = 0; new_site.right[m] = 0; } new_site.left_clip_only = 1; new_site.right_clip_only = 1; new_site.depth = current_depth; new_site.depth_add = 1; // set position & mtype if ( rec.getFlag() & 0x10 ) { // right anchor new_site.start = rec.get1BasedPosition(); new_site.end = rec.get1BasedAlignmentEnd(); Cigar * myCigar = rec.getCigarInfo(); int begin_clip = myCigar->getNumBeginClips(); if ( begin_clip < MIN_CLIP/2) new_site.right_clip_only = 0; for(int m=0; m<NMEI; m++) { if (is_in_coord[m]) new_site.right[m] = 1; } } else { new_site.start = rec.get1BasedPosition(); new_site.end = rec.get1BasedAlignmentEnd(); Cigar * myCigar = rec.getCigarInfo(); int end_clip = myCigar->getNumEndClips(); if (end_clip < MIN_CLIP/2) new_site.left_clip_only = 0; for(int m=0; m<NMEI; m++) { if (is_in_coord[m]) new_site.left[m] = 1; } } }
// Count the reads that span position `pos` on chromosome `chr`.
// Reads are taken from the window
// [pos - 2*avr_read_length, pos + avr_read_length/2].  A read is counted
// when it has flag bit 0x2 set, is not supplementary, is at least
// avr_read_length/3 long, has mapping quality of at least MIN_QUALITY, and
// covers pos with MIN_CLIP/2 bases to spare on both sides.
// Returns 0 when the read section cannot be set.
int Likelihood::getFlankingCount( string & chr, int pos )
{
    if ( !bam.SetReadSection( chr.c_str(), pos - avr_read_length * 2, pos + avr_read_length/2 ) )
        return 0;

    int spanning = 0;
    SamRecord rec;
    while( bam.ReadRecord(bam_header, rec) ) {
        const bool usable = ( rec.getFlag() & 0x2 )
                         && !IsSupplementary(rec.getFlag())
                         && !( rec.getReadLength() < avr_read_length / 3 )
                         && !( rec.getMapQuality() < MIN_QUALITY );
        if ( !usable )
            continue;

        const bool covers_pos = ( rec.get1BasedPosition() + MIN_CLIP/2 <= pos )
                             && ( rec.get1BasedAlignmentEnd() - MIN_CLIP/2 >= pos );
        if ( covers_pos )
            spanning++;
    }
    return spanning;
}
// Read a BAM file and update the per-region statistics with every read.
//   bamFile - path of the BAM file to read.
// Every read is counted in nReads (and nUnMapped when flagged unmapped).
// Until contigFinishedCnt reaches contigs.size(), each read's span is
// passed to UpdateRegionStats() and counted in nMapped2Targets when that
// returns true.  Afterwards the GC-bin read counts are computed and the
// total number of reads is printed.
void GenomeRegionSeqStats::CalcRegionStats(String &bamFile)
{
    SamFile bamIn;
    SamRecord rec;
    SamFileHeader hdr;

    if(!bamIn.OpenForRead(bamFile.c_str()))
        error("Open BAM file %s failed!\n", bamFile.c_str());

    if(!bamIn.ReadHeader(hdr))
        error("Read BAM file header %s failed!\n", bamFile.c_str());

    String contigLabel;
    Reset();

    while(bamIn.ReadRecord(hdr, rec))
    {
        nReads++;
        if(rec.getFlag() & SamFlag::UNMAPPED) nUnMapped++;

        if(contigFinishedCnt>=contigs.size()) continue;

        // Length of a leading soft clip, if the cigar starts with one.
        CigarRoller cigar(rec.getCigar());
        int leadingSoftClip = 0;
        if(cigar.size()!=0 && cigar[0].operation==Cigar::softClip)
            leadingSoftClip = cigar[0].count;

        contigLabel = rec.getReferenceName();
        int start = leadingSoftClip + rec.get0BasedPosition(); // 0-based
        int end = start + rec.getReadLength() - 1;

        if(UpdateRegionStats(contigLabel, start, end))
            nMapped2Targets++;
    }

    CalcRegionReadCountInGCBins();
    CalcGroupReadCountInGCBins();
    std::cout << "Total reads : " << nReads << std::endl;
}
int Dedup_LowMem::execute(int argc, char** argv) { /* -------------------------------- * process the arguments * -------------------------------*/ String inFile, outFile, logFile; myDoRecab = false; bool removeFlag = false; bool verboseFlag = false; myForceFlag = false; myNumMissingMate = 0; myMinQual = DEFAULT_MIN_QUAL; String excludeFlags = "0xB04"; uint16_t intExcludeFlags = 0; bool noeof = false; bool params = false; LongParamContainer parameters; parameters.addGroup("Required Parameters"); parameters.addString("in", &inFile); parameters.addString("out", &outFile); parameters.addGroup("Optional Parameters"); parameters.addInt("minQual", & myMinQual); parameters.addString("log", &logFile); parameters.addBool("oneChrom", &myOneChrom); parameters.addBool("recab", &myDoRecab); parameters.addBool("rmDups", &removeFlag); parameters.addBool("force", &myForceFlag); parameters.addString("excludeFlags", &excludeFlags); parameters.addBool("verbose", &verboseFlag); parameters.addBool("noeof", &noeof); parameters.addBool("params", ¶ms); parameters.addPhoneHome(VERSION); myRecab.addRecabSpecificParameters(parameters); ParameterList inputParameters; inputParameters.Add(new LongParameters ("Input Parameters", parameters.getLongParameterList())); // parameters start at index 2 rather than 1. inputParameters.Read(argc, argv, 2); // If no eof block is required for a bgzf file, set the bgzf file type to // not look for it. if(noeof) { // Set that the eof block is not required. 
BgzfFileType::setRequireEofBlock(false); } if(inFile.IsEmpty()) { printUsage(std::cerr); inputParameters.Status(); std::cerr << "Specify an input file" << std::endl; return EXIT_FAILURE; } if(outFile.IsEmpty()) { printUsage(std::cerr); inputParameters.Status(); std::cerr << "Specify an output file" << std::endl; return EXIT_FAILURE; } intExcludeFlags = excludeFlags.AsInteger(); if(myForceFlag && SamFlag::isDuplicate(intExcludeFlags)) { printUsage(std::cerr); inputParameters.Status(); std::cerr << "Cannot specify --force and Duplicate in the excludeFlags. Since --force indicates to override" << " previous duplicate setting and the excludeFlags says to skip those, you can't do both.\n"; return EXIT_FAILURE; } if(!SamFlag::isSecondary(intExcludeFlags)) { printUsage(std::cerr); inputParameters.Status(); std::cerr << "ERROR: Secondary reads must be excluded, edit --excludeFlags to include 0x0100\n"; return EXIT_FAILURE; } if(!(intExcludeFlags & SamFlag::SUPPLEMENTARY_ALIGNMENT)) { printUsage(std::cerr); inputParameters.Status(); std::cerr << "ERROR: Supplementary reads must be excluded, edit --excludeFlags to include 0x0800\n"; return EXIT_FAILURE; } if(logFile.IsEmpty()) { logFile = outFile + ".log"; } if(myDoRecab) { int status = myRecab.processRecabParam(); if(status != 0) { inputParameters.Status(); return(status); } } if(params) { inputParameters.Status(); } Logger::gLogger = new Logger(logFile.c_str(), verboseFlag); /* ------------------------------------------------------------------- * The arguments are processed. Prepare the input BAM file, * instantiate dedup_LowMem, and construct the read group library map * ------------------------------------------------------------------*/ SamFile samIn; samIn.OpenForRead(inFile.c_str()); // If the file isn't sorted it will throw an exception. 
samIn.setSortedValidation(SamFile::COORDINATE); SamFileHeader header; samIn.ReadHeader(header); buildReadGroupLibraryMap(header); lastReference = -1; lastCoordinate = -1; // for keeping some basic statistics uint32_t recordCount = 0; uint32_t pairedCount = 0; uint32_t properPairCount = 0; uint32_t unmappedCount = 0; uint32_t reverseCount = 0; uint32_t qualCheckFailCount = 0; uint32_t secondaryCount = 0; uint32_t supplementaryCount = 0; uint32_t excludedCount = 0; // Now we start reading records SamRecord* recordPtr; SamStatus::Status returnStatus = SamStatus::SUCCESS; while(returnStatus == SamStatus::SUCCESS) { recordPtr = mySamPool.getRecord(); if(recordPtr == NULL) { std::cerr << "Failed to allocate enough records\n"; return(-1); } if(!samIn.ReadRecord(header, *recordPtr)) { returnStatus = samIn.GetStatus(); continue; } // Take note of properties of this record int flag = recordPtr->getFlag(); if(SamFlag::isPaired(flag)) ++pairedCount; if(SamFlag::isProperPair(flag)) ++properPairCount; if(SamFlag::isReverse(flag)) ++reverseCount; if(SamFlag::isQCFailure(flag)) ++qualCheckFailCount; if(SamFlag::isSecondary(flag)) ++secondaryCount; if(flag & SamFlag::SUPPLEMENTARY_ALIGNMENT) ++supplementaryCount; if(!SamFlag::isMapped(flag)) ++unmappedCount; // put the record in the appropriate maps: // single reads go in myFragmentMap // paired reads go in myPairedMap recordCount = samIn.GetCurrentRecordCount(); // if we have moved to a new position, look back at previous reads for duplicates if (hasPositionChanged(*recordPtr)) { cleanupPriorReads(recordPtr); } // Determine if this read should be checked for duplicates. if((!SamFlag::isMapped(flag)) || ((flag & intExcludeFlags) != 0)) { ++excludedCount; // No deduping done on this record, but still build the recab table. if(myDoRecab) { myRecab.processReadBuildTable(*recordPtr); } // Nothing more to do with this record, so // release the pointer. 
mySamPool.releaseRecord(recordPtr); } else { if(SamFlag::isDuplicate(flag) && !myForceFlag) { // Error: Marked duplicates, and duplicates aren't excluded. Logger::gLogger->error("There are records already duplicate marked."); Logger::gLogger->error("Use -f to clear the duplicate flag and start the dedup_LowMem procedure over"); } checkDups(*recordPtr, recordCount); mySamPool.releaseRecord(recordPtr); } // let the user know we're not napping if (verboseFlag && (recordCount % 100000 == 0)) { Logger::gLogger->writeLog("recordCount=%u singleKeyMap=%u pairedKeyMap=%u, dictSize=%u", recordCount, myFragmentMap.size(), myPairedMap.size(), myMateMap.size()); } } // we're finished reading record so clean up the duplicate search and // close the input file cleanupPriorReads(NULL); samIn.Close(); // print some statistics Logger::gLogger->writeLog("--------------------------------------------------------------------------"); Logger::gLogger->writeLog("SUMMARY STATISTICS OF THE READS"); Logger::gLogger->writeLog("Total number of reads: %u",recordCount); Logger::gLogger->writeLog("Total number of paired-end reads: %u", pairedCount); Logger::gLogger->writeLog("Total number of properly paired reads: %u", properPairCount); Logger::gLogger->writeLog("Total number of unmapped reads: %u", unmappedCount); Logger::gLogger->writeLog("Total number of reverse strand mapped reads: %u", reverseCount); Logger::gLogger->writeLog("Total number of QC-failed reads: %u", qualCheckFailCount); Logger::gLogger->writeLog("Total number of secondary reads: %u", secondaryCount); Logger::gLogger->writeLog("Total number of supplementary reads: %u", supplementaryCount); Logger::gLogger->writeLog("Size of singleKeyMap (must be zero): %u", myFragmentMap.size()); Logger::gLogger->writeLog("Size of pairedKeyMap (must be zero): %u", myPairedMap.size()); Logger::gLogger->writeLog("Total number of missing mates: %u", myNumMissingMate); Logger::gLogger->writeLog("Total number of reads excluded from duplicate 
checking: %u", excludedCount); Logger::gLogger->writeLog("--------------------------------------------------------------------------"); Logger::gLogger->writeLog("Sorting the indices of %d duplicated records", myDupList.size()); // sort the indices of duplicate records std::sort(myDupList.begin(), myDupList.end(), std::less<uint32_t> ()); // get ready to write the output file by making a second pass // through the input file samIn.OpenForRead(inFile.c_str()); samIn.ReadHeader(header); SamFile samOut; samOut.OpenForWrite(outFile.c_str()); samOut.WriteHeader(header); // If we are recalibrating, output the model information. if(myDoRecab) { myRecab.modelFitPrediction(outFile); } // an iterator to run through the duplicate indices int currentDupIndex = 0; bool moreDups = !myDupList.empty(); // let the user know what we're doing Logger::gLogger->writeLog("\nWriting %s", outFile.c_str()); // count the duplicate records as a check uint32_t singleDuplicates(0), pairedDuplicates(0); // start reading records and writing them out SamRecord record; while(samIn.ReadRecord(header, record)) { uint32_t currentIndex = samIn.GetCurrentRecordCount(); bool foundDup = moreDups && (currentIndex == myDupList[currentDupIndex]); // modify the duplicate flag and write out the record, // if it's appropriate int flag = record.getFlag(); if (foundDup) { // this record is a duplicate, so mark it. record.setFlag( flag | 0x400 ); currentDupIndex++; // increment duplicate counters to verify we found them all if ( ( ( flag & 0x0001 ) == 0 ) || ( flag & 0x0008 ) ) { // unpaired or mate unmapped singleDuplicates++; } else { pairedDuplicates++; } // recalibrate if necessary. 
if(myDoRecab) { myRecab.processReadApplyTable(record); } // write the record if we are not removing duplicates if (!removeFlag ) samOut.WriteRecord(header, record); } else { if(myForceFlag) { // this is not a duplicate we've identified but we want to // remove any duplicate marking record.setFlag( flag & 0xfffffbff ); // unmark duplicate } // Not a duplicate, so recalibrate if necessary. if(myDoRecab) { myRecab.processReadApplyTable(record); } samOut.WriteRecord(header, record); } // Let the user know we're still here if (verboseFlag && (currentIndex % 100000 == 0)) { Logger::gLogger->writeLog("recordCount=%u", currentIndex); } } // We're done. Close the files and print triumphant messages. samIn.Close(); samOut.Close(); Logger::gLogger->writeLog("Successfully %s %u unpaired and %u paired duplicate reads", removeFlag ? "removed" : "marked" , singleDuplicates, pairedDuplicates/2); Logger::gLogger->writeLog("\nDedup_LowMem complete!"); return 0; }