std::string getQuickStats(const std::string &bamFile, std::map< std::string, int > &keyLen, unsigned int &nFlowFZ, unsigned int &nFlowZM) { std::string errMsg = ""; BamTools::BamReader bamReader; if(!bamReader.Open(bamFile)) { errMsg += "Failed to open bam " + bamFile + "\n"; return(errMsg); } BamTools::SamHeader samHeader = bamReader.GetHeader(); for (BamTools::SamReadGroupIterator itr = samHeader.ReadGroups.Begin(); itr != samHeader.ReadGroups.End(); ++itr ) { if(itr->HasID()) keyLen[itr->ID] = itr->HasKeySequence() ? itr->KeySequence.length() : 0; if(itr->HasFlowOrder()) nFlowZM = std::max(nFlowZM,(unsigned int) itr->FlowOrder.length()); } BamTools::BamAlignment alignment; std::vector<uint16_t> flowIntFZ; while(bamReader.GetNextAlignment(alignment)) { if(alignment.GetTag("FZ", flowIntFZ)) nFlowFZ = flowIntFZ.size(); break; } bamReader.Close(); if(nFlowFZ==0) std::cout << "NOTE: bam file has no flow signals in FZ tag: " + bamFile + "\n"; if(nFlowZM==0) std::cout << "NOTE: bam file has no flow signals in ZM tag: " + bamFile + "\n"; return(errMsg); }
bool GetBamTags(BamTools::BamAlignment &alignment, const int &num_flows, vector<float> &measurements, vector<float> &phase_params, int &start_flow) { vector<int16_t> quantized_measurements; // Retrieve normalized measurements from BAM file if (not alignment.GetTag("ZM", quantized_measurements)) { cerr << "ERROR: Normalized measurements ZM:tag is not present in read " << alignment.Name << endl; return false; } if ((int)quantized_measurements.size() > num_flows) { cerr << "ERROR: Normalized measurements ZM:tag length exceeds flow order length in read " << alignment.Name << endl; return false; } measurements.assign(quantized_measurements.size(), 0.0); for (size_t counter = 0; counter < quantized_measurements.size(); ++counter) measurements.at(counter) = (float)quantized_measurements.at(counter)/256; // Retrieve phasing parameters from BAM file if (not alignment.GetTag("ZP", phase_params)) { cerr << "ERROR: Phasing Parameters ZP:tag is not present in read " << alignment.Name << endl; return false; } if (phase_params.size() != 3) { cerr << "ERROR: Phasing Parameters ZP:tag does not have 3 phase parameters in read " << alignment.Name << endl; return false; } if (phase_params[0] < 0 or phase_params[0] > 1 or phase_params[1] < 0 or phase_params[1] > 1 or phase_params[2] < 0 or phase_params[2] > 1) { cerr << "ERROR: Phasing Parameters ZP:tag outside of [0,1] range in read " << alignment.Name << endl; return false; } phase_params[2] = 0.0f; // ad-hoc corrector: zero droop // Retrieve start flow if (not alignment.GetTag("ZF", start_flow)) { cerr << "ERROR: Start Flow ZF:tag not found in read " << alignment.Name << endl; return false; } if (start_flow < 0 or start_flow >= num_flows) { cerr << "ERROR: Start flow outsize of [0,num_flows) range in read " << alignment.Name << endl; cerr << "Start flow: " << start_flow << " Number of flows: " << num_flows; return false; } // A start flow of zero indicated a read that did not pass basecaller filters if (start_flow == 0) { cerr << 
"WARNING: Start Flow ZF:tag has zero value in read " << alignment.Name << endl; return false; } return true; }
/**
 * Read the haplotype assignment stored under HAPLOTYPE_TAG.
 *
 * @param aln alignment to query
 * @return -1 when the alignment has no such tag, otherwise the stored value
 *         (asserted to be 1 or 2). If the tag exists but cannot be read as a
 *         uint8_t, printErrorAndDie is called.
 */
int SNPBamProcessor::get_haplotype(BamTools::BamAlignment& aln){
  if (!aln.HasTag(HAPLOTYPE_TAG))
    return -1;

  // Initialised so the assert below is deterministic if the error handler returns.
  uint8_t haplotype = 0;
  if (!aln.GetTag(HAPLOTYPE_TAG, haplotype)){
    // Report the stored type: the tag exists, so a type mismatch is the likely cause.
    char type = '?';
    aln.GetTagType(HAPLOTYPE_TAG, type);
    printErrorAndDie(std::string("Failed to extract haplotype tag (stored BAM tag type '") + type + "')");
  }
  assert(haplotype == 1 || haplotype == 2);
  return (int)haplotype;
}
/**
 * Map an alignment to its library through its RG (read group) tag.
 *
 * @param aln            alignment whose RG tag is consulted
 * @param rg_to_library  read group -> library name lookup table
 * @return the library registered for the alignment's read group
 *
 * printErrorAndDie is called when the RG tag is missing, cannot be read as a
 * string, or names a read group absent from rg_to_library.
 */
std::string get_library(BamTools::BamAlignment& aln, std::map<std::string, std::string>& rg_to_library){
  const std::string rg_tag = "RG";

  char tag_type = 'Z';
  if (!aln.GetTagType(rg_tag, tag_type))
    printErrorAndDie("Failed to retrieve BAM alignment's RG tag");

  // Check the extraction as well: a tag of the wrong type used to leave rg empty
  // and surface as a misleading "No library found" error further down.
  std::string rg;
  if (!aln.GetTag(rg_tag, rg))
    printErrorAndDie("Failed to read BAM alignment's RG tag as a string");

  auto iter = rg_to_library.find(rg);
  if (iter == rg_to_library.end())
    printErrorAndDie("No library found for read group " + rg + " in BAM file headers");
  return iter->second;
}
bool getNextAlignment(BamTools::BamAlignment &alignment, BamTools::BamReader &bamReader, const std::map<std::string, int> &groupID, std::vector< BamTools::BamAlignment > &alignmentSample, std::map<std::string, int> &wellIndex, unsigned int nSample) { if(nSample > 0) { // We are randomly sampling, so next read should come from the sample that was already taken from the bam file if(alignmentSample.size() > 0) { alignment = alignmentSample.back(); alignmentSample.pop_back(); alignment.BuildCharData(); return(true); } else { return(false); } } else { // No random sampling, so we're either returning everything or we're looking for specific read names bool storeRead = false; while(bamReader.GetNextAlignment(alignment)) { if(groupID.size() > 0) { std::string thisReadGroupID = ""; if( !alignment.GetTag("RG", thisReadGroupID) || (groupID.find(thisReadGroupID)==groupID.end()) ); continue; } storeRead=true; if(wellIndex.size() > 0) { // We are filtering by position, so check if we should skip or keep the read int thisCol,thisRow; if(1 != ion_readname_to_rowcol(alignment.Name.c_str(), &thisRow, &thisCol)) std::cerr << "Error parsing read name: " << alignment.Name << "\n"; std::stringstream wellIdStream; wellIdStream << thisCol << ":" << thisRow; std::map<std::string, int>::iterator wellIndexIter; wellIndexIter = wellIndex.find(wellIdStream.str()); if(wellIndexIter != wellIndex.end()) { // If the read ID matches we should keep, unless its a duplicate if(wellIndexIter->second >= 0) { storeRead=true; wellIndexIter->second=-1; } else { storeRead=false; std::cerr << "WARNING: found extra instance of readID " << wellIdStream.str() << ", keeping only first\n"; } } else { // read ID is not one we should keep storeRead=false; } } if(storeRead) break; } return(storeRead); } }
/**
 * Update the per-contig read counters for one alignment.
 *
 * @param aread   alignment being tracked
 * @param endPos  end position of the read; when 0, aread.GetEndPosition() is used
 *
 * Total reads are counted per strand on the current contig; the on-target
 * counter for that strand is also incremented when bit 0 of the value returned
 * by ReadOnRegion is set.
 */
void RegionCoverage::TrackReadsOnRegion( const BamTools::BamAlignment &aread, uint32_t endPos )
{
  // track total and on-target reads
  const uint32_t readEnd = endPos ? endPos : aread.GetEndPosition();
  const uint32_t covType = ReadOnRegion( aread.RefID, aread.Position + 1, readEnd );
  // Looked up after ReadOnRegion, as in the original statement order.
  TargetContig *contig = m_contigList[m_rcovContigIdx];
  const bool onTarget = (covType & 1) != 0;

  // Reverse-strand reads go to the rev* counters and forward reads to fwd*,
  // matching AmpliconRegionStatistics::TrackReadsOnRegion (they were swapped here).
  if( aread.IsReverseStrand() ) {
    ++contig->revReads;
    if( onTarget ) ++contig->revTrgReads;
  } else {
    ++contig->fwdReads;
    if( onTarget ) ++contig->fwdTrgReads;
  }
}
/**
 * Read a float-valued tag from an alignment.
 *
 * @param aln          alignment to query
 * @param tag_name     tag to read
 * @param destination  receives the value; must point to a valid float
 * @return the result of BamAlignment::GetTag for that tag
 */
bool ReadContainer::GetFloatBamTag(const BamTools::BamAlignment& aln, const std::string& tag_name, float* destination) {
  const bool tag_was_read = aln.GetTag(tag_name, *destination);
  return tag_was_read;
}
// Calculate the error rate between the read and the reference double getErrorRate(BamTools::BamAlignment& record) { int nm = 0; bool hasNM = record.GetTag("NM", nm); if(hasNM) return (double)nm / record.Length; else return 0.0f; }
/**
 * Load the molecular tags (ZT prefix, YT suffix) of a read and check them against
 * what the read's read group requires.
 *
 * @param alignment  read to query
 * @param Tags       receives the prefix/suffix tags; a tag the read group does not
 *                   need is cleared
 * @return true when no read group uses tags at all (Tags is cleared), or when the
 *         read carries every tag its read group needs and Tags.HasTags() holds;
 *         false when the RG tag is missing or unknown, a needed tag is empty, or
 *         the read ends up with no tags
 */
bool MolecularTagTrimmer::GetTagsFromBamAlignment(const BamTools::BamAlignment& alignment, MolTag& Tags)
{
  // Nothing to look at when no read group has tags
  if (num_read_groups_with_tags_ == 0) {
    Tags.Clear();
    return true;
  }

  // Pull both tags from the alignment; a missing tag becomes an empty string
  const bool have_prefix = alignment.GetTag("ZT", Tags.prefix_mol_tag);
  if (not have_prefix)
    Tags.prefix_mol_tag.clear();
  const bool have_suffix = alignment.GetTag("YT", Tags.suffix_mol_tag);
  if (not have_suffix)
    Tags.suffix_mol_tag.clear();

  // Resolve the read group to learn which tags this read should have
  string read_group_name;
  if (not alignment.GetTag("RG", read_group_name))
    return false;
  std::map<string,int>::const_iterator idx_it = read_group_name_to_index_.find(read_group_name);
  if (idx_it == read_group_name_to_index_.end())
    return false;

  if (not NeedPrefixTag(idx_it->second))
    Tags.prefix_mol_tag.clear();
  else if (Tags.prefix_mol_tag.empty())
    return false;

  if (not NeedSuffixTag(idx_it->second))
    Tags.suffix_mol_tag.clear();
  else if (Tags.suffix_mol_tag.empty())
    return false;

  // We don't allow the joint analysis of tagged and untagged samples at the same time
  if (Tags.HasTags())
    return true;
  return false;
}
uint32_t BamAlignmentReader::GetReadLength(const std::string& bamPath) { uint32_t bamReadLength = 300; BamTools::BamReader bamReader; if (!bamReader.Open(bamPath)) { throw "Unable to open bam file"; } BamTools::BamAlignment bamAlignment; while(bamReader.GetNextAlignment(bamAlignment)) { if (bamAlignment.IsPrimaryAlignment()) { bamReadLength = bamAlignment.QueryBases.size(); break; } } bamReader.Close(); return bamReadLength; }
// Read an alignment pair from the BamReader. // Returns false if the read fails bool readAlignmentPair(BamTools::BamReader* pReader, BamTools::BamAlignment& record1, BamTools::BamAlignment& record2) { // Read a pair from the BAM // Read record 1. Skip secondary alignments of the previous pair do { if(!pReader->GetNextAlignment(record1)) return false; } while(!record1.IsPrimaryAlignment()); // Read record 2. do { if(!pReader->GetNextAlignment(record2)) return false; } while(!record2.IsPrimaryAlignment()); return true; }
std::vector< IAlignment::SharedPtr > BamAlignmentReader::loadAlignmentsInRegion(Region::SharedPtr regionPtr, SampleManager::SharedPtr sampleManagerPtr, bool excludeDuplicateReads) { if (!m_is_open) { std::cout << "Bam file not opened" << std::endl; exit(0); } std::vector< IAlignment::SharedPtr > alignmentPtrs; int refID = this->m_bam_reader->GetReferenceID(regionPtr->getReferenceID()); // add 1 to the start and end positions because this is 0 based this->m_bam_reader->SetRegion(refID, regionPtr->getStartPosition(), refID, regionPtr->getEndPosition()); // std::cout << "BamAlignmentReader.cpp refID: " << refID << std::endl; BamTools::BamAlignment bamAlignment; while(this->m_bam_reader->GetNextAlignment(bamAlignment)) { if (bamAlignment.IsDuplicate() && excludeDuplicateReads) { continue; } std::string sampleName; bamAlignment.GetTag("RG", sampleName); Sample::SharedPtr samplePtr = sampleManagerPtr->getSamplePtr(sampleName); if (samplePtr == nullptr) { throw "There was an error in the sample name for: " + sampleName; } alignmentPtrs.push_back(std::make_shared< BamAlignment >(bamAlignment, samplePtr)); } // std::this_thread::sleep_for(std::chrono::milliseconds(10000)); if (m_alignment_reader_manager_ptr != nullptr) { m_alignment_reader_manager_ptr->checkinReader(this->shared_from_this()); } // std::cout << "got reads: " << regionPtr->getRegionString() << " " << alignmentPtrs.size() << std::endl; return alignmentPtrs; }
/**
 * Record the start position of a read in the strand-specific read collection.
 *
 * @param read   alignment whose Position is recorded
 * @param reads  container holding the pos_reads / neg_reads collections
 * @param chr    chromosome name the position is filed under
 *
 * Reads whose Position is not strictly positive are ignored.
 */
void bamParser::insertRead(const BamTools::BamAlignment& read, Reads& reads, string& chr)
{
    const int32_t loc = read.Position;
    if (loc <= 0)
        return;

    const uint32_t start = (uint32_t) loc;
    if (read.IsReverseStrand())
        reads.neg_reads.insertRead(chr, start);
    else
        reads.pos_reads.insertRead(chr, start);
}
namespace {

// Read tag_name as a value of type Stored and, on success, store it in
// *destination converted to int. Returns whether the tag could be read.
template <typename Stored>
bool ReadIntTagAs(const BamTools::BamAlignment& aln, const std::string& tag_name, int* destination) {
  Stored raw;
  if (!aln.GetTag(tag_name, raw)) { return false; }
  *destination = static_cast<int>(raw);
  return true;
}

}  // namespace

/**
 * Read an integer tag whatever integer width it was stored with.
 *
 * @param aln          alignment to query
 * @param tag_name     tag to read
 * @param destination  receives the value converted to int; must point to a valid int
 * @return true when the tag exists and was read; false when the tag is absent or
 *         cannot be extracted. A tag stored with a non-integer type is reported
 *         through PrintMessageDieOnError.
 */
bool ReadContainer::GetIntBamTag(const BamTools::BamAlignment& aln, const std::string& tag_name, int* destination) {
  char tag_type;
  if (!aln.GetTagType(tag_name, tag_type)) {
    return false;
  }

  switch (tag_type) {
    case BamTools::Constants::BAM_TAG_TYPE_INT32:
      return aln.GetTag(tag_name, *destination);
    case BamTools::Constants::BAM_TAG_TYPE_INT8:
      return ReadIntTagAs<int8_t>(aln, tag_name, destination);
    case BamTools::Constants::BAM_TAG_TYPE_UINT8:
      return ReadIntTagAs<uint8_t>(aln, tag_name, destination);
    case BamTools::Constants::BAM_TAG_TYPE_INT16:
      return ReadIntTagAs<int16_t>(aln, tag_name, destination);
    case BamTools::Constants::BAM_TAG_TYPE_UINT16:
      return ReadIntTagAs<uint16_t>(aln, tag_name, destination);
    case BamTools::Constants::BAM_TAG_TYPE_UINT32:
      return ReadIntTagAs<uint32_t>(aln, tag_name, destination);
    default:
      break;
  }

  // Any other storage type is unsupported.
  stringstream msg;
  msg << "Encountered unsupported tag type " << tag_type;
  PrintMessageDieOnError(msg.str(), ERROR);
  return false;
}
bool getTagParanoid(BamTools::BamAlignment &alignment, const std::string &tag, int64_t &value) { char tagType = ' '; if(alignment.GetTagType(tag, tagType)) { switch(tagType) { case BamTools::Constants::BAM_TAG_TYPE_INT8: { int8_t value_int8 = 0; alignment.GetTag(tag, value_int8); value = value_int8; } break; case BamTools::Constants::BAM_TAG_TYPE_UINT8: { uint8_t value_uint8 = 0; alignment.GetTag(tag, value_uint8); value = value_uint8; } break; case BamTools::Constants::BAM_TAG_TYPE_INT16: { int16_t value_int16 = 0; alignment.GetTag(tag, value_int16); value = value_int16; } break; case BamTools::Constants::BAM_TAG_TYPE_UINT16: { uint16_t value_uint16 = 0; alignment.GetTag(tag, value_uint16); value = value_uint16; } break; case BamTools::Constants::BAM_TAG_TYPE_INT32: { int32_t value_int32 = 0; alignment.GetTag(tag, value_int32); value = value_int32; } break; case BamTools::Constants::BAM_TAG_TYPE_UINT32: { uint32_t value_uint32 = 0; alignment.GetTag(tag, value_uint32); value = value_uint32; } break; default: { alignment.GetTag(tag, value); } break; } return(true); } else { return(false); } }
void Config::InitializationClustering() { struct stat st; if(stat(Workspace.c_str(),&st) == 0 and st.st_mode and S_IFDIR != 0) Log("[Warning] Workspace directory already present"); else if (mkdir(Workspace.c_str(), 0755) != 0) { Log("[Error] Could not create workspace directory: " + Workspace); exit(1); } RunningTasksFile = Workspace + "/" + FilePrefix + "running.tasks"; StatsFile = Workspace + "/" + FilePrefix + "stats"; BinClusterFile = Workspace + "/" + FilePrefix + "bpc"; clusterFile = new ClusterFile(BinClusterFile); clusterDir = Workspace + "/clusters/"; if(stat(clusterDir.c_str(),&st) == 0 and st.st_mode and S_IFDIR != 0) Log("[Warning] Cluster directory already present"); else if (mkdir(clusterDir.c_str(), 0755) != 0) { Log("[Error] Could not create cluster directory: " + clusterDir); exit(1); } insertsizeDir = Workspace + "/insertsize/"; if(stat(insertsizeDir.c_str(),&st) == 0 and st.st_mode and S_IFDIR != 0) Log("[Warning] Insertsize directory already present"); else if (mkdir(insertsizeDir.c_str(), 0755) != 0) { Log("[Error] Could not create insertsize directory: " + insertsizeDir); exit(1); } coverageDir = Workspace + "/coverage/"; if(stat(coverageDir.c_str(),&st) == 0 and st.st_mode and S_IFDIR != 0) Log("[Warning] Coverage directory already present"); else if (mkdir(coverageDir.c_str(), 0755) != 0) { Log("[Error] Could not create coverage directory: " + coverageDir); exit(1); } if (!ForwardBam.empty() && !ReverseBam.empty() && PairedBam.empty()) { UsePairedBam = false; } else if (ForwardBam.empty() && ReverseBam.empty() && !PairedBam.empty()) { UsePairedBam = true; } else { Log("[Error] No correct bam file(s)"); exit(1); } BamTools::BamAlignment alignment; BamTools::BamReader BamReader; if (UsePairedBam) { BamReader.Open(PairedBam); if (not BamReader.IsOpen()) { Log("[Error] Could not open paired bam"); exit(1); } if (PairedIndex.empty()) { if (not BamReader.LocateIndex(BamTools::BamIndex::STANDARD)) { PairedIndex = 
PairedBam.substr(0,PairedBam.find_last_of(".bam")-3) + ".bai"; BamReader.OpenIndex(PairedIndex); } if (not BamReader.HasIndex()) { Log("[Error] No index for bamfile"); exit(1); } } BamTools::SamHeader header = BamReader.GetHeader(); for (BamTools::SamReadGroupIterator it = header.ReadGroups.Begin(); it != header.ReadGroups.End(); it++) { BamTools::SamReadGroup* readgroup = &*it; readNameConverter.TrimName(readgroup->ID); readNameConverter.AddReadGroup(readgroup->ID); } long int count = 0; while (BamReader.GetNextAlignment(alignment)) { string RG; if (alignment.GetTag("RG", RG)) { if (not NameTrim.empty()) readNameConverter.TrimName(RG); if (readNameConverter.AddReadGroup(RG)) { Log("[Warning] Readgroup '" + RG + "' found in reads but not in header"); count = 0; } } count++; if (count > 10000) break; } BamReader.Close(); } else { BamReader.Open(ForwardBam); if (not BamReader.IsOpen()) { Log("[Error] Could not open first/forward bam"); exit(1); } if (ForwardIndex.empty()) { if (not BamReader.LocateIndex(BamTools::BamIndex::STANDARD)) { ForwardIndex = ForwardBam.substr(0,ForwardBam.find_last_of(".bam")-3) + ".bai"; BamReader.OpenIndex(ForwardIndex); } if (not BamReader.HasIndex()) { Log("[Error] No index for forward bamfile"); exit(1); } } BamTools::SamHeader forwardheader = BamReader.GetHeader(); for (BamTools::SamReadGroupIterator it = forwardheader.ReadGroups.Begin(); it != forwardheader.ReadGroups.End(); it++) { BamTools::SamReadGroup* readgroup = &*it; readNameConverter.TrimName(readgroup->ID); readNameConverter.AddReadGroup(readgroup->ID); } long int count = 0; while (BamReader.GetNextAlignment(alignment)) { string RG; if (alignment.GetTag("RG", RG)) { if (!NameTrim.empty()) readNameConverter.TrimName(RG); if (readNameConverter.AddReadGroup(RG)) { Log("[Warning] Readgroup '" + RG + "' found in forward reads but not in header"); count = 0; } } count++; if (count > 10000) break; } BamReader.Close(); BamReader.Open(ReverseBam); if (not BamReader.IsOpen()) { 
Log("[Error] Could not open second/reverse bam"); exit(1); } if (ReverseIndex.empty()) { if (not BamReader.LocateIndex(BamTools::BamIndex::STANDARD)) { ReverseIndex = ReverseBam.substr(0,ReverseBam.find_last_of(".bam")-3) + ".bai"; BamReader.OpenIndex(ReverseIndex); } if (not BamReader.HasIndex()) { Log("[Error] No index for reverse bamfile"); exit(1); } } BamTools::SamHeader reverseheader = BamReader.GetHeader(); for (BamTools::SamReadGroupIterator it = reverseheader.ReadGroups.Begin(); it != reverseheader.ReadGroups.End(); it++) { BamTools::SamReadGroup* readgroup = &*it; readNameConverter.TrimName(readgroup->ID); if (readNameConverter.AddReadGroup(readgroup->ID)) { Log("[Warning] Readgroup '" + readgroup->ID + "' found in reverse but not in forward"); } } count = 0; while (BamReader.GetNextAlignment(alignment)) { string RG; if (alignment.GetTag("RG", RG)) { if (!NameTrim.empty()) readNameConverter.TrimName(RG); if (readNameConverter.AddReadGroup(RG)) { Log("[Warning] Readgroup '" + RG + "' found in reverse reads but not in header"); count = 0; } } count++; if (count > 10000) break; } BamReader.Close(); } for(map<string, int>::iterator it = readNameConverter.ReadGroups.begin(); it!=readNameConverter.ReadGroups.end(); ++it) { ostringstream logBuffer; logBuffer << "Readgroup found: " << it->second << " - " << it->first; Log(logBuffer.str()); } writeConfigFile(Workspace + FilePrefix + "config"); }
/**
 * Collect read-level coverage statistics for one variant.
 *
 * A reference haplotype is cut around the variant (flanks clamped to the
 * chromosome) and a variant haplotype is built by substituting the alternate
 * allele. Reads from the region set at the variant position are gathered;
 * up to opt::capAlignments of those with non-zero mapping quality are examined
 * after shuffling.
 *
 * @param pReader   BAM reader to query
 * @param record    variant being evaluated
 * @param refTable  reference sequences, looked up by record.refName
 * @return statistics with the median mapping quality (60 when no read was
 *         returned), the number of reads aligned at the variant, the number of
 *         evidence reads and, for SNVs, the base qualities of the evidence reads.
 *         Only a default-constructed value is returned when the reference name
 *         is unknown to the reader.
 */
CoverageStats getVariantCoverage(BamTools::BamReader* pReader, const VCFRecord& record, const ReadTable* refTable)
{
    CoverageStats stats;

    static const int flankingSize = 100;
    static const double minPercentIdentity = 95.0f;

    const bool is_snv = record.refStr.size() == 1 && record.varStr.size() == 1;

    // Grab the reference haplotype
    int eventLength = record.varStr.length();
    int zeroBasedPos = record.refPosition - 1;

    int start = zeroBasedPos - flankingSize - 1;
    start = start < 0 ? 0 : start;

    int end = zeroBasedPos + eventLength + 2 * flankingSize;
    const SeqItem& chr = refTable->getRead(record.refName);
    const int chrLength = (int)chr.seq.length();
    end = end > chrLength ? chrLength : end;

    std::string reference_haplotype = chr.seq.substr(start, end - start);
    int translatedPos = zeroBasedPos - start;

    // Ensure that the reference string at the variant matches the expected
    std::string variant_haplotype = reference_haplotype;
    assert(variant_haplotype.substr(translatedPos, record.refStr.length()) == record.refStr);
    variant_haplotype.replace(translatedPos, record.refStr.length(), record.varStr);

    // Grab all reads in reference region
    int refID = pReader->GetReferenceID(record.refName);
    if(refID < 0)
        return stats;

    int refStart = record.refPosition;
    int refEnd = record.refPosition;
    pReader->SetRegion(refID, refStart, refID, refEnd);

    // Every returned read contributes to the mapping-quality median, but only
    // reads with non-zero mapping quality are kept for the evidence scan.
    std::vector<double> mapping_quality;
    std::vector<BamTools::BamAlignment> alignments;
    BamTools::BamAlignment aln;
    while(pReader->GetNextAlignment(aln))
    {
        if(aln.MapQuality > 0)
            alignments.push_back(aln);
        mapping_quality.push_back(aln.MapQuality);
    }

    if(mapping_quality.empty())
        stats.median_mapping_quality = 60;
    else
        stats.median_mapping_quality = median(mapping_quality);

    // Shuffle and take only the first opt::capAlignments alignments
    std::random_shuffle(alignments.begin(), alignments.end());

    for(size_t i = 0; i < alignments.size() && i < opt::capAlignments; ++i)
    {
        BamTools::BamAlignment& alignment = alignments[i];
        VariantReadSegments segments = splitReadAtVariant(alignment, record);

        if(opt::verbose > 1)
        {
            fprintf(stderr, "var: %zu %s -> %s\n", record.refPosition, record.refStr.c_str(), record.varStr.c_str());
            fprintf(stderr, "pos: %d\n", alignment.Position);
            fprintf(stderr, "strand: %s\n", alignment.IsReverseStrand() ? "-" : "+");
            fprintf(stderr, "read: %s\n", alignment.QueryBases.c_str());
            fprintf(stderr, "qual: %s\n", alignment.Qualities.c_str());
            fprintf(stderr, "alnb: %s\n", alignment.AlignedBases.c_str());
            fprintf(stderr, "Pre: %s\n", segments.preSegment.c_str());
            fprintf(stderr, "Var: %s\n", segments.variantSegment.c_str());
            fprintf(stderr, "Pos: %s\n", segments.postSegment.c_str());
            fprintf(stderr, "PreQual: %s\n", segments.preQual.c_str());
            fprintf(stderr, "VarQual: %s\n", segments.variantQual.c_str());
            fprintf(stderr, "PosQual: %s\n", segments.postQual.c_str());
        }

        // The read must cover the variant and at least one flank
        const bool has_flank = segments.preSegment.size() > 0 || segments.postSegment.size() > 0;
        const bool aligned_at_variant = segments.variantSegment.size() > 0 && has_flank;
        if(!aligned_at_variant)
            continue;

        stats.n_total_reads += 1;
        if(segments.variantSegment == record.refStr)
            continue; // not an evidence read

        // Align the read to the reference and variant haplotype
        SequenceOverlap ref_overlap = Overlapper::computeOverlapAffine(alignment.QueryBases, reference_haplotype);
        SequenceOverlap var_overlap = Overlapper::computeOverlapAffine(alignment.QueryBases, variant_haplotype);

        const bool quality_alignment = (ref_overlap.getPercentIdentity() >= minPercentIdentity ||
                                        var_overlap.getPercentIdentity() >= minPercentIdentity);
        const bool is_evidence_read = quality_alignment && var_overlap.score > ref_overlap.score;
        if(!is_evidence_read)
            continue;

        stats.n_evidence_reads += 1;
        if(is_snv && segments.variantQual.size() == 1)
        {
            char qb = segments.variantQual[0];
            int q = Quality::char2phred(qb);
            stats.snv_evidence_quals.push_back(q);
        }
    }

    return stats;
}
/**
 * Update contig and per-region statistics for one alignment.
 *
 * @param aread   alignment being tracked
 * @param endPos  end position of the read; when 0, aread.GetEndPosition() is used
 *
 * The strand-specific total read counter of the current contig is always
 * incremented. When bit 0 of the ReadOnRegion result is set, every region that
 * overlaps the read gets its `overlaps` counter incremented, and one "best"
 * region is chosen to receive the read: with m_ampliconReads set, regions whose
 * 5' end distance is negative and closest to zero are preferred; among the
 * remaining candidates the largest overlap wins, and ties are broken by a
 * call counter (clockSeed) modulo the number of tied regions.
 */
void AmpliconRegionStatistics::TrackReadsOnRegion( const BamTools::BamAlignment &aread, uint32_t endPos )
{
  // pseudo-random number generator 'seed' for resolving equivalent read assignments
  static uint16_t clockSeed = 0;

  // check/set first region read overlaps
  uint32_t readSrt = aread.Position + 1;
  uint32_t readEnd = endPos ? endPos : aread.GetEndPosition();
  uint32_t covType = ReadOnRegion( aread.RefID, readSrt, readEnd );

  // maintain base method of tracking total reads
  TargetContig *contig = m_contigList[m_rcovContigIdx];
  bool isRev = aread.IsReverseStrand();
  if( isRev ) {
    ++contig->revReads;
  } else {
    ++contig->fwdReads;
  }

  // Tracking of reads on target
  if( covType & 1 ) {
    // iterate over all regions overlapping read...
    int32_t bestEndDist = -m_maxUpstreamPrimerStart;
    int32_t bestOverlap = 0;
    uint32_t numBestRegions = 0;
    bool haveBestEnd = false;
    for( TargetRegion *cur = m_rcovRegion; cur; cur = cur->next ) {
      if( readEnd < cur->trgSrt ) break;
      // Skip regions that end before the read starts. The test must use the
      // region being visited; it previously read m_rcovRegion->trgEnd, which
      // is the same for every iteration.
      if( readSrt > cur->trgEnd ) continue;

      // save stats for all overlapped reads
      ++(GetStats(cur)->overlaps);

      // find most likely AmpliSeq primed region of those overlapped
      // NOTE: can still be wrong for regions starting very close together, given 5' digestion uncertainty,
      // coupled with read length and digestion uncertainty at 3'
      int32_t dSrt = readSrt - cur->trgSrt;
      int32_t dEnd = cur->trgEnd - readEnd;
      int32_t endDist5p = isRev ? dEnd : dSrt;

      // for non-amplicon reads, ends are ignored and only maximum overlap is employed to distinguish target region
      if( m_ampliconReads ) {
        // always select region that is closest start before 5p primer
        if( endDist5p < 0 && endDist5p > bestEndDist ) {
          haveBestEnd = true;
          bestEndDist = endDist5p;
          bestOverlap = 0;  // force record best below
        } else if( haveBestEnd && endDist5p != bestEndDist ) {
          // region is not closer primed or same distance from false priming site
          continue;
        }
      }

      // save region based on max overlap for equivalent regions
      if( dSrt < 0 ) dSrt = 0;
      if( dEnd < 0 ) dEnd = 0;
      int32_t overlap = cur->trgEnd - cur->trgSrt - dSrt - dEnd; // +1
      if( overlap >= bestOverlap ) {
        // if overlaps also match then default to region starting most 3'
        // - cannot do better w/o knowing exact priming location, or possibly using ZA tag value
        if( overlap == bestOverlap ) {
          // stack multiple equivalent solutions
          if( numBestRegions >= m_regionStackSize ) {
            // safety code - only triggered if many targets overlapping read
            m_regionStackSize <<= 1;  // *2
            m_regionStack = (TargetRegion **)realloc( m_regionStack, m_regionStackSize * sizeof(TargetRegion *) );
          }
        } else {
          // save new best solution - these values are the same for all equivalent solutions
          bestOverlap = overlap;
          numBestRegions = 0;
        }
        m_regionStack[numBestRegions++] = cur;
      }
    }

    // pseudo-randomly choose best region of equivalent best regions
    TargetRegion *bestRegion = m_regionStack[ clockSeed % numBestRegions ];
    bool e2e_or_cov;
    if( m_sigFacCoverage ) {
      // coverage criterion: covered fraction of the target reaches the threshold
      int32_t trgLen = bestRegion->trgEnd - bestRegion->trgSrt + 1;
      e2e_or_cov = (double(bestOverlap+1)/trgLen >= m_sigFacCoverage);
    } else {
      // end-to-end criterion: neither read end falls short of the target by more than the limit
      int32_t dSrt = readSrt - bestRegion->trgSrt;
      int32_t dEnd = bestRegion->trgEnd - readEnd;
      if( dSrt < 0 ) dSrt = 0;
      if( dEnd < 0 ) dEnd = 0;
      e2e_or_cov = ((dSrt > dEnd ? dSrt : dEnd) <= m_maxE2eEndDist);
    }

    StatsData *stats = GetStats(bestRegion);
    if( isRev ) {
      ++contig->revTrgReads;
      ++stats->revReads;
      if( e2e_or_cov ) ++stats->rev_e2e;
    } else {
      ++contig->fwdTrgReads;
      ++stats->fwdReads;
      if( e2e_or_cov ) ++stats->fwd_e2e;
    }
  }
  ++clockSeed;
}
/**
 * Convert a BAM alignment into an AlignedRead and apply the configured filters.
 *
 * @param aln                  alignment to parse
 * @param aligned_read         receives the parsed fields
 * @param ref_ext_nucleotides  (chrom, STR start) -> extended reference sequence,
 *                             consulted by the end-match filters
 * @return true when the read was parsed and passed every filter; false when it is
 *         a second mate, is flagged partial (XP == 1), lacks the XD tag, or is
 *         rejected by a filter (the matching filter_counter entry is incremented)
 *
 * Missing RG, XS, XE, XR or XC tags, and a missing extended reference sequence,
 * are reported through PrintMessageDieOnError. Missing XQ, XM and XX tags
 * default to 0.
 */
bool ReadContainer::ParseRead(const BamTools::BamAlignment& aln, AlignedRead* aligned_read, map<pair<string,int>, string>& ref_ext_nucleotides) {
  // Fields copied straight from the alignment
  aligned_read->ID = aln.Name;
  aligned_read->nucleotides = aln.QueryBases;
  aligned_read->qualities = aln.Qualities;
  aligned_read->strand = aln.IsReverseStrand();
  aligned_read->chrom = references.at(aln.RefID).RefName;
  aligned_read->read_start = aln.Position;
  aligned_read->cigar_ops = aln.CigarData;

  // get if mate pair
  if (aln.IsSecondMate()) {
    aligned_read->mate = 1;
  } else {
    aligned_read->mate = 0;
  }
  // Second mates are not processed
  if (aligned_read->mate) {
    return false;
  }

  // Get all the tag data
  // don't process if partially spanning (from old lobSTR)
  int partial = 0;
  if (GetIntBamTag(aln, "XP", &partial)) {
    if (partial == 1) return false;
  }
  // get read group
  if (!GetStringBamTag(aln, "RG", &aligned_read->read_group)) {
    stringstream msg;
    msg << aln.Name << " Could not get read group.";
    PrintMessageDieOnError(msg.str(), ERROR);
  }
  // get msStart
  if (!GetIntBamTag(aln, "XS", &aligned_read->msStart)) {
    stringstream msg;
    msg << aln.Name << " from group " << aligned_read->read_group << " Could not get STR start coordinate. Did this bam file come from lobSTR?";
    PrintMessageDieOnError(msg.str(), ERROR);
  }
  // get msEnd
  if (!GetIntBamTag(aln, "XE", &aligned_read->msEnd)) {
    stringstream msg;
    msg << aln.Name << " from group " << aligned_read->read_group << " Could not get STR end coordinate. Did this bam file come from lobSTR?";
    PrintMessageDieOnError(msg.str(), ERROR);
  }
  // get mapq; defaults to 0 when the tag is absent
  if (!GetIntBamTag(aln, "XQ", &aligned_read->mapq)) {
    aligned_read->mapq = 0;
  }
  // get diff
  if (!GetIntBamTag(aln, "XD", &aligned_read->diffFromRef)) {
    return false;
  }
  // get mate dist
  if (!GetIntBamTag(aln, "XM", &aligned_read->matedist)) {
    aligned_read->matedist = 0;
  }
  // get STR seq
  if (!GetStringBamTag(aln, "XR", &aligned_read->repseq)) {
    stringstream msg;
    msg << aln.Name << " from group " << aligned_read->read_group << " Could not get repseq.";
    PrintMessageDieOnError(msg.str(), ERROR);
  }
  // get if stitched
  if (!GetIntBamTag(aln, "XX", &aligned_read->stitched)) {
    aligned_read->stitched = 0;
  }
  // get ref copy num
  if (!GetFloatBamTag(aln, "XC", &aligned_read->refCopyNum)) {
    stringstream msg;
    msg << aln.Name << " from group " << aligned_read->read_group << " Could not get reference copy number.";
    PrintMessageDieOnError(msg.str(), ERROR);
  }
  // get period
  aligned_read->period = aligned_read->repseq.length();

  if (include_flank) {
    // diff is just sum of differences in cigar
    CIGAR_LIST cigar_list;
    for (vector<BamTools::CigarOp>::const_iterator it = aligned_read->cigar_ops.begin();
         it != aligned_read->cigar_ops.end(); it++) {
      CIGAR cig;
      cig.num = (*it).Length;
      cig.cigar_type = (*it).Type;
      cigar_list.cigars.push_back(cig);
    }
    bool added_s;
    bool cigar_had_s;
    cigar_list.ResetString();
    GenerateCorrectCigar(&cigar_list, aln.QueryBases, &added_s, &cigar_had_s);
    aligned_read->diffFromRef = GetSTRAllele(cigar_list);
  }

  // apply filters
  if (unit) {
    // An empty repeat sequence gives period 0; reject such a read here instead
    // of taking a remainder by zero.
    if (aligned_read->period == 0 || aligned_read->diffFromRef % aligned_read->period != 0){
      filter_counter.increment(FilterCounter::NOT_UNIT);
      return false;
    }
  }
  if (abs(aligned_read->diffFromRef) > max_diff_ref) {
    filter_counter.increment(FilterCounter::DIFF_FROM_REF);
    return false;
  }
  if (aligned_read->mapq > max_mapq) {
    filter_counter.increment(FilterCounter::MAPPING_QUALITY);
    return false;
  }
  if (aligned_read->matedist > max_matedist) {
    filter_counter.increment(FilterCounter::MATE_DIST);
    return false;
  }
  // Check if the allele length is valid
  if (aligned_read->diffFromRef + (aligned_read->refCopyNum*aligned_read->period) < MIN_ALLELE_SIZE) {
    filter_counter.increment(FilterCounter::ALLELE_SIZE);
    return false;
  }

  // check that read sufficiently spans STR
  int max_read_start = aligned_read->msStart - min_border;
  int min_read_stop  = aligned_read->msEnd + min_border;
  if (aln.Position > max_read_start || aln.GetEndPosition() < min_read_stop){
    filter_counter.increment(FilterCounter::SPANNING_AMOUNT);
    return false;
  }

  // check that both ends of the read contain sufficient perfect matches
  if (min_read_end_match > 0){
    map<pair<string,int>, string>::iterator loc_iter = ref_ext_nucleotides.find(pair<string,int>(aligned_read->chrom, aligned_read->msStart));
    if (loc_iter == ref_ext_nucleotides.end())
      PrintMessageDieOnError("No extended reference sequence found for locus", ERROR);
    string ref_ext_seq = loc_iter->second;
    pair<int,int> num_end_matches = AlignmentFilters::GetNumEndMatches(aligned_read, ref_ext_seq, aligned_read->msStart-extend);
    if (num_end_matches.first < min_read_end_match || num_end_matches.second < min_read_end_match){
      filter_counter.increment(FilterCounter::NUM_END_MATCHES);
      return false;
    }
  }

  // check that the prefix and suffix of the read match maximally compared to proximal reference locations
  if (maximal_end_match_window > 0){
    map<pair<string,int>, string>::iterator loc_iter = ref_ext_nucleotides.find(pair<string,int>(aligned_read->chrom, aligned_read->msStart));
    if (loc_iter == ref_ext_nucleotides.end())
      PrintMessageDieOnError("No extended reference sequence found for locus", ERROR);
    string ref_ext_seq = loc_iter->second;
    bool maximum_end_matches = AlignmentFilters::HasLargestEndMatches(aligned_read, ref_ext_seq, aligned_read->msStart-extend, maximal_end_match_window, maximal_end_match_window);
    if (!maximum_end_matches){
      filter_counter.increment(FilterCounter::NOT_MAXIMAL_END);
      return false;
    }
  }

  // check that both ends of the aligned read have sufficient bases before the first indel
  if (min_bp_before_indel > 0){
    pair<int, int> num_bps = AlignmentFilters::GetEndDistToIndel(aligned_read);
    if (num_bps.first != -1 && num_bps.first < min_bp_before_indel){
      filter_counter.increment(FilterCounter::BP_BEFORE_INDEL);
      return false;
    }
    if (num_bps.second != -1 && num_bps.second < min_bp_before_indel){
      filter_counter.increment(FilterCounter::BP_BEFORE_INDEL);
      return false;
    }
  }

  filter_counter.increment(FilterCounter::UNFILTERED);
  return true;
}
// Returns true if the paired reads are a short-insert pair bool filterByGraph(StringGraph* pGraph, const BamTools::RefVector& referenceVector, BamTools::BamAlignment& record1, BamTools::BamAlignment& record2) { std::string vertexID1 = referenceVector[record1.RefID].RefName; std::string vertexID2 = referenceVector[record2.RefID].RefName; // Get the vertices for this pair using the mapped IDs Vertex* pX = pGraph->getVertex(vertexID1); Vertex* pY = pGraph->getVertex(vertexID2); // Ensure that the vertices are found assert(pX != NULL && pY != NULL); #ifdef DEBUG_CONNECT std::cout << "Finding path from " << vertexID1 << " to " << vertexID2 << "\n"; #endif EdgeDir walkDirectionXOut = ED_SENSE; EdgeDir walkDirectionYIn = ED_SENSE; // Flip walk directions if the alignment is to the reverse strand if(record1.IsReverseStrand()) walkDirectionXOut = !walkDirectionXOut; if(record2.IsReverseStrand()) walkDirectionYIn = !walkDirectionYIn; int fromX = walkDirectionXOut == ED_SENSE ? record1.Position : record1.GetEndPosition(); int toY = walkDirectionYIn == ED_SENSE ? record2.Position : record2.GetEndPosition(); // Calculate the amount of contig X that already covers the fragment // Using this number, we calculate how far we should search int coveredX = walkDirectionXOut == ED_SENSE ? 
pX->getSeqLen() - fromX : fromX; int maxWalkDistance = opt::maxDistance - coveredX; bool bShortInsertPair = false; if(pX == pY) { if(abs(record1.InsertSize) < opt::maxDistance) bShortInsertPair = true; } else { SGWalkVector walks; SGSearch::findWalks(pX, pY, walkDirectionXOut, maxWalkDistance, 10000, true, walks); if(!walks.empty()) { for(size_t i = 0; i < walks.size(); ++i) { std::string fragment = walks[i].getFragmentString(pX, pY, fromX, toY, walkDirectionXOut, walkDirectionYIn); if((int)fragment.size() < opt::maxDistance) { bShortInsertPair = true; //std::cout << "Found completing fragment (" << pX->getID() << " -> " << pY->getID() << ": " << fragment.size() << "\n"; break; } } } } return bShortInsertPair; }
void ReadContainer::AddReadsFromFile(const ReferenceSTR& ref_str) { if (ref_str.chrom != "NA") { int refid = -1; if (chrom_to_refid.find(ref_str.chrom) != chrom_to_refid.end()) { refid = chrom_to_refid.at(ref_str.chrom); } if (refid == -1) { PrintMessageDieOnError("Could not locate STR reference chromosome in bam file", ERROR); } BamTools::BamRegion bam_region(refid, ref_str.start-extend, refid, ref_str.stop+extend); if (!reader.SetRegion(bam_region)) { PrintMessageDieOnError("Could not set bam region", ERROR); } } BamTools::BamAlignment aln; while (reader.GetNextAlignment(aln)) { AlignedRead aligned_read; // get read ID aligned_read.ID = aln.Name; // get nucleotides aligned_read.nucleotides = aln.QueryBases; // get qualities aligned_read.qualities = aln.Qualities; // get strand aligned_read.strand = aln.IsReverseStrand(); // get chrom aligned_read.chrom = references.at(aln.RefID).RefName; // get read start aligned_read.read_start = aln.Position; // get cigar aligned_read.cigar_ops = aln.CigarData; // get if mate pair if (aln.IsSecondMate()) { aligned_read.mate = 1; } else { aligned_read.mate = 0; } // Only process if it is the primary alignment if (aligned_read.mate) { continue; } // Get all the tag data // don't process if partially spanning (from old lobSTR) int partial = 0; if (GetIntBamTag(aln, "XP", &partial)) { if (partial == 1) continue; } // get read group if (!GetStringBamTag(aln, "RG", &aligned_read.read_group)) { stringstream msg; msg << aln.Name << " Could not get read group."; PrintMessageDieOnError(msg.str(), ERROR); } // get msStart if (!GetIntBamTag(aln, "XS", &aligned_read.msStart)) { stringstream msg; msg << aln.Name << " from group " << aligned_read.read_group << " Could not get STR start coordinate. 
Did this bam file come from lobSTR?"; PrintMessageDieOnError(msg.str(), ERROR); } // get msEnd if (!GetIntBamTag(aln, "XE", &aligned_read.msEnd)) { stringstream msg; msg << aln.Name << " from group " << aligned_read.read_group << " Could not get STR end coordinate. Did this bam file come from lobSTR?"; PrintMessageDieOnError(msg.str(), ERROR); } // get mapq. Try unsigned/signed if (!GetIntBamTag(aln, "XQ", &aligned_read.mapq)) { stringstream msg; aligned_read.mapq = 0; } // get diff if (!GetIntBamTag(aln, "XD", &aligned_read.diffFromRef)) { if (aligned_read.mate == 0) { stringstream msg; msg << aln.Name << " from group " << aligned_read.read_group << " Could not get genotype."; PrintMessageDieOnError(msg.str(), ERROR); } continue; } // get mate dist if (!GetIntBamTag(aln, "XM", &aligned_read.matedist)) { aligned_read.matedist = 0; } // get STR seq if (!GetStringBamTag(aln, "XR", &aligned_read.repseq)) { stringstream msg; msg << aln.Name << " from group " << aligned_read.read_group << " Could not get repseq."; PrintMessageDieOnError(msg.str(), ERROR); } // get if stitched if (!GetIntBamTag(aln, "XX", &aligned_read.stitched)) { aligned_read.stitched = 0; } // get ref copy num if (!GetFloatBamTag(aln, "XC", &aligned_read.refCopyNum)) { stringstream msg; msg << aln.Name << " from group " << aligned_read.read_group << " Could not get reference copy number."; PrintMessageDieOnError(msg.str(), ERROR); } // get period aligned_read.period = aligned_read.repseq.length(); if (include_flank) { // diff is just sum of differences in cigar CIGAR_LIST cigar_list; for (vector<BamTools::CigarOp>::const_iterator it = aligned_read.cigar_ops.begin(); it != aligned_read.cigar_ops.end(); it++) { CIGAR cig; cig.num = (*it).Length; cig.cigar_type = (*it).Type; cigar_list.cigars.push_back(cig); } bool added_s; bool cigar_had_s; cigar_list.ResetString(); GenerateCorrectCigar(&cigar_list, aln.QueryBases, &added_s, &cigar_had_s); aligned_read.diffFromRef = GetSTRAllele(cigar_list); } // apply 
filters if (unit) { if (aligned_read.diffFromRef % aligned_read.period != 0) continue; } if (abs(aligned_read.diffFromRef) > max_diff_ref) { continue; } if (aligned_read.mapq > max_mapq) { continue; } if (aligned_read.matedist > max_matedist) { continue; } // Add to map pair<string, int> coord (aligned_read.chrom, aligned_read.msStart); if (aligned_str_map_.find(coord) != aligned_str_map_.end()) { aligned_str_map_.at(coord).push_back(aligned_read); } else { list<AlignedRead> aligned_read_list; aligned_read_list.push_back(aligned_read); aligned_str_map_.insert(pair< pair<string, int>, list<AlignedRead> > (coord, aligned_read_list)); } } }
// Function to fill in predicted signal values void BaseHypothesisEvaluator(BamTools::BamAlignment &alignment, const string &flow_order_str, const string &alt_base_hyp, float &delta_score, float &fit_score, int heavy_verbose) { // --- Step 1: Initialize Objects and retrieve relevant tags delta_score = 1e5; fit_score = 1e5; vector<string> Hypotheses(2); vector<float> measurements, phase_params; int start_flow, num_flows, prefix_flow=0; if (not GetBamTags(alignment, flow_order_str.length(), measurements, phase_params, start_flow)) return; num_flows = measurements.size(); ion::FlowOrder flow_order(flow_order_str, num_flows); BasecallerRead master_read; master_read.SetData(measurements, flow_order.num_flows()); TreephaserLite treephaser(flow_order); treephaser.SetModelParameters(phase_params[0], phase_params[1]); // --- Step 2: Solve beginning of the read // Look at mapped vs. unmapped reads in BAM Hypotheses[0] = alignment.QueryBases; Hypotheses[1] = alt_base_hyp; // Safety: reverse complement reverse strand reads in mapped bam if (alignment.IsMapped() and alignment.IsReverseStrand()) { RevComplementInPlace(Hypotheses[0]); RevComplementInPlace(Hypotheses[1]); } prefix_flow = GetMasterReadPrefix(treephaser, flow_order, start_flow, Hypotheses[0], master_read); unsigned int prefix_size = master_read.sequence.size(); // --- Step 3: creating predictions for the individual hypotheses vector<BasecallerRead> hypothesesReads(Hypotheses.size()); vector<float> squared_distances(Hypotheses.size(), 0.0); int max_last_flow = 0; for (unsigned int i_hyp=0; i_hyp<hypothesesReads.size(); ++i_hyp) { hypothesesReads[i_hyp] = master_read; // --- add hypothesis sequence to clipped prefix unsigned int i_base = 0; int i_flow = prefix_flow; while (i_base<Hypotheses[i_hyp].length() and i_base<(2*(unsigned int)flow_order.num_flows()-prefix_size)) { while (i_flow < flow_order.num_flows() and flow_order.nuc_at(i_flow) != Hypotheses[i_hyp][i_base]) i_flow++; if (i_flow < flow_order.num_flows() and 
i_flow > max_last_flow) max_last_flow = i_flow; if (i_flow >= flow_order.num_flows()) break; // Add base to sequence only if it fits into flow order hypothesesReads[i_hyp].sequence.push_back(Hypotheses[i_hyp][i_base]); i_base++; } i_flow = min(i_flow, flow_order.num_flows()-1); // Solver simulates beginning of the read and then fills in the remaining clipped bases for which we have flow information treephaser.Solve(hypothesesReads[i_hyp], num_flows, i_flow); } // Compute L2-distance of measurements and predictions for (unsigned int i_hyp=0; i_hyp<hypothesesReads.size(); ++i_hyp) { for (int iFlow=0; iFlow<=max_last_flow; iFlow++) squared_distances[i_hyp] += (measurements.at(iFlow) - hypothesesReads[i_hyp].prediction.at(iFlow)) * (measurements.at(iFlow) - hypothesesReads[i_hyp].prediction.at(iFlow)); } // Delta: L2-distance of alternative base Hypothesis - L2-distance of bases as called delta_score = squared_distances.at(1) - squared_distances.at(0); fit_score = min(squared_distances.at(1), squared_distances.at(0)); // --- verbose --- if (heavy_verbose > 1 or (delta_score < 0 and heavy_verbose > 0)) { cout << "Processed read " << alignment.Name << endl; cout << "Delta Fit: " << delta_score << " Overall Fit: " << fit_score << endl; PredictionGenerationVerbose(Hypotheses, hypothesesReads, phase_params, flow_order, start_flow, prefix_size); } }