static void addReadToDepthEst( const bam_record& bamRead, const pos_t beginPos, std::vector<unsigned>& depth) { using namespace ALIGNPATH; const pos_t endPos(beginPos+depth.size()); // get cigar: path_t apath; bam_cigar_to_apath(bamRead.raw_cigar(), bamRead.n_cigar(), apath); pos_t refPos(bamRead.pos()-1); BOOST_FOREACH(const path_segment& ps, apath) { if (refPos>=endPos) return; if (MATCH == ps.type) { for (pos_t pos(refPos); pos < (refPos+static_cast<pos_t>(ps.length)); ++pos) { if (pos>=beginPos) { if (pos>=endPos) return; depth[pos-beginPos]++; } } } if (is_segment_type_ref_length(ps.type)) refPos += ps.length; } }
/// Process one cleared (pre-filtered) BAM record as possible reference-allele
/// pair support.
///
/// The record is skipped unless its start position intersects the scoring
/// interval and its template size falls within the configured fragment-size
/// window. Surviving fragments must also overlap the SV center position by at
/// least pairOpt.minFragSupport before being recorded as reference-allele
/// evidence for breakpoint `isBp1`.
void
SVScorePairRefProcessor::
processClearedRecord(
    const bam_record& bamRead)
{
    using namespace illumina::common;

    assert(bamParams.isSet);

    // zero-indexed reference position of the read start:
    const pos_t refPos(bamRead.pos()-1);
    if (! bamParams.interval.range.is_pos_intersect(refPos)) return;

    const bool isLargeInsert(isLargeInsertSV(sv));

#ifdef DEBUG_MEGAPAIR
    log_os << __FUNCTION__ << ": read: " << bamRead << "\n";
#endif

    /// check if fragment is too big or too small:
    const int templateSize(std::abs(bamRead.template_size()));
    if (templateSize < bamParams.minFrag) return;
    if (templateSize > bamParams.maxFrag) return;

    // count only from the down stream reads
    const bool isFirstBamRead(isFirstRead(bamRead));

    // get fragment range:
    // for the downstream read, the fragment begins at the mate's position:
    pos_t fragBeginRefPos(refPos);
    if (! isFirstBamRead)
    {
        fragBeginRefPos=bamRead.mate_pos()-1;
    }

    const pos_t fragEndRefPos(fragBeginRefPos+templateSize);

    // an inverted range indicates a malformed record:
    if (fragBeginRefPos > fragEndRefPos)
    {
        std::ostringstream oss;
        oss << "ERROR: Failed to parse fragment range from bam record. Frag begin,end: " << fragBeginRefPos << " " << fragEndRefPos << " bamRecord: " << bamRead << "\n";
        BOOST_THROW_EXCEPTION(LogicException(oss.str()));
    }

    {
        // fragOverlap is the smaller of the distances from the SV center
        // position to either fragment end; require at least the configured
        // minimum support distance:
        const pos_t fragOverlap(std::min((1+svParams.centerPos-fragBeginRefPos), (fragEndRefPos-svParams.centerPos)));
#ifdef DEBUG_MEGAPAIR
        log_os << __FUNCTION__ << ": frag begin/end/overlap: " << fragBeginRefPos << " " << fragEndRefPos << " " << fragOverlap << "\n";
#endif
        if (fragOverlap < pairOpt.minFragSupport) return;
    }

    // record this fragment as reference-allele pair evidence for this sample:
    SVFragmentEvidence& fragment(evidence.getSampleEvidence(bamParams.bamIndex)[bamRead.qname()]);

    static const bool isShadow(false);

    SVFragmentEvidenceRead& evRead(fragment.getRead(bamRead.is_first()));
    setReadEvidence(svParams.minMapQ, svParams.minTier2MapQ, bamRead, isShadow, evRead);

    setAlleleFrag(*bamParams.fragDistroPtr, templateSize, fragment.ref.getBp(isBp1),isLargeInsert);
}
/// Test whether the read's strand is compatible with the requested
/// mate-insertion search direction(s).
///
/// A forward-strand read can only support a right-open search and a
/// reverse-strand read can only support a left-open search.
bool
isMateInsertionEvidenceCandidate2(
    const bam_record& bamRead,
    const bool isSearchForLeftOpen,
    const bool isSearchForRightOpen)
{
    // select the search flag matching this read's strand:
    return (bamRead.is_fwd_strand() ? isSearchForRightOpen : isSearchForLeftOpen);
}
void addRead( const bam_record& bamRead) { if (_isRegionInit) { if (bamRead.pos() > _endPos + 1000) { _maxPos=_endPos; setNewRegion(); } } if (! _isRegionInit) { _minPos=bamRead.pos(); _maxPos=bamRead.pos(); _endPos=bamRead.pos() + bamRead.read_size(); _isRegionInit=true; } else { if (bamRead.pos() > _maxPos) { _maxPos = bamRead.pos(); _endPos=bamRead.pos() + bamRead.read_size(); } } _count++; _totalReadLength += bamRead.read_size(); }
/// Classify the fragment size implied by this read pair into a
/// FragmentSizeType category using the given read group's statistics.
FragmentSizeType::index_t
SVLocusScanner::
_getFragmentSizeType(
    const bam_record& bamRead,
    const unsigned defaultReadGroupIndex) const
{
    using namespace FragmentSizeType;

    // a pair mapping to two different chromosomes is always distant:
    const bool isInterChrom(bamRead.target_id() != bamRead.mate_target_id());
    if (isInterChrom) return DISTANT;

    const int32_t absFragSize(std::abs(bamRead.template_size()));
    return classifySize(_stats[defaultReadGroupIndex], absFragSize);
}
void addRead( const bam_record& bamRead) { const pos_t pos(bamRead.pos()-1); const unsigned rsize(bamRead.read_size()); if (! _isRegionInit) { _maxPos=pos; _isRegionInit=true; } for (; _maxPos<pos; ++_maxPos) flushPos(_maxPos); _depth.inc(pos,rsize); _count++; }
/// Scan a single read (without consulting its mate) for SV evidence and
/// append any resulting candidates.
///
/// Candidates come from three sources: large indels in the read alignment,
/// SA-tag split alignments, and — when small-candidate generation is
/// enabled — semi-aligned read ends. Split and semi-aligned evidence are
/// mutually exclusive per read (see comment below).
static
void
getSingleReadSVCandidates(
    const ReadScannerOptions& opt,
    const ReadScannerDerivOptions& dopt,
    const bam_record& localRead,
    const SimpleAlignment& localAlign,
    const chromMap_t& chromToIndex,
    const reference_contig_segment& refSeq,
    std::vector<SVObservation>& candidates)
{
    using namespace illumina::common;

    // identify which end of the fragment this read represents:
    const bool isRead2(localRead.is_paired() && (localRead.read_no() == 2));
    const FRAGSOURCE::index_t fragSource(isRead2 ? FRAGSOURCE::READ2 : FRAGSOURCE::READ1);

    // - process any large indels in the localRead:
    getSVCandidatesFromReadIndels(opt, dopt, localAlign, fragSource, candidates);
#ifdef DEBUG_SCANNER
    log_os << __FUNCTION__ << ": post-indels candidate_size: " << candidates.size() << "\n";
#endif

    // a read can provide SA split evidence or semi-aligned/soft-clip, but not both.
    // this prevents split reads from triggering spurious local assembles. It is
    // possible for a read to genuinely contain evidence of both, but this should
    // be very rare.
    if (localRead.isSASplit())
    {
        getSACandidatesFromRead(opt, dopt, localRead, localAlign, fragSource, chromToIndex,
                                candidates);
#ifdef DEBUG_SCANNER
        log_os << __FUNCTION__ << ": post-split read candidate_size: " << candidates.size() << "\n";
#endif
    }
    else
    {
        if (dopt.isSmallCandidates)
        {
            getSVCandidatesFromSemiAligned(opt, dopt, localRead, localAlign, fragSource, refSeq,
                                           candidates);
        }
#ifdef DEBUG_SCANNER
        log_os << __FUNCTION__ << ": post-semialigned candidate_size: " << candidates.size() << "\n";
#endif
    }
}
/// Append the given string to the read as a 'Z'-type "SA" (supplementary
/// alignment) aux tag.
void
addSupplementaryAlignmentEvidence(
    bam_record& bamRead,
    const std::string& svStr)
{
    static const char svtag[] = {'S','A'};

    // a 'Z'-type tag's length must include the terminating null:
    const int tagDataLen(svStr.size()+1);
    bam_aux_append(bamRead.get_data(), svtag, 'Z', tagDataLen,
                   reinterpret_cast<const uint8_t*>(svStr.c_str()));
}
/// get SV candidates from SA-tag split-read alignment static SVObservation GetSplitSACandidate( const ReadScannerDerivOptions& dopt, const bam_record& localRead, const SimpleAlignment& localAlign, const SimpleAlignment& remoteAlign, const FRAGSOURCE::index_t fragSource) { using namespace SVEvidenceType; static const index_t svSource(SPLIT_ALIGN); SVObservation sv; sv.evtype = svSource; sv.fragSource = fragSource; SVBreakend& localBreakend(sv.bp1); SVBreakend& remoteBreakend(sv.bp2); // use single-side evidence, have to read the supp read to get the // reverse edge. this protects against double-count: localBreakend.lowresEvidence.add(svSource); updateSABreakend(dopt, localAlign, localBreakend); updateSABreakend(dopt, remoteAlign, remoteBreakend); // If the local (bp1) alignment is split downstream (on the right side) then this read goes from bp1 -> bp2. // If it is a forward read (e.g. read1 on + strand), this means it's a forward read for this event. const bool isSplitDownstream(isSplitOpenDownstream(localAlign.path)); const bool isReadFw = (localRead.is_first() == localRead.is_fwd_strand()); if (dopt.isStranded) { if (isReadFw == isSplitDownstream) { sv.fwReads += 1; } else { sv.rvReads += 1; } } return sv; }
/// extract poorly aligned read ends (semi-aligned and/or soft-clipped) /// to internal candidate format static void getSVCandidatesFromSemiAligned( const ReadScannerOptions& opt, const ReadScannerDerivOptions& dopt, const bam_record& bamRead, const SimpleAlignment& bamAlign, const FRAGSOURCE::index_t fragSource, const reference_contig_segment& refSeq, std::vector<SVObservation>& candidates) { unsigned leadingMismatchLen(0); unsigned trailingMismatchLen(0); pos_t leadingRefPos(0), trailingRefPos(0); getSVBreakendCandidateSemiAligned(bamRead, bamAlign, refSeq, dopt.isUseOverlappingPairs, leadingMismatchLen, leadingRefPos, trailingMismatchLen, trailingRefPos); if ((leadingMismatchLen + trailingMismatchLen) >= bamRead.read_size()) return; using namespace SVEvidenceType; static const index_t svSource(SEMIALIGN); // semi-aligned reads don't define a full hypothesis, so they're always evidence for a 'complex' ie. undefined, event // in a fashion analogous to clipped reads static const bool isComplex(true); if (leadingMismatchLen >= opt.minSemiAlignedMismatchLen) { const pos_t pos(leadingRefPos); candidates.push_back(GetSplitSVCandidate(dopt,bamRead.target_id(),pos,pos,svSource, fragSource,isComplex)); } if (trailingMismatchLen >= opt.minSemiAlignedMismatchLen) { const pos_t pos(trailingRefPos); candidates.push_back(GetSplitSVCandidate(dopt,bamRead.target_id(),pos,pos,svSource, fragSource,isComplex)); } }
bool SVLocusScanner:: isProperPair( const bam_record& bamRead, const unsigned defaultReadGroupIndex) const { if (! is_innie_pair(bamRead)) return false; const Range& ppr(_stats[defaultReadGroupIndex].properPair); const int32_t fragmentSize(std::abs(bamRead.template_size())); // we're seeing way to much large fragment garbage in cancers to use // vanilla proper pair criteria, push the max fragment size out a bit for now: static const float maxAnomFactor(1.5); if (fragmentSize > static_cast<int32_t>(maxAnomFactor*ppr.max)) return false; if (fragmentSize < ppr.min) return false; return true; }
/// Test whether the read provides any form of SV evidence, optionally
/// tallying per-category evidence counts.
///
/// Evidence categories are: anomalous (non-compressed) pair, SA split read,
/// local indel, and (when small candidates are enabled) semi-aligned read.
/// Split and semi-aligned evidence are mutually exclusive per read.
bool
SVLocusScanner::
isSVEvidence(
    const bam_record& bamRead,
    const unsigned defaultReadGroupIndex,
    const reference_contig_segment& refSeq,
    SVLocusEvidenceCount* incountsPtr) const
{
    // exclude innie read pairs which are anomalously short:
    const bool isAnom(isNonCompressedAnomalous(bamRead,defaultReadGroupIndex));
    const bool isSplit(bamRead.isSASplit());

    // NOTE(review): _bamAlign appears to be a reusable member-scratch
    // alignment filled in here — confirm it is safe for this const method.
    getAlignment(bamRead,_bamAlign);
    const bool isIndel(isLocalIndelEvidence(_bamAlign));
    const bool isAssm((_dopt.isSmallCandidates) && ((!isSplit) && isSemiAlignedEvidence(bamRead, _bamAlign, refSeq)));

    const bool isEvidence(isAnom || isSplit || isIndel || isAssm);

    if (nullptr != incountsPtr)
    {
        SVLocusEvidenceCount& incounts(*incountsPtr);
        incounts.total++;
        if (isAnom) incounts.anom++;
        if (isSplit) incounts.split++;
        if (isIndel) incounts.indel++;
        if (isAssm) incounts.assm++;

        if (! isEvidence) incounts.ignored++;

        if (isAnom)
        {
            if (isMateInsertionEvidenceCandidate(bamRead, getMinMapQ()))
            {
                // these counts are used to generate background noise rates in later candidate generation stages:
                incounts.remoteRecoveryCandidates++;
            }
        }
    }

    return isEvidence;
}
/// Test whether an unmapped read qualifies as a "shadow" read: a paired,
/// non-supplementary, unmapped read with a mapped mate, adequate average
/// base quality, and a qname matching the immediately preceding record.
///
/// NOTE(review): this function appears truncated in this view — the body
/// ends mid-debug-statement below and the closing brace / final return are
/// not visible; confirm against the full source file.
static
bool
isGoodShadow(
    const bam_record& bamRead,
    const std::string& lastQname)
{
#ifdef DEBUG_IS_SHADOW
    static const std::string logtag("isGoodShadow");
#endif

    if (! bamRead.is_paired()) return false;

    if (bamRead.isNonStrictSupplement()) return false;

    // sanity check that this is a shadow read:
    if (!bamRead.is_unmapped()) return false;
    if (bamRead.is_mate_unmapped()) return false;

    // reject shadows with low average base quality:
    static const unsigned minAvgQualShadow = 25;

    if (get_avg_quality(bamRead) < minAvgQualShadow)
    {
        return false;
    }

    if (strcmp(bamRead.qname(),lastQname.c_str()) != 0)
    {
        // something went wrong here, shadows should have their singleton partner
        // preceding them in the BAM file.
#ifdef DEBUG_IS_SHADOW
        log_os << logtag << " ERROR: Shadow without matching singleton : " << bamRead.qname() << " vs " << lastQname << std::endl;
#endif
        return false;
    }

#ifdef DEBUG_IS_SHADOW
    // NOTE(review): the stray ';' before the following '<<' looks like a
    // syntax error (only compiled when DEBUG_IS_SHADOW is defined) — confirm.
    log_os << logtag << " Found shadow!\n";
           << logtag << " this mapq = " << ((unsigned int)bamRead.map_qual()) << std::endl;
/// get SV candidates from anomalous read pairs static void getSVCandidatesFromPair( const ReadScannerOptions& opt, const ReadScannerDerivOptions& dopt, const SVLocusScanner::CachedReadGroupStats& rstats, const bam_record& localRead, const SimpleAlignment& localAlign, const bam_record* remoteReadPtr, std::vector<SVObservation>& candidates) { if (! localRead.is_paired()) return; // don't count paired end evidence from SA-split reads twice: if (localRead.isNonStrictSupplement()) return; if (localRead.is_unmapped() || localRead.is_mate_unmapped()) return; // special case typically used for RNA-Seq analysis: if (opt.isIgnoreAnomProperPair && localRead.is_proper_pair()) return; // abstract remote alignment to SimpleAlignment object: const bool isRemote(nullptr != remoteReadPtr); const SimpleAlignment remoteAlign(isRemote ? getAlignment(*remoteReadPtr) : getFakeMateAlignment(localRead)); AlignmentPairAnalyzer pairInspector(opt, dopt, rstats); pairInspector.reset(localAlign, remoteAlign, isRemote, localRead.is_first()); if (! pairInspector.computeLargeEventRegionScale()) return; candidates.emplace_back(); pairInspector.getSVObservation(candidates.back()); #ifdef DEBUG_SCANNER log_os << __FUNCTION__ << " evaluating pair sv for inclusion: " << candidates.back() << "\n"; #endif }
/// Populate a bam_record with synthetic values for unit testing.
///
/// Defaults: when cigarString is empty a full-match CIGAR of length
/// fragmentSize is used; when querySeq is empty a poly-A sequence of length
/// fragmentSize is used. All base qualities are set to Q30, the read is
/// flagged paired with a forward-strand mate, and NH/NM/RG aux tags are
/// appended.
void
buildTestBamRecord(
    bam_record& bamRead,
    int targetID,
    int pos,
    int mateTargetID,
    int matePos,
    int fragmentSize,
    int mapQ,
    std::string cigarString,
    std::string querySeq)
{
    bam1_t& bamData(*(bamRead.get_data()));

    // set qname
    {
        edit_bam_qname("buildTestBamRecord", bamData);
    }

    // set CIGAR
    {
        if (cigarString.empty())
        {
            // default to a single full-length match segment:
            cigarString = std::to_string(fragmentSize) + "M";
        }

        ALIGNPATH::path_t inputPath;
        cigar_to_apath(cigarString.c_str(), inputPath);
        edit_bam_cigar(inputPath, bamData);
    }

    // set read and qual
    {
        if ( querySeq.empty() )
        {
            querySeq = std::string(fragmentSize,'A');
        }
        const unsigned querySize(querySeq.length());

        // initialize test qual array to all Q30's:
        std::unique_ptr<uint8_t[]> qual(new uint8_t[querySize]);
        for (unsigned i(0); i<querySize; ++i)
        {
            qual[i] = 30;
        }
        edit_bam_read_and_quality(querySeq.c_str(), qual.get(), bamData);
    }

    // Set some defaults for the read
    bamRead.toggle_is_paired();
    bamRead.toggle_is_mate_fwd_strand();
    bamData.core.pos = pos;
    bamData.core.isize = fragmentSize;
    bamData.core.qual = mapQ;
    bamRead.set_target_id(targetID);

    // Set mate info
    bamData.core.mtid = mateTargetID;
    bamData.core.mpos = matePos;

    static const char nhTag[] = {'N','H'};
    static const char nmTag[] = {'N','M'};
    static const char rgTag[] = {'R','G'};
    bam_aux_append_unsigned(bamData, nhTag, 1);
    bam_aux_append_unsigned(bamData, nmTag, 1);
    // NOTE(review): RG is appended as an unsigned value here even though RG
    // is conventionally a string tag — presumably adequate for these tests;
    // confirm if any code reads RG as text.
    bam_aux_append_unsigned(bamData, rgTag, 1);
}
/// get SV candidates from shadow/singleton pairs /// look for singletons, create candidateSV around conf. interval of shadow position /// cache singletons? might be needed to remove poor quality shadows. /// should be able to re-use code, follow soft-clipping example. static void getSVCandidatesFromShadow( const ReadScannerOptions& opt, const SVLocusScanner::CachedReadGroupStats& rstats, const bam_record& localRead, const SimpleAlignment& localAlign, const bam_record* remoteReadPtr, TrackedCandidates& candidates) { using namespace SVEvidenceType; static const index_t svSource(SHADOW); static const bool isComplex(true); pos_t singletonGenomePos(0); int targetId(0); if (NULL == remoteReadPtr) { if (!localRead.is_unmapped()) return; // need to take care of this case // need to rely on cached mapq and qname return; if (!isGoodShadow(localRead,lastMapq,lastQname,opt.minSingletonMapqGraph)) { return; } singletonGenomePos = localAlign.pos; targetId = localRead.target_id(); } else { // have both reads, straightforward from here const bam_record& remoteRead(*remoteReadPtr); const SimpleAlignment remoteAlign(remoteRead); if (localRead.is_mate_unmapped()) { // remote read is shadow candidate if (!isGoodShadow(remoteRead,localRead.map_qual(),localRead.qname(),opt.minSingletonMapqGraph)) { return; } singletonGenomePos = localAlign.pos; targetId = remoteRead.target_id(); } else if (localRead.is_unmapped()) { // local is shadow candidate if (!isGoodShadow(localRead,remoteRead.map_qual(),remoteRead.qname(),opt.minSingletonMapqGraph)) { return; } singletonGenomePos = remoteAlign.pos; targetId = localRead.target_id(); } else { // none unmapped, skip this one return; } } const pos_t properPairRangeOffset = static_cast<pos_t>(rstats.properPair.min + (rstats.properPair.max-rstats.properPair.min)/2); const pos_t shadowGenomePos = singletonGenomePos + properPairRangeOffset; candidates.push_back(GetSplitSVCandidate(opt,targetId,shadowGenomePos,shadowGenomePos, svSource, isComplex)); }
/// Parse the read's "SA" aux tag into SimpleAlignment objects.
///
/// Reads with no SA tag, more than one SA entry, or entries below the
/// minimum MAPQ yield no (or fewer) output alignments. Each SA entry is
/// expected to contain 6 comma-separated fields: rname, pos, strand, CIGAR,
/// mapq, NM.
static
void
parseSACandidatesFromRead(
    const ReadScannerOptions& opt,
    const bam_record& bamRead,
    const chromMap_t& chromToIndex,
    std::vector<SimpleAlignment>& splitAlign)
{
    using namespace ALIGNPATH;

    splitAlign.clear();

    std::vector<std::string> saVec;
    {
        static const char satag[] = {'S','A'};
        const char* saStr(bamRead.get_string_tag(satag));
        if (nullptr == saStr) return;

        split_string(saStr, ';', saVec);

        // the SA string conventionally ends with ';', producing one empty
        // trailing token which is dropped here:
        if ( (! saVec.empty()) && saVec.back().empty())
        {
            saVec.pop_back();
        }
    }

    // Only handle a single split alignment right now.
    // In the future we could sort the SA tags by order on the template, possibly
    // also removing segments that map to two different areas,
    if (saVec.size() > 1) return;

    for (const std::string& sa : saVec)
    {
#ifdef DEBUG_SCANNER
        log_os << __FUNCTION__ << ": SA STRING: " << sa << "\n";
#endif
        std::vector<std::string> saDat;
        split_string(sa, ',', saDat);

        assert((saDat.size() == 6) && "Unexpected number of SA tag values");

        /// filter split reads with low MAPQ:
        const unsigned saMapq(illumina::blt_util::parse_unsigned_str(saDat[4]));
        if (saMapq < opt.minMapq) continue;

        const chromMap_t::const_iterator ci(chromToIndex.find(saDat[0]));
        assert(ci != chromToIndex.end());

        splitAlign.emplace_back();
        SimpleAlignment& sal(splitAlign.back());
        sal.tid=(ci->second); // convert chr to int32_t via new bam header map

        // SA positions are 1-indexed; convert to 0-indexed:
        sal.pos = (illumina::blt_util::parse_int_str(saDat[1])-1);
        {
            const char saStrand(saDat[2][0]); // convert to char
            assert((saStrand=='-') || (saStrand=='+'));
            sal.is_fwd_strand = (saStrand == '+');
        }
        cigar_to_apath(saDat[3].c_str(), sal.path);
    }
}
/// Store the BAM record into the fragment-evidence slot matching its read
/// number and supplement status.
///
/// Read 2 records go to read2 (or read2Supplemental for non-strict
/// supplements); everything else goes to read1/read1Supplemental. Filling a
/// slot that is already set throws unless isExpectRepeat is true, in which
/// case the duplicate is silently dropped.
void
SVCandidateSetSequenceFragmentSampleGroup::
add(
    const bam_header_info& bamHeader,
    const bam_record& bamRead,
    const bool isExpectRepeat,
    const bool isSourcedFromGraphEdgeNode1,
    const bool isSubMapped)
{
    using namespace illumina::common;

#ifdef DEBUG_SVDATA
    log_os << "SVDataGroup adding: " << bamRead << "\n";
#endif

    SVCandidateSetSequenceFragment* fragPtr(getSequenceFragment(bamRead.qname()));
    if (nullptr == fragPtr) return;

    SVCandidateSetSequenceFragment& fragment(*fragPtr);

    // select the target read slot for this record:
    SVCandidateSetRead* targetReadPtr(nullptr);
    if (2 == bamRead.read_no())
    {
        if (bamRead.isNonStrictSupplement())
        {
            fragment.read2Supplemental.emplace_back();
            targetReadPtr = (&(fragment.read2Supplemental.back()));
        }
        else
        {
            targetReadPtr = (&(fragment.read2));
        }
    }
    else
    {
        if (bamRead.isNonStrictSupplement())
        {
            fragment.read1Supplemental.emplace_back();
            targetReadPtr = (&(fragment.read1Supplemental.back()));
        }
        else
        {
            targetReadPtr = (&(fragment.read1));
        }
    }

    SVCandidateSetRead& targetRead(*targetReadPtr);
    if (targetRead.isSet())
    {
        // a repeated qname is either expected (drop it) or an input error:
        if (isExpectRepeat) return;

        std::ostringstream oss;
        oss << "Unexpected alignment name collision. Source: '" << dataSourceName << "'\n"
            << "\tExisting read: ";
        summarizeAlignmentRecord(bamHeader, targetRead.bamrec, oss);
        oss << "\n"
            << "\tNew read: ";
        summarizeAlignmentRecord(bamHeader, bamRead, oss);
        oss << "\n";
        BOOST_THROW_EXCEPTION(GeneralException(oss.str()));
    }

    targetRead.bamrec = bamRead;
    targetRead.isSourcedFromGraphEdgeNode1 = isSourcedFromGraphEdgeNode1;
    targetRead.isSubMapped = isSubMapped;
    targetRead.readIndex = (isSubMapped ? _subMappedReadIndex : _mappedReadIndex);
}
bool isMateInsertionEvidenceCandidate( const bam_record& bamRead, const unsigned minMapq) { if (! bamRead.is_paired()) return false; if (bamRead.isNonStrictSupplement()) return false; if (bamRead.is_unmapped() || bamRead.is_mate_unmapped()) return false; if (bamRead.map_qual() < minMapq) return false; if (bamRead.target_id() < 0) return false; if (bamRead.mate_target_id() < 0) return false; if (bamRead.target_id() != bamRead.mate_target_id()) return true; /// TODO: better candidate definition based on fragment size distro: static const int minSize(10000); return (std::abs(bamRead.pos()-bamRead.mate_pos()) >= minSize); }
/// Add a new alignment for a (possibly already-buffered) read.
///
/// Looks the read up by key (unless read names are ignored, in which case a
/// new entry is always created), stores the genomic or contig alignment, and
/// registers the read's segments in the position/contig groups.
///
/// \return (true, read id) on success; (false, 0) when the new alignment is
///         incompatible with alignments already stored for the read.
std::pair<bool,align_id_t>
starling_read_buffer::
add_read_alignment(const starling_options& opt,
                   const bam_record& br,
                   const alignment& al,
                   const MAPLEVEL::index_t maplev,
                   const READ_ALIGN::index_t rat,
                   const align_id_t contig_id)
{
    assert(! br.is_unmapped());

    const bool is_genomic(READ_ALIGN::GENOME == rat);
    align_id_t this_read_id;
    bool is_key_found(false);

    if(opt.is_ignore_read_names)
    {
        // every record gets a fresh buffered read, no key dedup:
        this_read_id=next_id();
        _read_data[this_read_id] = new starling_read(br,is_genomic);
    }
    else
    {
        // de-duplicate by read key; reuse an existing buffered read if found:
        const read_key tmp_key(br);
        const read_key_lup_t::const_iterator i(_read_key.find(tmp_key));
        is_key_found=(i!=_read_key.end());

        if(! is_key_found)
        {
            this_read_id=next_id();
            _read_data[this_read_id] = new starling_read(br,is_genomic);
        }
        else
        {
            this_read_id=i->second;
        }

        starling_read& sread(*(_read_data[this_read_id]));

        if(! is_key_found)
        {
            _read_key[sread.key()]=this_read_id;
        }
        else
        {
            assert(sread.key() == tmp_key);
        }
    }

    starling_read& sread(*(_read_data[this_read_id]));

    if(! is_key_found)
    {
        sread.id() = this_read_id;
    }
    else
    {
        {
            // no GROUPER input accepted for reads crossing splice junctions:
            bool is_spliced_contig_read(false);
            if(is_genomic)
            {
                if((! sread.contig_align().empty()) &&
                   (apath_exon_count(al.path)>1)) is_spliced_contig_read=true;
            }
            else
            {
                if(sread.is_segmented()) is_spliced_contig_read=true;
            }
            if(is_spliced_contig_read)
            {
                log_os << "ERROR: assembled contig realignments are not allowed for splice junction reads. Read: " << sread.key() << "\n";
                exit(EXIT_FAILURE);
            }
        }

        // reject the new alignment if it conflicts with what's buffered:
        if(! sread.is_compatible_alignment(al,rat,contig_id,opt))
        {
            log_os << "WARNING: skipping new alignment: " << al
                   << " which is incompatible with alignments in read: " << sread;
            return std::make_pair(false,0);
        }

        // contig BAM records are incomplete, so we want to fill in
        // the full record if there's a mapped genomic alignment
        // available:
        if(is_genomic) sread.set_genomic_bam_record(br);
    }

    if(is_genomic)
    {
        sread.set_genome_align(al);
        sread.genome_align_maplev = maplev;

        // deal with segmented reads now:
        if(sread.is_segmented())
        {
            const uint8_t n_seg(sread.segment_count());
            for(unsigned i(0); i<n_seg; ++i)
            {
                // segment numbers are 1-indexed:
                const uint8_t seg_no(i+1);
                const pos_t seg_buffer_pos(get_alignment_buffer_pos(sread.get_segment(seg_no).genome_align()));
                sread.get_segment(seg_no).buffer_pos = seg_buffer_pos;
                (_pos_group[seg_buffer_pos]).insert(std::make_pair(this_read_id,seg_no));
            }
        }
    }
    else
    {
        // contig alignments:
        sread.contig_align()[contig_id] = al;
        (_contig_group[contig_id]).insert(this_read_id);
    }

    // register unsegmented new reads under their full-segment buffer position:
    if((! is_key_found) && (! sread.is_segmented()))
    {
        const pos_t buffer_pos(get_alignment_buffer_pos(al));
        const seg_id_t seg_id(0);
        sread.get_full_segment().buffer_pos = buffer_pos;
        (_pos_group[buffer_pos]).insert(std::make_pair(this_read_id,seg_id));
    }

    return std::make_pair(true,this_read_id);
}
/// scan read record (and optionally its mate record) for SV evidence.
//
/// note that estimation is improved by the mate record (because we have the mate cigar string in this case)
///
/// Candidates are gathered per-read (indels, splits, semi-aligned ends) for
/// the local read and, when available, the remote mate, then from the pair
/// relationship itself. The local evidence range and a final tid/position
/// sanity check are applied before returning.
static
void
getReadBreakendsImpl(
    const ReadScannerOptions& opt,
    const ReadScannerDerivOptions& dopt,
    const SVLocusScanner::CachedReadGroupStats& rstats,
    const bam_record& localRead,
    const bam_record* remoteReadPtr,
    const bam_header_info& bamHeader,
    const reference_contig_segment& localRefSeq,
    const reference_contig_segment* remoteRefSeqPtr,
    std::vector<SVObservation>& candidates,
    known_pos_range2& localEvidenceRange)
{
    using namespace illumina::common;

#ifdef DEBUG_SCANNER
    log_os << __FUNCTION__ << ": Starting read: " << localRead.qname() << "\n";
#endif

    const chromMap_t& chromToIndex(bamHeader.chrom_to_index);

    candidates.clear();

    /// get some basic derived information from the bam_record:
    const SimpleAlignment localAlign(getAlignment(localRead));

    try
    {
        getSingleReadSVCandidates(opt, dopt, localRead, localAlign, chromToIndex,
                                  localRefSeq, candidates);

        // run the same check on the read's mate if we have access to it
        if (nullptr != remoteReadPtr)
        {
            const bam_record& remoteRead(*remoteReadPtr);
            const SimpleAlignment remoteAlign(getAlignment(remoteRead));

            if (nullptr == remoteRefSeqPtr)
            {
                static const char msg[] = "ERROR: remoteRefSeqPtr cannot be null";
                BOOST_THROW_EXCEPTION(LogicException(msg));
            }
            getSingleReadSVCandidates(opt, dopt, remoteRead, remoteAlign,
                                      chromToIndex, (*remoteRefSeqPtr),
                                      candidates);
        }

        // process shadows:
        //getSVCandidatesFromShadow(opt, rstats, localRead, localAlign,remoteReadPtr,candidates);

        // - process anomalous read pairs:
        getSVCandidatesFromPair(opt, dopt, rstats, localRead, localAlign,
                                remoteReadPtr, candidates);
    }
    catch (...)
    {
        // annotate the failing record(s) on stderr, then re-throw:
        std::cerr << "ERROR: Exception caught while processing ";
        if (nullptr == remoteReadPtr)
        {
            std::cerr << "single read record:\n"
                      << '\t' << localRead << "\n";
        }
        else
        {
            std::cerr << " read pair records:\n"
                      << '\t' << localRead << "\n"
                      << '\t' << (*remoteReadPtr) << "\n";
        }
        throw;
    }

#ifdef DEBUG_SCANNER
    log_os << __FUNCTION__ << ": post-pair candidate_size: " << candidates.size() << "\n";
#endif

    // update localEvidence range:
    // note this is only used if candidates were added, so there's no harm in setting it every time:
    const unsigned localRefLength(apath_ref_length(localAlign.path));
    const pos_t startRefPos(localRead.pos()-1);
    const pos_t endRefPos(startRefPos+localRefLength);

    localEvidenceRange.set_range(startRefPos,endRefPos);

    const int maxTid(chromToIndex.size());

    /// final chance to QC candidate set:
    ///
    for (const SVCandidate& sv : candidates)
    {
        // reject candidates referencing a chromosome id outside the header:
        bool isInvalidTid(false);
        if ((sv.bp1.interval.tid < 0) || (sv.bp1.interval.tid >= maxTid))
        {
            isInvalidTid=true;
        }
        else if (sv.bp2.state != SVBreakendState::UNKNOWN)
        {
            if ((sv.bp2.interval.tid < 0) || (sv.bp2.interval.tid >= maxTid))
            {
                isInvalidTid=true;
            }
        }

        bool isInvalidPos(false);
        if (! isInvalidTid)
        {
            // note in the 'off-chromosome edge' test below we check for cases which are obviously way off
            // the edge, but allow for a bit of over-edge mistakes to occur for the circular chromosomes
            //
            static const int offEdgePad(500);
            const pos_t tid1Length(bamHeader.chrom_data[sv.bp1.interval.tid].length);
            if ((sv.bp1.interval.range.end_pos() <= -offEdgePad) ||
                (sv.bp1.interval.range.begin_pos() >= (tid1Length+offEdgePad)))
            {
                isInvalidPos=true;
            }
            else if (sv.bp2.state != SVBreakendState::UNKNOWN)
            {
                const pos_t tid2Length(bamHeader.chrom_data[sv.bp2.interval.tid].length);
                if ((sv.bp2.interval.range.end_pos() <= -offEdgePad) ||
                    (sv.bp2.interval.range.begin_pos() >= (tid2Length+offEdgePad)))
                {
                    isInvalidPos=true;
                }
            }
        }

        if (isInvalidTid || isInvalidPos)
        {
            std::ostringstream oss;
            if (isInvalidTid)
            {
                oss << "SVbreakend has unknown or invalid chromosome id in candidate sv.\n";
            }
            else
            {
                oss << "Cannot interpret BAM record: candidate SV breakend from BAM record is off chromosome edge.\n";
            }

            oss << "\tlocal_bam_record: " << localRead << "\n"
                << "\tremote_bam record: ";
            if (NULL==remoteReadPtr)
            {
                oss << "NONE";
            }
            else
            {
                oss << (*remoteReadPtr);
            }
            oss << "\n"
                << "\tSVCandidate: " << sv << "\n";
            BOOST_THROW_EXCEPTION(LogicException(oss.str()));
        }
    }
}
/// Create an SVLocus for each potential SV event supported by the BAM record
///
/// the loci count should almost always be one (or, depending on input filtration, zero).
/// multiple suggested loci from one read is more of a theoretical possibility than an
/// expectation.
///
/// Each candidate from getReadBreakendsImpl is translated into a one- or
/// two-node locus graph, with edge weights set by the evidence type and
/// (for pair-only evidence) the pair's fragment-size class.
static
void
getSVLociImpl(
    const ReadScannerOptions& opt,
    const ReadScannerDerivOptions& dopt,
    const SVLocusScanner::CachedReadGroupStats& rstats,
    const bam_record& bamRead,
    const bam_header_info& bamHeader,
    const reference_contig_segment& refSeq,
    std::vector<SVLocus>& loci,
    SampleEvidenceCounts& eCounts)
{
    using namespace illumina::common;

    loci.clear();
    std::vector<SVObservation> candidates;
    known_pos_range2 localEvidenceRange;

    getReadBreakendsImpl(opt, dopt, rstats, bamRead, nullptr, bamHeader,
                         refSeq, nullptr, candidates, localEvidenceRange);

#ifdef DEBUG_SCANNER
    log_os << __FUNCTION__ << ": candidate_size: " << candidates.size() << "\n";
#endif

    // translate SVCandidate to a simpler form for use
    // in the SV locus graph:
    for (const SVCandidate& cand : candidates)
    {
        const bool isCandComplex(isComplexSV(cand));

        const SVBreakend& localBreakend(cand.bp1);
        const SVBreakend& remoteBreakend(cand.bp2);

        // a local breakend (and, for non-complex candidates, a remote
        // breakend) with an empty interval is malformed:
        if ((0==localBreakend.interval.range.size()) ||
            ((! isCandComplex) && (0==remoteBreakend.interval.range.size())))
        {
            std::ostringstream oss;
            oss << "Unexpected breakend pattern proposed from bam record.\n"
                << "\tlocal_breakend: " << localBreakend << "\n"
                << "\tremote_breakend: " << remoteBreakend << "\n"
                << "\tbam_record: " << bamRead << "\n";
            BOOST_THROW_EXCEPTION(LogicException(oss.str()));
        }

        // update evidence stats:
        for (int i(0); i< SVEvidenceType::SIZE; ++i)
        {
            eCounts.eType[i] += localBreakend.lowresEvidence.getVal(i);
        }

        // determine the evidence weight of this candidate:
        unsigned localEvidenceWeight(0);
        unsigned remoteEvidenceWeight(0);

        if (localBreakend.getAnyNonPairCount() != 0)
        {
            // non-pair evidence (indel/split/semi-aligned) gets the
            // internal-read-event weight:
            localEvidenceWeight = SVObservationWeights::internalReadEvent;
            if (remoteBreakend.getAnyNonPairCount() != 0)
            {
                remoteEvidenceWeight = SVObservationWeights::internalReadEvent;
            }
        }
        else if (localBreakend.getLocalPairCount() != 0)
        {
            // pair-only evidence is down-weighted when the pair's fragment
            // is below the distant-fragment threshold:
            bool isClose(false);
            if (is_innie_pair(bamRead))
            {
                isClose = (std::abs(bamRead.template_size()) < rstats.minDistantFragmentSize);
            }

            unsigned thisWeight(SVObservationWeights::readPair);
            if (isClose)
            {
                thisWeight = SVObservationWeights::closeReadPair;
                eCounts.closeCount += 1;
            }

            localEvidenceWeight = thisWeight;
            if (remoteBreakend.getLocalPairCount() != 0)
            {
                remoteEvidenceWeight = thisWeight;
            }
        }

        // finally, create the graph locus:
        SVLocus locus;
        // set local breakend estimate:
        const NodeIndexType localBreakendNode(locus.addNode(localBreakend.interval));
        locus.setNodeEvidence(localBreakendNode,localEvidenceRange);

        if (isCandComplex)
        {
            // complex candidates are modeled as a self-edge:
            locus.linkNodes(localBreakendNode,localBreakendNode,localEvidenceWeight);
        }
        else
        {
            // set remote breakend estimate:
            const NodeIndexType remoteBreakendNode(locus.addNode(remoteBreakend.interval));
            locus.linkNodes(localBreakendNode,remoteBreakendNode,localEvidenceWeight,remoteEvidenceWeight);
            locus.mergeSelfOverlap();
        }

#ifdef DEBUG_SCANNER
        log_os << __FUNCTION__ << ": adding Locus: " << locus << "\n";
#endif
        loci.push_back(locus);
    }
}