bool isMateInsertionEvidenceCandidate( const bam_record& bamRead, const unsigned minMapq) { if (! bamRead.is_paired()) return false; if (bamRead.isNonStrictSupplement()) return false; if (bamRead.is_unmapped() || bamRead.is_mate_unmapped()) return false; if (bamRead.map_qual() < minMapq) return false; if (bamRead.target_id() < 0) return false; if (bamRead.mate_target_id() < 0) return false; if (bamRead.target_id() != bamRead.mate_target_id()) return true; /// TODO: better candidate definition based on fragment size distro: static const int minSize(10000); return (std::abs(bamRead.pos()-bamRead.mate_pos()) >= minSize); }
static bool isGoodShadow( const bam_record& bamRead, const std::string& lastQname) { #ifdef DEBUG_IS_SHADOW static const std::string logtag("isGoodShadow"); #endif if (! bamRead.is_paired()) return false; if (bamRead.isNonStrictSupplement()) return false; // sanity check that this is a shadow read: if (!bamRead.is_unmapped()) return false; if (bamRead.is_mate_unmapped()) return false; static const unsigned minAvgQualShadow = 25; if (get_avg_quality(bamRead) < minAvgQualShadow) { return false; } if (strcmp(bamRead.qname(),lastQname.c_str()) != 0) { // something went wrong here, shadows should have their singleton partner // preceding them in the BAM file. #ifdef DEBUG_IS_SHADOW log_os << logtag << " ERROR: Shadow without matching singleton : " << bamRead.qname() << " vs " << lastQname << std::endl; #endif return false; } #ifdef DEBUG_IS_SHADOW log_os << logtag << " Found shadow!\n"; << logtag << " this mapq = " << ((unsigned int)bamRead.map_qual()) << std::endl;
/// get SV candidates from anomalous read pairs static void getSVCandidatesFromPair( const ReadScannerOptions& opt, const ReadScannerDerivOptions& dopt, const SVLocusScanner::CachedReadGroupStats& rstats, const bam_record& localRead, const SimpleAlignment& localAlign, const bam_record* remoteReadPtr, std::vector<SVObservation>& candidates) { if (! localRead.is_paired()) return; // don't count paired end evidence from SA-split reads twice: if (localRead.isNonStrictSupplement()) return; if (localRead.is_unmapped() || localRead.is_mate_unmapped()) return; // special case typically used for RNA-Seq analysis: if (opt.isIgnoreAnomProperPair && localRead.is_proper_pair()) return; // abstract remote alignment to SimpleAlignment object: const bool isRemote(nullptr != remoteReadPtr); const SimpleAlignment remoteAlign(isRemote ? getAlignment(*remoteReadPtr) : getFakeMateAlignment(localRead)); AlignmentPairAnalyzer pairInspector(opt, dopt, rstats); pairInspector.reset(localAlign, remoteAlign, isRemote, localRead.is_first()); if (! pairInspector.computeLargeEventRegionScale()) return; candidates.emplace_back(); pairInspector.getSVObservation(candidates.back()); #ifdef DEBUG_SCANNER log_os << __FUNCTION__ << " evaluating pair sv for inclusion: " << candidates.back() << "\n"; #endif }
/// get SV candidates from shadow/singleton pairs /// look for singletons, create candidateSV around conf. interval of shadow position /// cache singletons? might be needed to remove poor quality shadows. /// should be able to re-use code, follow soft-clipping example. static void getSVCandidatesFromShadow( const ReadScannerOptions& opt, const SVLocusScanner::CachedReadGroupStats& rstats, const bam_record& localRead, const SimpleAlignment& localAlign, const bam_record* remoteReadPtr, TrackedCandidates& candidates) { using namespace SVEvidenceType; static const index_t svSource(SHADOW); static const bool isComplex(true); pos_t singletonGenomePos(0); int targetId(0); if (NULL == remoteReadPtr) { if (!localRead.is_unmapped()) return; // need to take care of this case // need to rely on cached mapq and qname return; if (!isGoodShadow(localRead,lastMapq,lastQname,opt.minSingletonMapqGraph)) { return; } singletonGenomePos = localAlign.pos; targetId = localRead.target_id(); } else { // have both reads, straightforward from here const bam_record& remoteRead(*remoteReadPtr); const SimpleAlignment remoteAlign(remoteRead); if (localRead.is_mate_unmapped()) { // remote read is shadow candidate if (!isGoodShadow(remoteRead,localRead.map_qual(),localRead.qname(),opt.minSingletonMapqGraph)) { return; } singletonGenomePos = localAlign.pos; targetId = remoteRead.target_id(); } else if (localRead.is_unmapped()) { // local is shadow candidate if (!isGoodShadow(localRead,remoteRead.map_qual(),remoteRead.qname(),opt.minSingletonMapqGraph)) { return; } singletonGenomePos = remoteAlign.pos; targetId = localRead.target_id(); } else { // none unmapped, skip this one return; } } const pos_t properPairRangeOffset = static_cast<pos_t>(rstats.properPair.min + (rstats.properPair.max-rstats.properPair.min)/2); const pos_t shadowGenomePos = singletonGenomePos + properPairRangeOffset; candidates.push_back(GetSplitSVCandidate(opt,targetId,shadowGenomePos,shadowGenomePos, svSource, isComplex)); }
std::pair<bool,align_id_t> starling_read_buffer:: add_read_alignment(const starling_options& opt, const bam_record& br, const alignment& al, const MAPLEVEL::index_t maplev, const READ_ALIGN::index_t rat, const align_id_t contig_id) { assert(! br.is_unmapped()); const bool is_genomic(READ_ALIGN::GENOME == rat); align_id_t this_read_id; bool is_key_found(false); if(opt.is_ignore_read_names) { this_read_id=next_id(); _read_data[this_read_id] = new starling_read(br,is_genomic); } else { const read_key tmp_key(br); const read_key_lup_t::const_iterator i(_read_key.find(tmp_key)); is_key_found=(i!=_read_key.end()); if(! is_key_found) { this_read_id=next_id(); _read_data[this_read_id] = new starling_read(br,is_genomic); } else { this_read_id=i->second; } starling_read& sread(*(_read_data[this_read_id])); if(! is_key_found) { _read_key[sread.key()]=this_read_id; } else { assert(sread.key() == tmp_key); } } starling_read& sread(*(_read_data[this_read_id])); if(! is_key_found) { sread.id() = this_read_id; } else { { // no GROUPER input accepted for reads crossing splice junctions: bool is_spliced_contig_read(false); if(is_genomic) { if((! sread.contig_align().empty()) && (apath_exon_count(al.path)>1)) is_spliced_contig_read=true; } else { if(sread.is_segmented()) is_spliced_contig_read=true; } if(is_spliced_contig_read) { log_os << "ERROR: assembled contig realignments are not allowed for splice junction reads. Read: " << sread.key() << "\n"; exit(EXIT_FAILURE); } } if(! sread.is_compatible_alignment(al,rat,contig_id,opt)) { log_os << "WARNING: skipping new alignment: " << al << " which is incompatible with alignments in read: " << sread; return std::make_pair(false,0); } // contig BAM records are incomplete, so we want to fill in // the full record if there's a mapped genomic alignment // available: if(is_genomic) sread.set_genomic_bam_record(br); } if(is_genomic) { sread.set_genome_align(al); sread.genome_align_maplev = maplev; // deal with segmented reads now: if(sread.is_segmented()) { const uint8_t n_seg(sread.segment_count()); for(unsigned i(0); i<n_seg; ++i) { const uint8_t seg_no(i+1); const pos_t seg_buffer_pos(get_alignment_buffer_pos(sread.get_segment(seg_no).genome_align())); sread.get_segment(seg_no).buffer_pos = seg_buffer_pos; (_pos_group[seg_buffer_pos]).insert(std::make_pair(this_read_id,seg_no)); } } } else { // contig alignments: sread.contig_align()[contig_id] = al; (_contig_group[contig_id]).insert(this_read_id); } if((! is_key_found) && (! sread.is_segmented())) { const pos_t buffer_pos(get_alignment_buffer_pos(al)); const seg_id_t seg_id(0); sread.get_full_segment().buffer_pos = buffer_pos; (_pos_group[buffer_pos]).insert(std::make_pair(this_read_id,seg_id)); } return std::make_pair(true,this_read_id); }