Example 1
static
void
addReadToDepthEst(
    const bam_record& bamRead,
    const pos_t beginPos,
    std::vector<unsigned>& depth)
{
    using namespace ALIGNPATH;

    const pos_t endPos(beginPos+depth.size());

    // get cigar:
    path_t apath;
    bam_cigar_to_apath(bamRead.raw_cigar(), bamRead.n_cigar(), apath);

    pos_t refPos(bamRead.pos()-1);
    BOOST_FOREACH(const path_segment& ps, apath)
    {
        if (refPos>=endPos) return;

        if (MATCH == ps.type)
        {
            for (pos_t pos(refPos); pos < (refPos+static_cast<pos_t>(ps.length)); ++pos)
            {
                if (pos>=beginPos)
                {
                    if (pos>=endPos) return;
                    depth[pos-beginPos]++;
                }
            }
        }
        if (is_segment_type_ref_length(ps.type)) refPos += ps.length;
    }
}
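Outside the project's types, the same windowed counting logic can be exercised in a self-contained form. The sketch below uses a hypothetical SimpleRead stand-in for bam_record and a plain-text CIGAR walk; it illustrates the technique above and is not the project's API.

#include <cctype>
#include <iostream>
#include <string>
#include <vector>

// hypothetical stand-in for bam_record: 0-based reference start plus CIGAR text
struct SimpleRead
{
    long refStart;
    std::string cigar;
};

// accumulate per-base depth over [beginPos, beginPos+depth.size()),
// counting only 'M' segments, mirroring addReadToDepthEst above
static
void
addDepthSketch(
    const SimpleRead& read,
    const long beginPos,
    std::vector<unsigned>& depth)
{
    const long endPos(beginPos + static_cast<long>(depth.size()));

    long refPos(read.refStart);
    long len(0);
    for (const char c : read.cigar)
    {
        if (std::isdigit(static_cast<unsigned char>(c)))
        {
            len = len*10 + (c - '0');
            continue;
        }
        if (refPos >= endPos) return;
        if ('M' == c)
        {
            for (long pos(refPos); pos < (refPos+len); ++pos)
            {
                if (pos >= beginPos)
                {
                    if (pos >= endPos) return;
                    depth[pos-beginPos]++;
                }
            }
        }
        // ops that consume reference bases:
        if ((c=='M') || (c=='D') || (c=='N') || (c=='=') || (c=='X')) refPos += len;
        len = 0;
    }
}

int main()
{
    std::vector<unsigned> depth(10, 0);
    const SimpleRead reads[] = { {2, "4M2I3M"}, {5, "6M"} };
    for (const SimpleRead& read : reads) addDepthSketch(read, 0, depth);
    for (const unsigned d : depth) std::cout << d << ' ';
    std::cout << '\n'; // expected: 0 0 1 1 1 2 2 2 2 1
    return 0;
}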
Example 2
void
SVScorePairRefProcessor::
processClearedRecord(
    const bam_record& bamRead)
{
    using namespace illumina::common;

    assert(bamParams.isSet);

    const pos_t refPos(bamRead.pos()-1);
    if (! bamParams.interval.range.is_pos_intersect(refPos)) return;

    const bool isLargeInsert(isLargeInsertSV(sv));

#ifdef DEBUG_MEGAPAIR
    log_os << __FUNCTION__ << ": read: " << bamRead << "\n";
#endif

    /// check if fragment is too big or too small:
    const int templateSize(std::abs(bamRead.template_size()));
    if (templateSize < bamParams.minFrag) return;
    if (templateSize > bamParams.maxFrag) return;

    // count only from the downstream reads
    const bool isFirstBamRead(isFirstRead(bamRead));

    // get fragment range:
    pos_t fragBeginRefPos(refPos);
    if (! isFirstBamRead)
    {
        fragBeginRefPos=bamRead.mate_pos()-1;
    }

    const pos_t fragEndRefPos(fragBeginRefPos+templateSize);

    if (fragBeginRefPos > fragEndRefPos)
    {
        std::ostringstream oss;
        oss << "ERROR: Failed to parse fragment range from bam record. Frag begin,end: " << fragBeginRefPos << " " << fragEndRefPos << " bamRecord: " << bamRead << "\n";
        BOOST_THROW_EXCEPTION(LogicException(oss.str()));
    }

    {
        const pos_t fragOverlap(std::min((1+svParams.centerPos-fragBeginRefPos), (fragEndRefPos-svParams.centerPos)));
#ifdef DEBUG_MEGAPAIR
        log_os << __FUNCTION__ << ": frag begin/end/overlap: " << fragBeginRefPos << " " << fragEndRefPos << " " << fragOverlap << "\n";
#endif
        if (fragOverlap < pairOpt.minFragSupport) return;
    }

    SVFragmentEvidence& fragment(evidence.getSampleEvidence(bamParams.bamIndex)[bamRead.qname()]);

    static const bool isShadow(false);

    SVFragmentEvidenceRead& evRead(fragment.getRead(bamRead.is_first()));
    setReadEvidence(svParams.minMapQ, svParams.minTier2MapQ, bamRead, isShadow, evRead);

    setAlleleFrag(*bamParams.fragDistroPtr, templateSize, fragment.ref.getBp(isBp1),isLargeInsert);
}
Example 3
bool
isMateInsertionEvidenceCandidate2(
    const bam_record& bamRead,
    const bool isSearchForLeftOpen,
    const bool isSearchForRightOpen)
{
    if ((! isSearchForLeftOpen) && (! bamRead.is_fwd_strand())) return false;
    if ((! isSearchForRightOpen) && bamRead.is_fwd_strand()) return false;
    return true;
}
Example 4
    void
    addRead(
        const bam_record& bamRead)
    {
        if (_isRegionInit)
        {
            if (bamRead.pos() > _endPos + 1000)
            {
                _maxPos=_endPos;
                setNewRegion();
            }
        }

        if (! _isRegionInit)
        {
            _minPos=bamRead.pos();
            _maxPos=bamRead.pos();
            _endPos=bamRead.pos() + bamRead.read_size();
            _isRegionInit=true;
        }
        else
        {
            if (bamRead.pos() > _maxPos)
            {
                _maxPos = bamRead.pos();
                _endPos=bamRead.pos() + bamRead.read_size();
            }
        }

        _count++;
        _totalReadLength += bamRead.read_size();
    }
Example 5
FragmentSizeType::index_t
SVLocusScanner::
_getFragmentSizeType(
    const bam_record& bamRead,
    const unsigned defaultReadGroupIndex) const
{
    using namespace FragmentSizeType;
    if (bamRead.target_id() != bamRead.mate_target_id()) return DISTANT;
    const int32_t fragmentSize(std::abs(bamRead.template_size()));
    return classifySize(_stats[defaultReadGroupIndex], fragmentSize);
}
Example 6
    void
    addRead(
        const bam_record& bamRead)
    {
        const pos_t pos(bamRead.pos()-1);
        const unsigned rsize(bamRead.read_size());
        if (! _isRegionInit)
        {
            _maxPos=pos;
            _isRegionInit=true;
        }

        for (; _maxPos<pos; ++_maxPos) flushPos(_maxPos);
        _depth.inc(pos,rsize);
        _count++;
    }
Example 7
static
void
getSingleReadSVCandidates(
    const ReadScannerOptions& opt,
    const ReadScannerDerivOptions& dopt,
    const bam_record& localRead,
    const SimpleAlignment& localAlign,
    const chromMap_t& chromToIndex,
    const reference_contig_segment& refSeq,
    std::vector<SVObservation>& candidates)
{
    using namespace illumina::common;

    const bool isRead2(localRead.is_paired() && (localRead.read_no() == 2));
    const FRAGSOURCE::index_t fragSource(isRead2 ? FRAGSOURCE::READ2 : FRAGSOURCE::READ1);

    // - process any large indels in the localRead:
    getSVCandidatesFromReadIndels(opt, dopt, localAlign, fragSource, candidates);
#ifdef DEBUG_SCANNER
    log_os << __FUNCTION__ << ": post-indels candidate_size: " << candidates.size() << "\n";
#endif

    // a read can provide SA split evidence or semi-aligned/soft-clip, but not both.
    // this prevents split reads from triggering spurious local assemblies. It is
    // possible for a read to genuinely contain evidence of both, but this should
    // be very rare.
    if (localRead.isSASplit())
    {
        getSACandidatesFromRead(opt, dopt, localRead, localAlign, fragSource, chromToIndex,
                                candidates);
#ifdef DEBUG_SCANNER
        log_os << __FUNCTION__ << ": post-split read candidate_size: " << candidates.size() << "\n";
#endif
    }
    else
    {
        if (dopt.isSmallCandidates)
        {
            getSVCandidatesFromSemiAligned(opt, dopt, localRead, localAlign, fragSource, refSeq,
                                           candidates);
        }
#ifdef DEBUG_SCANNER
        log_os << __FUNCTION__ << ": post-semialigned candidate_size: " << candidates.size() << "\n";
#endif
    }
}
Example 8
void
addSupplementaryAlignmentEvidence(
    bam_record& bamRead,
    const std::string& svStr)
{
    static const char svtag[] = {'S','A'};
    bam_aux_append(bamRead.get_data(),svtag,'Z',(svStr.size()+1),
                   (const uint8_t*)(svStr.c_str()));
}
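The SA tag value appended above follows the SAM-spec layout "rname,pos,strand,CIGAR,mapQ,NM;". The helper below is a hypothetical formatter (not part of the source) showing how such a value could be assembled before being passed as svStr:

#include <iostream>
#include <sstream>
#include <string>

// hypothetical helper: format one SA-tag entry as "rname,pos,strand,CIGAR,mapQ,NM;"
static
std::string
formatSAEntry(
    const std::string& rname,
    const int pos1Based,
    const char strand,
    const std::string& cigar,
    const int mapq,
    const int nm)
{
    std::ostringstream oss;
    oss << rname << ',' << pos1Based << ',' << strand << ','
        << cigar << ',' << mapq << ',' << nm << ';';
    return oss.str();
}

int main()
{
    // prints "chr5,1001,+,50M26S,60,1;"
    std::cout << formatSAEntry("chr5", 1001, '+', "50M26S", 60, 1) << '\n';
    return 0;
}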
Example 9
/// get SV candidates from SA-tag split-read alignment
static
SVObservation
GetSplitSACandidate(
    const ReadScannerDerivOptions& dopt,
    const bam_record& localRead,
    const SimpleAlignment& localAlign,
    const SimpleAlignment& remoteAlign,
    const FRAGSOURCE::index_t fragSource)
{
    using namespace SVEvidenceType;
    static const index_t svSource(SPLIT_ALIGN);

    SVObservation sv;
    sv.evtype = svSource;
    sv.fragSource = fragSource;

    SVBreakend& localBreakend(sv.bp1);
    SVBreakend& remoteBreakend(sv.bp2);

    // use single-sided evidence; the supplementary read has to be read to get
    // the reverse edge. This protects against double-counting:
    localBreakend.lowresEvidence.add(svSource);

    updateSABreakend(dopt, localAlign, localBreakend);
    updateSABreakend(dopt, remoteAlign, remoteBreakend);

    // If the local (bp1) alignment is split downstream (on the right side) then this read goes from bp1 -> bp2.
    // If it is a forward read (e.g. read1 on + strand), this means it's a forward read for this event.
    const bool isSplitDownstream(isSplitOpenDownstream(localAlign.path));
    const bool isReadFw = (localRead.is_first() == localRead.is_fwd_strand());
    if (dopt.isStranded)
    {
        if (isReadFw == isSplitDownstream)
        {
            sv.fwReads += 1;
        }
        else
        {
            sv.rvReads += 1;
        }
    }
    return sv;
}
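The stranded read-count bookkeeping above boils down to one equality test between read orientation and split direction. A standalone sketch (illustrative only, no project types) enumerating the four combinations:

#include <iostream>

int main()
{
    for (const bool isReadFw : {false, true})
    {
        for (const bool isSplitDownstream : {false, true})
        {
            // matches the logic above: equal values count toward fwReads,
            // unequal values count toward rvReads
            const char* bucket((isReadFw == isSplitDownstream) ? "fwReads" : "rvReads");
            std::cout << "isReadFw=" << isReadFw
                      << " isSplitDownstream=" << isSplitDownstream
                      << " -> " << bucket << "\n";
        }
    }
    return 0;
}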
Example 10
/// extract poorly aligned read ends (semi-aligned and/or soft-clipped)
/// to internal candidate format
static
void
getSVCandidatesFromSemiAligned(
    const ReadScannerOptions& opt,
    const ReadScannerDerivOptions& dopt,
    const bam_record& bamRead,
    const SimpleAlignment& bamAlign,
    const FRAGSOURCE::index_t fragSource,
    const reference_contig_segment& refSeq,
    std::vector<SVObservation>& candidates)
{
    unsigned leadingMismatchLen(0);
    unsigned trailingMismatchLen(0);
    pos_t leadingRefPos(0), trailingRefPos(0);
    getSVBreakendCandidateSemiAligned(bamRead, bamAlign, refSeq,
                                      dopt.isUseOverlappingPairs,
                                      leadingMismatchLen, leadingRefPos,
                                      trailingMismatchLen, trailingRefPos);

    if ((leadingMismatchLen + trailingMismatchLen) >= bamRead.read_size()) return;

    using namespace SVEvidenceType;
    static const index_t svSource(SEMIALIGN);

    // semi-aligned reads don't define a full hypothesis, so they're always evidence for a 'complex', i.e. undefined, event,
    // in a fashion analogous to clipped reads
    static const bool isComplex(true);

    if (leadingMismatchLen >= opt.minSemiAlignedMismatchLen)
    {
        const pos_t pos(leadingRefPos);
        candidates.push_back(GetSplitSVCandidate(dopt,bamRead.target_id(),pos,pos,svSource, fragSource,isComplex));
    }

    if (trailingMismatchLen >= opt.minSemiAlignedMismatchLen)
    {
        const pos_t pos(trailingRefPos);
        candidates.push_back(GetSplitSVCandidate(dopt,bamRead.target_id(),pos,pos,svSource, fragSource,isComplex));
    }
}
Example 11
bool
SVLocusScanner::
isProperPair(
    const bam_record& bamRead,
    const unsigned defaultReadGroupIndex) const
{
    if (! is_innie_pair(bamRead)) return false;

    const Range& ppr(_stats[defaultReadGroupIndex].properPair);
    const int32_t fragmentSize(std::abs(bamRead.template_size()));

    // we're seeing way too much large-fragment garbage in cancers to use
    // vanilla proper-pair criteria, so push the max fragment size out a bit for now:
    static const float maxAnomFactor(1.5);
    if (fragmentSize > static_cast<int32_t>(maxAnomFactor*ppr.max)) return false;
    if (fragmentSize < ppr.min) return false;

    return true;
}
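Taken together, the checks above accept innie fragments whose size lies in [ppr.min, 1.5*ppr.max]. A standalone sketch of just that size window, with a hypothetical Range stand-in and arbitrary example limits (100/500 gives an accepted range of [100, 750]):

#include <cstdint>
#include <iostream>

// hypothetical stand-in for the cached proper-pair range
struct Range
{
    int32_t min;
    int32_t max;
};

static
bool
isFragmentSizeProper(
    const Range& ppr,
    const int32_t fragmentSize)
{
    static const float maxAnomFactor(1.5);
    if (fragmentSize > static_cast<int32_t>(maxAnomFactor*ppr.max)) return false;
    if (fragmentSize < ppr.min) return false;
    return true;
}

int main()
{
    const Range ppr = {100, 500};
    std::cout << isFragmentSizeProper(ppr, 99) << ' '    // 0: below min
              << isFragmentSizeProper(ppr, 750) << ' '   // 1: at relaxed max
              << isFragmentSizeProper(ppr, 751) << '\n'; // 0: above relaxed max
    return 0;
}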
Example 12
bool
SVLocusScanner::
isSVEvidence(
    const bam_record& bamRead,
    const unsigned defaultReadGroupIndex,
    const reference_contig_segment& refSeq,
    SVLocusEvidenceCount* incountsPtr) const
{
    // exclude innie read pairs which are anomalously short:
    const bool isAnom(isNonCompressedAnomalous(bamRead,defaultReadGroupIndex));
    const bool isSplit(bamRead.isSASplit());
    getAlignment(bamRead,_bamAlign);
    const bool isIndel(isLocalIndelEvidence(_bamAlign));
    const bool isAssm((_dopt.isSmallCandidates) && ((!isSplit) && isSemiAlignedEvidence(bamRead, _bamAlign, refSeq)));

    const bool isEvidence(isAnom || isSplit || isIndel || isAssm);

    if (nullptr != incountsPtr)
    {
        SVLocusEvidenceCount& incounts(*incountsPtr);
        incounts.total++;
        if (isAnom) incounts.anom++;
        if (isSplit) incounts.split++;
        if (isIndel) incounts.indel++;
        if (isAssm) incounts.assm++;

        if (! isEvidence) incounts.ignored++;

        if (isAnom)
        {
            if (isMateInsertionEvidenceCandidate(bamRead, getMinMapQ()))
            {
                // these counts are used to generate background noise rates in later candidate generation stages:
                incounts.remoteRecoveryCandidates++;
            }
        }
    }

    return isEvidence;
}
Example 13
static
bool
isGoodShadow(
    const bam_record& bamRead,
    const std::string& lastQname)
{
#ifdef DEBUG_IS_SHADOW
    static const std::string logtag("isGoodShadow");
#endif

    if (! bamRead.is_paired()) return false;

    if (bamRead.isNonStrictSupplement()) return false;

    // sanity check that this is a shadow read:
    if (!bamRead.is_unmapped()) return false;
    if (bamRead.is_mate_unmapped()) return false;

    static const unsigned minAvgQualShadow = 25;
    if (get_avg_quality(bamRead) < minAvgQualShadow)
    {
        return false;
    }

    if (strcmp(bamRead.qname(),lastQname.c_str()) != 0)
    {
        // something went wrong here, shadows should have their singleton partner
        // preceding them in the BAM file.
#ifdef DEBUG_IS_SHADOW
        log_os << logtag << " ERROR: Shadow without matching singleton : " << bamRead.qname() << " vs " << lastQname << std::endl;
#endif
        return false;
    }

#ifdef DEBUG_IS_SHADOW
    log_os << logtag << " Found shadow!\n"
           << logtag << " this mapq  = " << ((unsigned int)bamRead.map_qual()) << std::endl;
#endif

    return true;
}
Example 14
/// get SV candidates from anomalous read pairs
static
void
getSVCandidatesFromPair(
    const ReadScannerOptions& opt,
    const ReadScannerDerivOptions& dopt,
    const SVLocusScanner::CachedReadGroupStats& rstats,
    const bam_record& localRead,
    const SimpleAlignment& localAlign,
    const bam_record* remoteReadPtr,
    std::vector<SVObservation>& candidates)
{
    if (! localRead.is_paired()) return;

    // don't count paired end evidence from SA-split reads twice:
    if (localRead.isNonStrictSupplement()) return;

    if (localRead.is_unmapped() || localRead.is_mate_unmapped()) return;

    // special case typically used for RNA-Seq analysis:
    if (opt.isIgnoreAnomProperPair && localRead.is_proper_pair()) return;

    // abstract remote alignment to SimpleAlignment object:
    const bool isRemote(nullptr != remoteReadPtr);
    const SimpleAlignment remoteAlign(isRemote ? getAlignment(*remoteReadPtr) : getFakeMateAlignment(localRead));

    AlignmentPairAnalyzer pairInspector(opt, dopt, rstats);
    pairInspector.reset(localAlign, remoteAlign, isRemote, localRead.is_first());

    if (! pairInspector.computeLargeEventRegionScale()) return;

    candidates.emplace_back();
    pairInspector.getSVObservation(candidates.back());

#ifdef DEBUG_SCANNER
    log_os << __FUNCTION__ << " evaluating pair sv for inclusion: " << candidates.back() << "\n";
#endif
}
Example 15
void
buildTestBamRecord(
    bam_record& bamRead,
    int targetID,
    int pos,
    int mateTargetID,
    int matePos,
    int fragmentSize,
    int mapQ,
    std::string cigarString,
    std::string querySeq)
{
    bam1_t& bamData(*(bamRead.get_data()));

    // set qname
    {
        edit_bam_qname("buildTestBamRecord", bamData);
    }

    // set CIGAR
    {
        if (cigarString.empty())
        {
            cigarString = std::to_string(fragmentSize) + "M";
        }

        ALIGNPATH::path_t inputPath;
        cigar_to_apath(cigarString.c_str(), inputPath);
        edit_bam_cigar(inputPath, bamData);
    }

    // set read and qual
    {
        if ( querySeq.empty() )
        {
            querySeq = std::string(fragmentSize,'A');
        }
        const unsigned querySize(querySeq.length());
        // initialize test qual array to all Q30's:
        std::unique_ptr<uint8_t[]> qual(new uint8_t[querySize]);
        for (unsigned i(0); i<querySize; ++i)
        {
            qual[i] = 30;
        }
        edit_bam_read_and_quality(querySeq.c_str(), qual.get(), bamData);
    }

    // Set some defaults for the read
    bamRead.toggle_is_paired();
    bamRead.toggle_is_mate_fwd_strand();
    bamData.core.pos = pos;
    bamData.core.isize = fragmentSize;
    bamData.core.qual = mapQ;
    bamRead.set_target_id(targetID);

    // Set mate info
    bamData.core.mtid = mateTargetID;
    bamData.core.mpos = matePos;

    static const char nhTag[] = {'N','H'};
    static const char nmTag[] = {'N','M'};
    static const char rgTag[] = {'R','G'};
    bam_aux_append_unsigned(bamData, nhTag, 1);
    bam_aux_append_unsigned(bamData, nmTag, 1);
    bam_aux_append_unsigned(bamData, rgTag, 1);
}
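A short usage sketch against the signature above; the argument values are arbitrary test inputs, not taken from the source:

// hypothetical call site in a unit test: a 100-base forward read on tid 0 at
// position 200, mate at position 500, 400bp template, MAPQ 15
bam_record bamRead;
buildTestBamRecord(bamRead, 0, 200, 0, 500, 400, 15, "100M", std::string(100, 'A'));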
Example 16
/// get SV candidates from shadow/singleton pairs:
/// look for singletons and create a candidate SV around the confidence interval of the shadow position.
/// cache singletons? might be needed to remove poor-quality shadows.
/// should be able to re-use code; follow the soft-clipping example.
static
void
getSVCandidatesFromShadow(
    const ReadScannerOptions& opt,
    const SVLocusScanner::CachedReadGroupStats& rstats,
    const bam_record& localRead,
    const SimpleAlignment& localAlign,
    const bam_record* remoteReadPtr,
    TrackedCandidates& candidates)
{
    using namespace SVEvidenceType;
    static const index_t svSource(SHADOW);

    static const bool isComplex(true);
    pos_t singletonGenomePos(0);
    int targetId(0);
    if (NULL == remoteReadPtr)
    {
        if (!localRead.is_unmapped()) return;
        // this case is not handled yet: it would need the cached mapq and qname of the
        // preceding singleton, so bail out here (the shadow check below is currently unreachable)
        return;
        if (!isGoodShadow(localRead,lastMapq,lastQname,opt.minSingletonMapqGraph))
        {
            return;
        }
        singletonGenomePos = localAlign.pos;
        targetId           = localRead.target_id();
    }
    else
    {
        // have both reads, straightforward from here
        const bam_record& remoteRead(*remoteReadPtr);
        const SimpleAlignment remoteAlign(remoteRead);

        if (localRead.is_mate_unmapped())
        {
            // remote read is shadow candidate
            if (!isGoodShadow(remoteRead,localRead.map_qual(),localRead.qname(),opt.minSingletonMapqGraph))
            {
                return;
            }
            singletonGenomePos = localAlign.pos;
            targetId = remoteRead.target_id();
        }
        else if (localRead.is_unmapped())
        {
            // local is shadow candidate
            if (!isGoodShadow(localRead,remoteRead.map_qual(),remoteRead.qname(),opt.minSingletonMapqGraph))
            {
                return;
            }
            singletonGenomePos = remoteAlign.pos;
            targetId = localRead.target_id();
        }
        else
        {
            // none unmapped, skip this one
            return;
        }
    }
    const pos_t properPairRangeOffset = static_cast<pos_t>(rstats.properPair.min + (rstats.properPair.max-rstats.properPair.min)/2);
    const pos_t shadowGenomePos = singletonGenomePos + properPairRangeOffset;
    candidates.push_back(GetSplitSVCandidate(opt,targetId,shadowGenomePos,shadowGenomePos, svSource, isComplex));
}
Example 17
static
void
parseSACandidatesFromRead(
    const ReadScannerOptions& opt,
    const bam_record& bamRead,
    const chromMap_t& chromToIndex,
    std::vector<SimpleAlignment>& splitAlign)
{
    using namespace ALIGNPATH;

    splitAlign.clear();

    std::vector<std::string> saVec;
    {
        static const char satag[] = {'S','A'};
        const char* saStr(bamRead.get_string_tag(satag));
        if (nullptr == saStr) return;

        split_string(saStr, ';', saVec);
        if ( (! saVec.empty()) && saVec.back().empty())
        {
            saVec.pop_back();
        }
    }

    // Only handle a single split alignment right now.
    // In the future we could sort the SA tags by order on the template, possibly
    // also removing segments that map to two different areas.

    if (saVec.size() > 1) return;

    for (const std::string& sa : saVec)
    {
#ifdef DEBUG_SCANNER
        log_os << __FUNCTION__ << ": SA STRING: " << sa << "\n";
#endif
        std::vector<std::string> saDat;
        split_string(sa, ',', saDat);

        assert((saDat.size() == 6) && "Unexpected number of SA tag values");

        /// filter split reads with low MAPQ:
        const unsigned saMapq(illumina::blt_util::parse_unsigned_str(saDat[4]));
        if (saMapq < opt.minMapq) continue;

        const chromMap_t::const_iterator ci(chromToIndex.find(saDat[0]));
        assert(ci != chromToIndex.end());

        splitAlign.emplace_back();
        SimpleAlignment& sal(splitAlign.back());
        sal.tid=(ci->second); // convert chr to int32_t via new bam header map
        sal.pos = (illumina::blt_util::parse_int_str(saDat[1])-1);
        {
            const char saStrand(saDat[2][0]); // convert to char
            assert((saStrand=='-') || (saStrand=='+'));
            sal.is_fwd_strand = (saStrand == '+');
        }

        cigar_to_apath(saDat[3].c_str(), sal.path);
    }
}
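For reference, a std-only sketch of the per-entry field split that split_string(sa, ',', saDat) performs above, using a hypothetical example value; the six fields are rname, 1-based position, strand, CIGAR, MAPQ, and NM edit distance:

#include <iostream>
#include <sstream>
#include <string>
#include <vector>

int main()
{
    const std::string sa("chr12,20841,+,51M24S,60,2");

    std::vector<std::string> saDat;
    std::istringstream iss(sa);
    std::string field;
    while (std::getline(iss, field, ',')) saDat.push_back(field);

    // saDat[0]=rname, saDat[1]=1-based pos, saDat[2]=strand,
    // saDat[3]=CIGAR, saDat[4]=MAPQ, saDat[5]=NM
    for (const std::string& f : saDat) std::cout << f << '\n';
    return ((saDat.size() == 6) ? 0 : 1);
}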
Example 18
void
SVCandidateSetSequenceFragmentSampleGroup::
add(
    const bam_header_info& bamHeader,
    const bam_record& bamRead,
    const bool isExpectRepeat,
    const bool isSourcedFromGraphEdgeNode1,
    const bool isSubMapped)
{
    using namespace illumina::common;

#ifdef DEBUG_SVDATA
    log_os << "SVDataGroup adding: " << bamRead << "\n";
#endif

    SVCandidateSetSequenceFragment* fragPtr(getSequenceFragment(bamRead.qname()));
    if (nullptr == fragPtr) return;

    SVCandidateSetSequenceFragment& fragment(*fragPtr);

    SVCandidateSetRead* targetReadPtr(nullptr);
    if (2 == bamRead.read_no())
    {
        if (bamRead.isNonStrictSupplement())
        {
            fragment.read2Supplemental.emplace_back();
            targetReadPtr = (&(fragment.read2Supplemental.back()));
        }
        else
        {
            targetReadPtr = (&(fragment.read2));
        }
    }
    else
    {
        if (bamRead.isNonStrictSupplement())
        {
            fragment.read1Supplemental.emplace_back();
            targetReadPtr = (&(fragment.read1Supplemental.back()));
        }
        else
        {
            targetReadPtr = (&(fragment.read1));
        }
    }

    SVCandidateSetRead& targetRead(*targetReadPtr);
    if (targetRead.isSet())
    {
        if (isExpectRepeat) return;

        std::ostringstream oss;
        oss << "Unexpected alignment name collision. Source: '" << dataSourceName << "'\n"
            << "\tExisting read: ";
        summarizeAlignmentRecord(bamHeader, targetRead.bamrec, oss);
        oss << "\n"
            << "\tNew read: ";
        summarizeAlignmentRecord(bamHeader, bamRead, oss);
        oss << "\n";
        BOOST_THROW_EXCEPTION(GeneralException(oss.str()));
    }

    targetRead.bamrec = bamRead;
    targetRead.isSourcedFromGraphEdgeNode1 = isSourcedFromGraphEdgeNode1;
    targetRead.isSubMapped = isSubMapped;
    targetRead.readIndex = (isSubMapped ? _subMappedReadIndex : _mappedReadIndex);
}
Example 19
bool
isMateInsertionEvidenceCandidate(
    const bam_record& bamRead,
    const unsigned minMapq)
{
    if (! bamRead.is_paired()) return false;
    if (bamRead.isNonStrictSupplement()) return false;
    if (bamRead.is_unmapped() || bamRead.is_mate_unmapped()) return false;

    if (bamRead.map_qual() < minMapq) return false;

    if (bamRead.target_id() < 0) return false;
    if (bamRead.mate_target_id() < 0) return false;

    if (bamRead.target_id() != bamRead.mate_target_id()) return true;

    /// TODO: better candidate definition based on fragment size distro:
    static const int minSize(10000);
    return (std::abs(bamRead.pos()-bamRead.mate_pos()) >= minSize);
}
Example 20
std::pair<bool,align_id_t>
starling_read_buffer::
add_read_alignment(const starling_options& opt,
                   const bam_record& br,
                   const alignment& al,
                   const MAPLEVEL::index_t maplev,
                   const READ_ALIGN::index_t rat,
                   const align_id_t contig_id) {

    assert(! br.is_unmapped());

    const bool is_genomic(READ_ALIGN::GENOME == rat);
    align_id_t this_read_id;
    bool is_key_found(false);

    if(opt.is_ignore_read_names) {
        this_read_id=next_id();
        _read_data[this_read_id] = new starling_read(br,is_genomic);
    } else {
        const read_key tmp_key(br);
        const read_key_lup_t::const_iterator i(_read_key.find(tmp_key));
        is_key_found=(i!=_read_key.end());

        if(! is_key_found) {
            this_read_id=next_id();
            _read_data[this_read_id] = new starling_read(br,is_genomic);
        } else {
            this_read_id=i->second;
        }

        starling_read& sread(*(_read_data[this_read_id]));

        if(! is_key_found) {
            _read_key[sread.key()]=this_read_id;
        } else {
            assert(sread.key() == tmp_key);
        }
    }

    starling_read& sread(*(_read_data[this_read_id]));

    if(! is_key_found) {
        sread.id() = this_read_id;

    } else {
        {   // no GROUPER input accepted for reads crossing splice junctions:
            bool is_spliced_contig_read(false);
            if(is_genomic) {
                if((! sread.contig_align().empty()) &&
                   (apath_exon_count(al.path)>1)) is_spliced_contig_read=true;
            } else {
                if(sread.is_segmented()) is_spliced_contig_read=true;
            }

            if(is_spliced_contig_read) {
                log_os << "ERROR: assembled contig realignments are not allowed for splice junction reads. Read: " << sread.key() << "\n";
                exit(EXIT_FAILURE);
            }
        }

        if(! sread.is_compatible_alignment(al,rat,contig_id,opt)) {
            log_os << "WARNING: skipping new alignment: " << al
                   << " which is incompatible with alignments in read: " << sread;
            return std::make_pair(false,0);
        }

        // contig BAM records are incomplete, so we want to fill in
        // the full record if there's a mapped genomic alignment
        // available:
        if(is_genomic) sread.set_genomic_bam_record(br);
    }

    if(is_genomic) {
        sread.set_genome_align(al);
        sread.genome_align_maplev = maplev;

        // deal with segmented reads now:
        if(sread.is_segmented()) {
            const uint8_t n_seg(sread.segment_count());
            for(unsigned i(0); i<n_seg; ++i) {
                const uint8_t seg_no(i+1);
                const pos_t seg_buffer_pos(get_alignment_buffer_pos(sread.get_segment(seg_no).genome_align()));
                sread.get_segment(seg_no).buffer_pos = seg_buffer_pos;
                (_pos_group[seg_buffer_pos]).insert(std::make_pair(this_read_id,seg_no));
            }
        }
    } else {
        // contig alignments:
        sread.contig_align()[contig_id] = al;
        (_contig_group[contig_id]).insert(this_read_id);
    }

    if((! is_key_found) && (! sread.is_segmented())) {
        const pos_t buffer_pos(get_alignment_buffer_pos(al));
        const seg_id_t seg_id(0);
        sread.get_full_segment().buffer_pos = buffer_pos;
        (_pos_group[buffer_pos]).insert(std::make_pair(this_read_id,seg_id));
    }

    return std::make_pair(true,this_read_id);
}
Example 21
/// scan read record (and optionally its mate record) for SV evidence.
///
/// note that estimation is improved by the mate record (because we have the mate cigar string in this case)
///
static
void
getReadBreakendsImpl(
    const ReadScannerOptions& opt,
    const ReadScannerDerivOptions& dopt,
    const SVLocusScanner::CachedReadGroupStats& rstats,
    const bam_record& localRead,
    const bam_record* remoteReadPtr,
    const bam_header_info& bamHeader,
    const reference_contig_segment& localRefSeq,
    const reference_contig_segment* remoteRefSeqPtr,
    std::vector<SVObservation>& candidates,
    known_pos_range2& localEvidenceRange)
{
    using namespace illumina::common;

#ifdef DEBUG_SCANNER
    log_os << __FUNCTION__ << ": Starting read: " << localRead.qname() << "\n";
#endif

    const chromMap_t& chromToIndex(bamHeader.chrom_to_index);

    candidates.clear();

    /// get some basic derived information from the bam_record:
    const SimpleAlignment localAlign(getAlignment(localRead));

    try
    {
        getSingleReadSVCandidates(opt, dopt, localRead, localAlign, chromToIndex,
                                  localRefSeq, candidates);

        // run the same check on the read's mate if we have access to it
        if (nullptr != remoteReadPtr)
        {
            const bam_record& remoteRead(*remoteReadPtr);
            const SimpleAlignment remoteAlign(getAlignment(remoteRead));

            if (nullptr == remoteRefSeqPtr)
            {
                static const char msg[] = "ERROR: remoteRefSeqPtr cannot be null";
                BOOST_THROW_EXCEPTION(LogicException(msg));
            }
            getSingleReadSVCandidates(opt, dopt, remoteRead, remoteAlign,
                                      chromToIndex, (*remoteRefSeqPtr),
                                      candidates);
        }

        // process shadows:
        //getSVCandidatesFromShadow(opt, rstats, localRead, localAlign,remoteReadPtr,candidates);

        // - process anomalous read pairs:
        getSVCandidatesFromPair(opt, dopt, rstats, localRead, localAlign, remoteReadPtr,
                                candidates);
    }
    catch (...)
    {
        std::cerr << "ERROR: Exception caught while processing ";
        if (nullptr == remoteReadPtr)
        {
            std::cerr << "single read record:\n"
                      << '\t' << localRead << "\n";
        }
        else
        {
            std::cerr << " read pair records:\n"
                      << '\t'  << localRead << "\n"
                      << '\t' << (*remoteReadPtr) << "\n";
        }
        throw;
    }

#ifdef DEBUG_SCANNER
    log_os << __FUNCTION__ << ": post-pair candidate_size: " << candidates.size() << "\n";
#endif

    // update localEvidence range:
    // note this is only used if candidates were added, so there's no harm in setting it every time:
    const unsigned localRefLength(apath_ref_length(localAlign.path));
    const pos_t startRefPos(localRead.pos()-1);
    const pos_t endRefPos(startRefPos+localRefLength);

    localEvidenceRange.set_range(startRefPos,endRefPos);

    const int maxTid(chromToIndex.size());

    /// final chance to QC candidate set:
    ///
    for (const SVCandidate& sv : candidates)
    {
        bool isInvalidTid(false);
        if ((sv.bp1.interval.tid < 0) || (sv.bp1.interval.tid >= maxTid))
        {
            isInvalidTid=true;
        }
        else if (sv.bp2.state != SVBreakendState::UNKNOWN)
        {
            if ((sv.bp2.interval.tid < 0) || (sv.bp2.interval.tid >= maxTid))
            {
                isInvalidTid=true;
            }
        }

        bool isInvalidPos(false);
        if (! isInvalidTid)
        {
            // note the 'off-chromosome edge' test below checks for cases which are obviously way off
            // the edge, but allows for some over-the-edge mistakes to occur on circular chromosomes
            //
            static const int offEdgePad(500);
            const pos_t tid1Length(bamHeader.chrom_data[sv.bp1.interval.tid].length);
            if ((sv.bp1.interval.range.end_pos() <= -offEdgePad) || (sv.bp1.interval.range.begin_pos() >= (tid1Length+offEdgePad)))
            {
                isInvalidPos=true;
            }
            else if (sv.bp2.state != SVBreakendState::UNKNOWN)
            {
                const pos_t tid2Length(bamHeader.chrom_data[sv.bp2.interval.tid].length);
                if ((sv.bp2.interval.range.end_pos() <= -offEdgePad) || (sv.bp2.interval.range.begin_pos() >= (tid2Length+offEdgePad)))
                {
                    isInvalidPos=true;
                }
            }
        }

        if (isInvalidTid || isInvalidPos)
        {
            std::ostringstream oss;
            if (isInvalidTid)
            {
                oss << "SVbreakend has unknown or invalid chromosome id in candidate sv.\n";
            }
            else
            {
                oss << "Cannot interpret BAM record: candidate SV breakend from BAM record is off chromosome edge.\n";
            }

            oss << "\tlocal_bam_record: " <<  localRead << "\n"
                << "\tremote_bam record: ";
            if (NULL==remoteReadPtr)
            {
                oss << "NONE";
            }
            else
            {
                oss << (*remoteReadPtr);
            }
            oss << "\n"
                << "\tSVCandidate: " << sv << "\n";
            BOOST_THROW_EXCEPTION(LogicException(oss.str()));
        }
    }
}
Example 22
/// Create an SVLocus for each potential SV event supported by the BAM record
///
/// the locus count should almost always be one (or, depending on input filtration, zero).
/// multiple suggested loci from one read are more of a theoretical possibility than an
/// expectation.
///
static
void
getSVLociImpl(
    const ReadScannerOptions& opt,
    const ReadScannerDerivOptions& dopt,
    const SVLocusScanner::CachedReadGroupStats& rstats,
    const bam_record& bamRead,
    const bam_header_info& bamHeader,
    const reference_contig_segment& refSeq,
    std::vector<SVLocus>& loci,
    SampleEvidenceCounts& eCounts)
{
    using namespace illumina::common;

    loci.clear();
    std::vector<SVObservation> candidates;
    known_pos_range2 localEvidenceRange;

    getReadBreakendsImpl(opt, dopt, rstats, bamRead, nullptr, bamHeader,
                         refSeq, nullptr, candidates, localEvidenceRange);

#ifdef DEBUG_SCANNER
    log_os << __FUNCTION__ << ": candidate_size: " << candidates.size() << "\n";
#endif

    // translate SVCandidate to a simpler form for use
    // in the SV locus graph:
    for (const SVCandidate& cand : candidates)
    {
        const bool isCandComplex(isComplexSV(cand));

        const SVBreakend& localBreakend(cand.bp1);
        const SVBreakend& remoteBreakend(cand.bp2);

        if ((0==localBreakend.interval.range.size()) ||
            ((! isCandComplex) && (0==remoteBreakend.interval.range.size())))
        {
            std::ostringstream oss;
            oss << "Unexpected breakend pattern proposed from bam record.\n"
                << "\tlocal_breakend: " << localBreakend << "\n"
                << "\tremote_breakend: " << remoteBreakend << "\n"
                << "\tbam_record: " << bamRead << "\n";
            BOOST_THROW_EXCEPTION(LogicException(oss.str()));
        }

        // update evidence stats:
        for (int i(0); i< SVEvidenceType::SIZE; ++i)
        {
            eCounts.eType[i] += localBreakend.lowresEvidence.getVal(i);
        }

        // determine the evidence weight of this candidate:
        unsigned localEvidenceWeight(0);
        unsigned remoteEvidenceWeight(0);

        if (localBreakend.getAnyNonPairCount() != 0)
        {
            localEvidenceWeight = SVObservationWeights::internalReadEvent;
            if (remoteBreakend.getAnyNonPairCount() != 0)
            {
                remoteEvidenceWeight = SVObservationWeights::internalReadEvent;
            }
        }
        else if (localBreakend.getLocalPairCount() != 0)
        {
            bool isClose(false);
            if (is_innie_pair(bamRead))
            {
                isClose = (std::abs(bamRead.template_size()) < rstats.minDistantFragmentSize);
            }

            unsigned thisWeight(SVObservationWeights::readPair);
            if (isClose)
            {
                thisWeight = SVObservationWeights::closeReadPair;
                eCounts.closeCount += 1;
            }

            localEvidenceWeight = thisWeight;
            if (remoteBreakend.getLocalPairCount() != 0)
            {
                remoteEvidenceWeight = thisWeight;
            }
        }

        // finally, create the graph locus:
        SVLocus locus;
        // set local breakend estimate:
        const NodeIndexType localBreakendNode(locus.addNode(localBreakend.interval));
        locus.setNodeEvidence(localBreakendNode,localEvidenceRange);

        if (isCandComplex)
        {
            locus.linkNodes(localBreakendNode,localBreakendNode,localEvidenceWeight);
        }
        else
        {
            // set remote breakend estimate:
            const NodeIndexType remoteBreakendNode(locus.addNode(remoteBreakend.interval));
            locus.linkNodes(localBreakendNode,remoteBreakendNode,localEvidenceWeight,remoteEvidenceWeight);

            locus.mergeSelfOverlap();
        }

#ifdef DEBUG_SCANNER
        log_os << __FUNCTION__ << ": adding Locus: " << locus << "\n";
#endif
        loci.push_back(locus);
    }
}