Пример #1
0
// Build a haplotype candidate from the read bases in a fixed-size window
// centred on a pileup base.
//
// The window spans FLANK_SIZE positions on either side of offset. Each
// window slot of _bq is set to 0 when it falls outside the read or when
// the read base is 'N'; otherwise it holds the base quality shifted left
// by QUAL_SHIFT, OR'd with the value of base_to_id() for that base.
// _total_qual accumulates the raw quality of every window position that
// lies inside the read, 'N' bases included.
//
// assumes _bq has room for 2*FLANK_SIZE+1 entries -- TODO confirm
//
hap_cand::
hap_cand(const bam_seq_base& read_seq,
         const uint8_t* init_qual,
         const int offset)  // the offset into read of the pileup base
    : _total_qual(0)
{
    const int read_len(read_seq.size());
    assert((offset>=0) && (offset<read_len));

    // unclipped window is [win_begin,win_end):
    const int win_begin(offset-FLANK_SIZE);
    const int win_end(offset+FLANK_SIZE+1);

    // part of the window which overlaps the read:
    const int clip_begin(std::max(win_begin,0));
    const int clip_end(std::min(win_end,read_len));

    // number of window slots hanging off each end of the read:
    const int n_lead_pad(clip_begin-win_begin);
    const int n_trail_pad(win_end-clip_end);

    // slots before the start of the read:
    for (int slot(0); slot<n_lead_pad; ++slot) { _bq[slot] = 0; }

    // slots backed by real read positions:
    for (int read_pos(clip_begin); read_pos<clip_end; ++read_pos) {
        const int slot(n_lead_pad+(read_pos-clip_begin));
        _total_qual += init_qual[read_pos];

        const char base(read_seq.get_char(read_pos));
        if (base=='N') {
            _bq[slot] = 0;
        } else {
            _bq[slot] = (init_qual[read_pos]<<QUAL_SHIFT | base_to_id(base));
        }
    }

    // slots past the end of the read:
    const int trail_begin(n_lead_pad+(clip_end-clip_begin));
    for (int k(0); k<n_trail_pad; ++k) { _bq[trail_begin+k] = 0; }
}
// score a contiguous matching alignment segment
//
// note that running the lnp value through as a reference creates more
// floating point stability for ambiguous alignments which have the
// same score by definition.
//
static
void
score_segment(const starling_options& /*opt*/,
              const unsigned seg_length,
              const bam_seq_base& seq,
              const uint8_t* qual,
              const unsigned read_offset,
              const bam_seq_base& ref,
              const pos_t ref_head_pos,
              double& lnp) {

    static const double lnthird(-std::log(3.));

    for (unsigned i(0); i<seg_length; ++i) {
        const pos_t readi(static_cast<pos_t>(read_offset+i));
        const uint8_t sbase(seq.get_code(readi));
        if (sbase == BAM_BASE::ANY) continue;
        const uint8_t qscore(qual[readi]);
        bool is_ref(sbase == BAM_BASE::REF);
        if (! is_ref) {
            const pos_t refi(ref_head_pos+static_cast<pos_t>(i));
            is_ref=(sbase == ref.get_code(refi));
        }
        lnp += ( is_ref ?
                 qphred_to_ln_comp_error_prob(qscore) :
                 qphred_to_ln_error_prob(qscore)+lnthird );
    }
}
// Record a plain insertion or deletion path segment as an indel
// observation.
//
// A segment no longer than max_indel_size is reported once as an
// INDEL::INSERT or INDEL::DELETE. A longer segment is reported as two
// breakpoint observations instead: INDEL::BP_LEFT followed by
// INDEL::BP_RIGHT, each carrying at most max_indel_size read bases.
//
static
void
process_simple_indel(const unsigned max_indel_size,
                     const ALIGNPATH::path_t& path,
                     const bam_seq_base& bseq,
                     starling_pos_processor_base& sppr,
                     indel_observation& obs,
                     const unsigned sample_no,
                     const unsigned path_index,
                     const unsigned read_offset,
                     const pos_t ref_head_pos)
{
    using namespace ALIGNPATH;

    const path_segment& seg(path[path_index]);
    const bool is_insert(seg.type == INSERT);

    // an insertion longer than max_cand_filter_insert_size is never left
    // flagged as noise:
    if(obs.data.is_noise && is_insert &&
       (seg.length > max_cand_filter_insert_size)) {
        obs.data.is_noise=false;
    }

    if(seg.length <= max_indel_size) {
        // short enough to report as a regular indel:
        obs.key.pos=ref_head_pos;
        obs.key.length = seg.length;
        if(is_insert) {
            obs.key.type=INDEL::INSERT;
            bam_seq_to_str(bseq,read_offset,read_offset+seg.length,obs.data.insert_seq);
        } else {
            obs.key.type=INDEL::DELETE;
        }
        finish_indel_sppr(obs,sppr,sample_no);
        return;
    }

    // too long for a regular indel -- report both breakpoints.

    // left side BP: read bases starting at read_offset, capped at
    // max_indel_size and at the end of the read
    obs.key.pos=ref_head_pos;
    obs.key.length=seg.length;
    obs.key.type=INDEL::BP_LEFT;
    {
        const unsigned bases_remaining(bseq.size()-read_offset);
        const unsigned left_end(read_offset+std::min(bases_remaining,max_indel_size));
        bam_seq_to_str(bseq,read_offset,left_end,obs.data.insert_seq);
    }
    finish_indel_sppr(obs,sppr,sample_no);

    // right side BP: read bases ending where the read resumes after the
    // indel, capped at max_indel_size and at the start of the read
    obs.key.pos=ref_head_pos;
    if(seg.type == DELETE) obs.key.pos+=seg.length;
    obs.key.length=seg.length;
    obs.key.type=INDEL::BP_RIGHT;
    {
        const unsigned right_end(read_offset+(is_insert ? seg.length : 0));
        const unsigned right_begin(right_end-std::min(right_end,max_indel_size));
        bam_seq_to_str(bseq,right_begin,right_end,obs.data.insert_seq);
    }
    finish_indel_sppr(obs,sppr,sample_no);
}
// Record a swap (combined insertion/deletion) starting at path_index as
// an indel observation.
//
// Unlike the other indel processors, this one returns the number of path
// segments it consumed (swap_info::n_seg).
//
// The swap size is the longer of its insert and delete lengths. When
// that size exceeds max_indel_size the swap is reported as a
// BP_LEFT/BP_RIGHT breakpoint pair rather than as an INDEL::SWAP.
//
static
unsigned
process_swap(const unsigned max_indel_size,
             const ALIGNPATH::path_t& path,
             const bam_seq_base& bseq,
             starling_pos_processor_base& sppr,
             indel_observation& obs,
             const unsigned sample_no,
             const unsigned path_index,
             const unsigned read_offset,
             const pos_t ref_head_pos)
{
    using namespace ALIGNPATH;

    const swap_info sinfo(path,path_index);
    const unsigned swap_size(std::max(sinfo.insert_length,sinfo.delete_length));

    // an inserted part longer than max_cand_filter_insert_size is never
    // left flagged as noise:
    if(obs.data.is_noise &&
       (sinfo.insert_length > max_cand_filter_insert_size)) {
        obs.data.is_noise=false;
    }

    if(swap_size <= max_indel_size) {
        // short enough to report as a regular swap:
        obs.key.pos=ref_head_pos;
        obs.key.length=sinfo.insert_length;
        obs.key.swap_dlength=sinfo.delete_length;
        obs.key.type = INDEL::SWAP;
        bam_seq_to_str(bseq,read_offset,read_offset+sinfo.insert_length,obs.data.insert_seq);
        finish_indel_sppr(obs,sppr,sample_no);
        return sinfo.n_seg;
    }

    // too long for a regular swap -- report both breakpoints.

    // left side BP: read bases starting at read_offset, capped at
    // max_indel_size and at the end of the read
    obs.key.pos=ref_head_pos;
    obs.key.length=swap_size;
    obs.key.type=INDEL::BP_LEFT;
    {
        const unsigned bases_remaining(bseq.size()-read_offset);
        const unsigned left_end(read_offset+std::min(bases_remaining,max_indel_size));
        bam_seq_to_str(bseq,read_offset,left_end,obs.data.insert_seq);
    }
    finish_indel_sppr(obs,sppr,sample_no);

    // right side BP: read bases ending just past the inserted sequence,
    // capped at max_indel_size and at the start of the read
    obs.key.pos=ref_head_pos+sinfo.delete_length;
    obs.key.length=swap_size;
    obs.key.type=INDEL::BP_RIGHT;
    {
        const unsigned right_end(read_offset+sinfo.insert_length);
        const unsigned right_begin(right_end-std::min(right_end,max_indel_size));
        bam_seq_to_str(bseq,right_begin,right_end,obs.data.insert_seq);
    }
    finish_indel_sppr(obs,sppr,sample_no);

    return sinfo.n_seg;
}
// Copy the characters of bs at positions [start,end) into s, replacing
// whatever s held before. s is left empty when start >= end.
//
static
void
bam_seq_to_str(const bam_seq_base& bs,
               const unsigned start,
               const unsigned end,
               std::string& s) {
    s.clear();

    unsigned pos(start);
    while(pos<end) {
        s.push_back(bs.get_char(pos));
        ++pos;
    }
}
// Walk the segments of an alignment path and hand every indel found
// there to the starling_pos_processor indel buffer.
//
// Original author's note: assumes that path is already validated for
// seq. The path is nevertheless checked on entry, and blt_exception is
// thrown if is_apath_invalid() or is_apath_starling_invalid() reports a
// problem.
//
// Segments outside the range returned by get_match_edge_segments() are
// treated as edge segments and go to the edge insert/delete handlers;
// interior segments go to process_swap() or process_simple_indel().
//
void
add_alignment_indels_to_sppr(const unsigned max_indel_size,
                             const reference_contig_segment& ref,
                             const alignment& al,
                             const bam_seq_base& read_seq,
                             starling_pos_processor_base& sppr,
                             const INDEL_ALIGN_TYPE::index_t iat,
                             const align_id_t id,
                             const unsigned sample_no,
                             const std::pair<bool,bool>& edge_pin,
                             const indel_set_t* edge_indel_ptr) {

    using namespace ALIGNPATH;

    const unsigned seq_len(read_seq.size());

    // refuse paths which are inconsistent with the read length:
    if(is_apath_invalid(al.path,seq_len)) {
        std::ostringstream oss;
        oss << "ERROR: Can't handle alignment path '"
            << apath_to_cigar(al.path)
            << "' -- "
            << get_apath_invalid_reason(al.path,seq_len)
            << "\n";
        throw blt_exception(oss.str().c_str());
    }

    // refuse paths which fail the starling-specific check:
    if(is_apath_starling_invalid(al.path)) {
        std::ostringstream oss;
        oss << "ERROR: can't handle alignment path '"
            << apath_to_cigar(al.path)
            << "'\n";
        throw blt_exception(oss.str().c_str());
    }

    const rc_segment_bam_seq ref_bseq(ref);

    // indices of the first and last match segments, as reported by
    // get_match_edge_segments():
    const std::pair<unsigned,unsigned> ends(get_match_edge_segments(al.path));

    // read range used below to decide whether an indel is flagged as noise:
    pos_range valid_pr;
    get_valid_alignment_range(al,ref_bseq,read_seq,valid_pr);

    // cursor over the path; advanced only by increment_path():
    unsigned path_index(0);
    unsigned read_offset(0);
    pos_t ref_head_pos(al.pos);

    const unsigned n_path_seg(al.path.size());
    while(path_index<n_path_seg) {
        const path_segment& ps(al.path[path_index]);

        const bool is_begin_edge(path_index<ends.first);
        const bool is_end_edge(path_index>ends.second);
        const bool is_edge_segment(is_begin_edge || is_end_edge);

        const bool is_swap_start(is_segment_swap_start(al.path,path_index));

        assert(ps.type != SKIP);
        assert(! (is_edge_segment && is_swap_start));

        indel_observation obs;
        obs.data.iat = iat;
        obs.data.id = id;

        if(ps.type != MATCH) {
            // number of read bases covered by this indel (0 unless it
            // is a swap or a segment type with read length):
            unsigned indel_read_len(0);
            if(is_swap_start) {
                indel_read_len=swap_info(al.path,path_index).insert_length;
            } else if(is_segment_type_read_length(ps.type)) {
                indel_read_len=ps.length;
            }

            // read range of the indel: begins one position before
            // read_offset (clamped at 0) and ends at
            // read_offset+1+indel_read_len (clamped at seq_len)
            pos_range indel_read_pr;
            indel_read_pr.set_begin_pos((read_offset==0) ? 0 : (read_offset-1));
            indel_read_pr.set_end_pos(std::min(seq_len,read_offset+1+indel_read_len));

            // an indel not fully inside the valid range is flagged as noise:
            if(! valid_pr.is_superset_of(indel_read_pr)) obs.data.is_noise=true;
        }

        unsigned n_seg(1); // number of path segments consumed

        if(is_edge_segment) {
            // is this indel occurring on a pinned edge (ie against an exon?)
            const bool is_pinned_indel((is_begin_edge && edge_pin.first) ||
                                       (is_end_edge && edge_pin.second));

            // edge inserts are allowed for intron adjacent and grouper
            // reads, edge deletions for intron adjacent only:
            if(ps.type == INSERT) {
                process_edge_insert(max_indel_size,al.path,read_seq,
                                    sppr,obs,sample_no,edge_indel_ptr,
                                    seq_len,ends,path_index,read_offset,ref_head_pos,
                                    is_pinned_indel);
            } else if((ps.type == DELETE) && is_pinned_indel) {
                process_edge_delete(max_indel_size,al.path,read_seq,
                                    sppr,obs,sample_no,
                                    path_index,read_offset,ref_head_pos,
                                    is_pinned_indel);
            }
        } else if(is_swap_start) {
            // a swap reports how many segments it consumed:
            n_seg = process_swap(max_indel_size,al.path,read_seq,
                                 sppr,obs,sample_no,
                                 path_index,read_offset,ref_head_pos);
        } else if(is_segment_type_indel(ps.type)) {
            process_simple_indel(max_indel_size,al.path,read_seq,
                                 sppr,obs,sample_no,
                                 path_index,read_offset,ref_head_pos);
        }

        // step the cursor past every segment handled in this iteration:
        for(unsigned consumed(0); consumed<n_seg; ++consumed) {
            increment_path(al.path,path_index,read_offset,ref_head_pos);
        }
    }
}