Пример #1
0
    /**
     * Paired-end orientation filter.
     *
     * A read counts as "reversed" when at least one mapping in its path has
     * is_reverse() set. The pair is discordant when exactly one of the two
     * reads is reversed.
     *
     * Default behavior: a discordant pair yields a pair of empty Alignments,
     * a concordant pair is returned unchanged.
     * Inverse behavior: the two outcomes are swapped.
     */
    pair<Alignment, Alignment> Filter::orientation_filter(Alignment& aln_first, Alignment& aln_second){
        // True when any mapping of the path lies on the reverse strand.
        auto touches_reverse = [](const Path& p){
            bool seen = false;
            for (int k = 0; k < p.mapping_size(); ++k){
                if (p.mapping(k).position().is_reverse()){
                    seen = true;
                }
            }
            return seen;
        };

        const bool first_reversed = touches_reverse(aln_first.path());
        const bool second_reversed = touches_reverse(aln_second.path());
        const bool discordant = (first_reversed != second_reversed);

        if (discordant){
            if (inverse){
                return std::make_pair(aln_first, aln_second);
            }
            return std::make_pair(Alignment(), Alignment());
        }
        if (inverse){
            return std::make_pair(Alignment(), Alignment());
        }
        return std::make_pair(aln_first, aln_second);
    }
Пример #2
0
    /**
     * Soft-clip filter: looks at the overhangs of a read, i.e. the amount by
     * which the first edit of the first mapping and the last edit of the last
     * mapping consume more read bases than reference bases
     * (to_length - from_length).
     *
     * With at least one mapping:
     *   - an overhang larger than soft_clip_limit returns
     *     `inverse ? Alignment() : aln`;
     *   - otherwise returns `inverse ? aln : Alignment()`.
     * Without mappings the result is always `inverse ? Alignment() : aln`;
     * a warning is written to stderr when the sequence is not longer than
     * soft_clip_limit.
     *
     * A terminal mapping that carries no edits is treated as having an
     * overhang of 0 instead of indexing into an empty edit list.
     */
    Alignment Filter::soft_clip_filter(Alignment& aln){
        const Path& path = aln.path();

        if (path.mapping_size() > 0){
            const Mapping& first_mapping = path.mapping(0);
            const Mapping& last_mapping = path.mapping(path.mapping_size() - 1);

            // Guard against mappings without edits: edit(0) on an empty
            // repeated field is out of range.
            int left_overhang = 0;
            if (first_mapping.edit_size() > 0){
                const Edit& left_edit = first_mapping.edit(0);
                left_overhang = left_edit.to_length() - left_edit.from_length();
            }

            int right_overhang = 0;
            if (last_mapping.edit_size() > 0){
                const Edit& right_edit = last_mapping.edit(last_mapping.edit_size() - 1);
                right_overhang = right_edit.to_length() - right_edit.from_length();
            }

            if (left_overhang > soft_clip_limit || right_overhang > soft_clip_limit){
                return inverse ? Alignment() : aln;
            }
            return inverse ? aln : Alignment();
        }

        // No mappings at all: the whole read is unaligned.
        if (aln.sequence().length() > soft_clip_limit){
            return inverse ? Alignment() : aln;
        }
        cerr << "WARNING: SHORT ALIGNMENT: " << aln.sequence().size() << "bp" << endl
            << "WITH NO MAPPINGS TO REFERENCE" << endl
            << "CONSIDER REMOVING IT FROM ANALYSIS" << endl;
        return inverse ? Alignment() : aln;
    }
Пример #3
0
// Produce a copy of `aln` whose edits have been passed through mutate_edit()
// with the given error rates. When both rates are zero the input is returned
// unchanged. The result keeps the original name; its sequence is re-derived
// from the mutated path via alignment_seq().
Alignment Sampler::mutate(const Alignment& aln,
                          double base_error,
                          double indel_error) {

    const bool error_free = (base_error == 0 && indel_error == 0);
    if (error_free) {
        return aln;
    }

    // alphabet and distributions handed to mutate_edit()
    uniform_int_distribution<int> rbase(0, 3);
    uniform_real_distribution<double> rprob(0, 1);
    string bases = "ATGC";

    Alignment mutaln;
    for (const auto& src_mapping : aln.path().mapping()) {
        Mapping* dst_mapping = mutaln.mutable_path()->add_mapping();
        *dst_mapping->mutable_position() = src_mapping.position();
        // every source edit may expand into several mutated edits
        for (const auto& src_edit : src_mapping.edit()) {
            auto replacements = mutate_edit(src_edit, make_pos_t(src_mapping.position()),
                                            base_error, indel_error,
                                            bases, rprob, rbase);
            for (auto& replacement : replacements) {
                *dst_mapping->add_edit() = replacement;
            }
        }
    }

    // simplify, then rebuild the read sequence from the new path
    mutaln = simplify(mutaln);
    mutaln.set_sequence(alignment_seq(mutaln));
    mutaln.set_name(aln.name());
    return mutaln;
}
Пример #4
0
// Feed every edit of `alignment` into the pileup of the node it is mapped to.
//
// If the first mapping is on the reverse strand, `alignment` is replaced
// in place by its reverse (node lengths are looked up in `graph`), so the
// caller's object is modified. Mappings whose node is not present in `graph`
// are skipped.
void Pileups::compute_from_alignment(VG& graph, Alignment& alignment) {
    // if we start reversed
    // (has_path() alone does not guarantee a first mapping, so check the
    // mapping count before touching mapping(0))
    if (alignment.has_path() &&
        alignment.path().mapping_size() > 0 &&
        alignment.path().mapping(0).position().is_reverse()) {
        alignment = reverse_alignment(alignment,
                                      (function<int64_t(int64_t)>) ([&graph](int64_t id) {
                                          return graph.get_node(id)->sequence().size();
                                          }));
    }
    const Path& path = alignment.path();
    int64_t read_offset = 0;
    for (int i = 0; i < path.mapping_size(); ++i) {
        const Mapping& mapping = path.mapping(i);
        if (graph.has_node(mapping.position().node_id())) {
            const Node* node = graph.get_node(mapping.position().node_id());
            NodePileup* pileup = get_create(node->id());
            int64_t node_offset = mapping.position().offset();
            for (int j = 0; j < mapping.edit_size(); ++j) {
                const Edit& edit = mapping.edit(j);
                // process all pileups in edit.
                // update the offsets as we go
                // (presumably compute_from_edit advances node_offset and
                // read_offset by reference -- confirm against its signature)
                compute_from_edit(*pileup, node_offset, read_offset, *node,
                                  alignment, mapping, edit);
            }
        }
    }
    // sanity check: a non-empty read with mappings must be fully consumed
    assert(alignment.sequence().empty() ||
           alignment.path().mapping_size() == 0 ||
           read_offset == alignment.sequence().length());
}
Пример #5
0
 /**
  * Paired interchromosomal filter: keeps the pair only when the two reads'
  * paths carry different names; otherwise returns a pair of empty Alignments.
  *
  * NOTE(review): unlike the other filters this one does not consult
  * `inverse` -- confirm whether that is intentional.
  */
 pair<Alignment, Alignment> Filter::interchromosomal_filter(Alignment& aln_first, Alignment& aln_second){
     const bool same_path_name = (aln_first.path().name() == aln_second.path().name());
     if (same_path_name){
         return std::make_pair(Alignment(), Alignment());
     }
     return std::make_pair(aln_first, aln_second);
 }
Пример #6
0
// Concatenate two alignments end-to-end: the result's sequence is
// a1.sequence() followed by a2.sequence(), and its path is
// concat_paths(a1.path(), a2.path()). With `debug` set, inputs and output
// are dumped to stderr as JSON.
Alignment merge_alignments(const Alignment& a1, const Alignment& a2, bool debug) {
    if (debug) {
        cerr << "merging alignments " << endl;
        cerr << pb2json(a1) << endl;
        cerr << pb2json(a2) << endl;
    }

    Alignment merged;
    merged.set_sequence(a1.sequence() + a2.sequence());
    *merged.mutable_path() = concat_paths(a1.path(), a2.path());

    if (debug) {
        cerr << "merged alignments, result is " << endl;
        cerr << pb2json(merged) << endl;
    }
    return merged;
}
Пример #7
0
// Reconstruct the read sequence implied by an alignment's path.
// For every mapping with a non-zero node id, the neighborhood of that node
// (second argument 2) is pulled from the xg index into a scratch graph; the
// path string is then read from a VG built on that graph.
string Sampler::alignment_seq(const Alignment& aln) {
    Graph sub;
    for (const auto& m : aln.path().mapping()) {
        if (!m.has_position() || !m.position().node_id()) {
            continue;
        }
        xgidx->neighborhood(m.position().node_id(), 2, sub);
    }
    VG g;
    g.extend(sub);
    return g.path_string(aln.path());
}
Пример #8
0
    /**
     * Percent-identity filter.
     *
     * Identity is the number of read bases covered by exact-match edits
     * (from_length == to_length and no replacement sequence) divided by the
     * length of the read sequence.
     *
     * Default behavior: a read below min_percent_identity yields an empty
     * Alignment, any other read is returned unchanged.
     * Inverse behavior: the two outcomes are swapped.
     */
    Alignment Filter::percent_identity_filter(Alignment& aln){
        //TODO handle reversing mappings
        const Path& path = aln.path();
        const int64_t total_bases = aln.sequence().size();

        int64_t matched_bases = 0;
        for (int m = 0; m < path.mapping_size(); ++m){
            const Mapping& mapping = path.mapping(m);
            for (int e = 0; e < mapping.edit_size(); ++e){
                const Edit& edit = mapping.edit(e);
                const bool exact_match =
                    edit.from_length() == edit.to_length() && edit.sequence() == "";
                if (exact_match){
                    matched_bases += edit.to_length();
                }
            }
        }

        const double read_pctid = (double) matched_bases / (double) total_bases;
        if (read_pctid < min_percent_identity){
            return inverse ? aln : Alignment();
        }
        return inverse ? Alignment() : aln;
    }
Пример #9
0
int alignment_from_length(const Alignment& a) {
    int l = 0;
    for (const auto& m : a.path().mapping()) {
        l += from_length(m);
    }
    return l;
}
Пример #10
0
// Sample an alignment whose read sequence is `length` bases long after
// errors have been introduced.
//
// With a non-zero error rate, a longer-than-needed perfect alignment is
// sampled, mutated, and trimmed from the end; this is retried up to 100
// times. If no attempt reaches `length`, a warning is printed and the last
// (too short) attempt is returned. With zero error rates a perfect alignment
// of `length` is sampled directly. The identity field is set from the final
// path in every case.
Alignment Sampler::alignment_with_error(size_t length,
                                        double base_error,
                                        double indel_error) {
    const size_t maxiter = 100;
    Alignment aln;
    if (base_error > 0 || indel_error > 0) {
        // Track success explicitly: inferring it from the loop counter
        // misreports both the "succeeded on the last try" and the
        // "ran out of tries" cases.
        bool long_enough = false;
        for (size_t iter = 0; iter < maxiter && !long_enough; ++iter) {
            // sample a longer-than necessary alignment, then trim
            aln = mutate(
                alignment(length + 2 * ((double) length * indel_error)),
                base_error, indel_error);
            if (aln.sequence().size() == length) {
                long_enough = true;
            } else if (aln.sequence().size() > length) {
                aln = strip_from_end(aln, aln.sequence().size() - length);
                long_enough = true;
            }
        }
        if (!long_enough) {
            cerr << "[vg::Sampler] Warning: could not generate alignment of sufficient length. "
                 << "Graph may be too small, or indel rate too high." << endl;
        }
    } else {
        aln = alignment(length);
    }
    aln.set_identity(identity(aln.path()));
    return aln;
}
Пример #11
0
// Render one alignment as a single tab-separated SAM text line (terminated
// by '\n'). Empty name / reference / sequence / quality fields are written
// as "*"; refpos and matepos are written incremented by one; the mate
// reference is written as "=" when it equals refseq. The CIGAR is only
// emitted when the alignment has a path with at least one mapping.
string alignment_to_sam(const Alignment& alignment,
                        const string& refseq,
                        const int32_t refpos,
                        const string& cigar,
                        const string& mateseq,
                        const int32_t matepos,
                        const int32_t tlen) {
    const bool has_mappings = alignment.has_path() && alignment.path().mapping_size();

    stringstream sam;
    sam << (alignment.name().empty() ? "*" : alignment.name()) << "\t";
    sam << sam_flag(alignment) << "\t";
    sam << (refseq.empty() ? "*" : refseq) << "\t";
    sam << refpos + 1 << "\t";
    sam << alignment.mapping_quality() << "\t";
    sam << (has_mappings ? cigar : "*") << "\t";
    sam << (mateseq == refseq ? "=" : mateseq) << "\t";
    sam << matepos + 1 << "\t";
    sam << tlen << "\t";
    sam << (alignment.sequence().empty() ? "*" : alignment.sequence()) << "\t";

    // qualities are stored as raw values and converted to characters here
    const string& quality = alignment.quality();
    if (quality.empty()) {
        sam << "*";
    } else {
        for (char q : quality) {
            sam << quality_short_to_char(q);
        }
    }

    // optional read-group tag
    if (!alignment.read_group().empty()) {
        sam << "\tRG:Z:" << alignment.read_group();
    }
    sam << "\n";
    return sam.str();
}
Пример #12
0
 /**
  * Single-record interchromosomal filter: compares the path name of `aln`
  * with the path name of its fragment_prev.
  *
  * Different names: returns `inverse ? Alignment() : aln`.
  * Same name:       returns `inverse ? aln : Alignment()`.
  */
 Alignment Filter::interchromosomal_filter(Alignment& aln){
     const bool same_path_name =
         (aln.path().name() == aln.fragment_prev().path().name());
     if (same_path_name){
         return inverse ? aln : Alignment();
     }
     return inverse ? Alignment() : aln;
 }
Пример #13
0
 /**
  * Insert-size filter for a read pair: returns the pair when the approximate
  * path distance exceeds my_max_distance, otherwise a pair of empty
  * Alignments.
  */
 pair<Alignment, Alignment> Filter::insert_size_filter(Alignment& aln_first, Alignment& aln_second){
     // TODO: get positions from aln_first and aln_second
     // (the two position arguments are currently hard-coded to 1)
     const int distance = my_xg_index->approx_path_distance(aln_first.path().name(), 1, 1);
     if (!(distance > my_max_distance)){
         return std::make_pair(Alignment(), Alignment());
     }
     return std::make_pair(aln_first, aln_second);
 }
Пример #14
0
// Act as if the path this alignment is against were the reference and
// generate an equivalent CIGAR string.
// Returns an empty string when the alignment has no path; otherwise the
// CIGAR operations of every mapping are accumulated via mapping_cigar() and
// rendered with cigar_string().
string cigar_against_path(const Alignment& alignment) {
    if (!alignment.has_path()) return "";
    vector<pair<int, char> > cigar;
    for (const auto& mapping : alignment.path().mapping()) {
        mapping_cigar(mapping, cigar);
    }
    return cigar_string(cigar);
}
Пример #15
0
 /* PE functions using fragment_prev and fragment_next */

 /**
  * One-end-anchored filter.
  *
  * When `aln` has a named fragment_prev and either `aln`'s path or the
  * fragment_prev's path has an empty name, returns
  * `inverse ? Alignment() : aln`. In every other case returns
  * `inverse ? aln : Alignment()`.
  */
 Alignment Filter::one_end_anchored_filter(Alignment& aln){
     if (aln.fragment_prev().name() != ""){
         if (aln.path().name() == "" || aln.fragment_prev().path().name() == ""){
             // The result must actually be returned; a bare expression
             // statement here lets control fall off the end of the function.
             return inverse ? Alignment() : aln;
         }
         else{
             return inverse ? aln : Alignment();
         }
     }
     else{
         return inverse ? aln : Alignment();
     }
 }
Пример #16
0
// Encode an alignment as an integer vector over the xg entity space
// (node_count + edge_count slots, indexed by entity rank - 1).
//
// For every mapping that has a position:
//   - the forward edge from the previous mapping's node, if the index has
//     one, is set to 1 when paths_of_entity() is non-empty and 2 otherwise;
//   - the node itself is set to 2 when paths_of_node() is non-empty and
//     1 otherwise.
// NOTE(review): the node and edge codes are inverted relative to each
// other -- confirm this is intended.
vector<int> Vectorizer::alignment_to_a_hot(Alignment a){
    const int64_t entity_size = my_xg->node_count + my_xg->edge_count;
    vector<int> encoded(entity_size, 0);

    const Path& path = a.path();
    for (int idx = 0; idx < path.mapping_size(); ++idx){
        const Mapping& current = path.mapping(idx);
        if (!current.has_position()){
            continue;
        }
        const int64_t node_id = current.position().node_id();
        // entity ranks are 1-based, hence the "- 1" when indexing below
        const int64_t node_key = my_xg->node_rank_as_entity(node_id);

        // Look up the edge by previous / current node id, forward
        // orientation on both sides only.
        if (idx > 0){
            const int64_t prev_node_id = path.mapping(idx - 1).position().node_id();
            if (my_xg->has_edge(prev_node_id, false, node_id, false)){
                const int64_t edge_key =
                    my_xg->edge_rank_as_entity(prev_node_id, false, node_id, false);
                vector<size_t> edge_paths = my_xg->paths_of_entity(edge_key);
                encoded[edge_key - 1] = (edge_paths.size() > 0) ? 1 : 2;
            }
        }

        // Mark the node according to whether it lies on any path.
        vector<size_t> node_paths = my_xg->paths_of_node(node_id);
        encoded[node_key - 1] = (node_paths.size() > 0) ? 2 : 1;
    }

    return encoded;
}
Пример #17
0
// Encode an alignment as a vector of doubles over the xg entity space
// (node_count + edge_count slots, indexed by entity rank - 1).
//
// For every mapping that has a position, the node slot receives the
// fraction of reference bases (sum of from_length) covered by exact-match
// edits, or 0.0 when both counts are zero. The forward edge from the
// previous mapping's node, if the index has one, is set to 1.0.
vector<double> Vectorizer::alignment_to_identity_hot(Alignment a){
    const int64_t entity_size = my_xg->node_count + my_xg->edge_count;
    vector<double> encoded(entity_size, 0.0);

    const Path& path = a.path();
    for (int idx = 0; idx < path.mapping_size(); ++idx){
        const Mapping& current = path.mapping(idx);
        if (!current.has_position()){
            continue;
        }
        const int64_t node_id = current.position().node_id();
        const int64_t node_key = my_xg->node_rank_as_entity(node_id);

        // Walk the edits, counting reference bases and exactly-matched bases.
        double match_len = 0.0;
        double total_len = 0.0;
        for (int e = 0; e < current.edit_size(); ++e){
            const Edit& edit = current.edit(e);
            total_len += edit.from_length();
            // TODO if we map but don't match exactly, add half the average
            // length to match_len
            if (edit.from_length() == edit.to_length() && edit.sequence() == ""){
                match_len += (double) edit.to_length();
            }
        }
        const double pct_id =
            (match_len == 0.0 && total_len == 0.0) ? 0.0 : (match_len / total_len);
        encoded[node_key - 1] = pct_id;

        if (idx > 0){
            const int64_t prev_node_id = path.mapping(idx - 1).position().node_id();
            if (my_xg->has_edge(prev_node_id, false, node_id, false)){
                const int64_t edge_key =
                    my_xg->edge_rank_as_entity(prev_node_id, false, node_id, false);
                encoded[edge_key - 1] = 1.0;
            }
        }
    }
    return encoded;
}
Пример #18
0
// Generates a perfect (all-match) alignment by walking the graph from a
// position obtained via position(), choosing uniformly among the successor
// positions at every step, until `length` bases are collected or no
// successor exists. The alignment is named with a 16-character sha1head of
// its serialization plus a nonce, and its identity field is filled in.
Alignment Sampler::alignment(size_t length) {
    Alignment aln;
    Path* walk = aln.mutable_path();
    string bases;
    pos_t cursor = position();
    char base = pos_char(cursor);

    // Deliberately simple: one single-base mapping per step; simplify()
    // merges the redundant mappings afterwards.
    for (;;) {
        // record the base at the current position as a 1bp match
        bases += base;
        Mapping* step = walk->add_mapping();
        *step->mutable_position() = make_position(cursor);
        Edit* match = step->add_edit();
        match->set_from_length(1);
        match->set_to_length(1);

        // where could we go next, and with which base?
        auto successors = next_pos_chars(cursor);
        // nowhere: we ran off the end of the graph
        if (successors.empty()) {
            break;
        }
        vector<pos_t> candidates;
        for (auto& successor : successors) {
            candidates.push_back(successor.first);
        }
        // choose one successor uniformly at random and move there
        uniform_int_distribution<int> pick(0, successors.size()-1);
        cursor = candidates.at(pick(rng));
        base = successors[cursor];

        if (!(bases.size() < length)) {
            break;
        }
    }

    // store the sequence, then collapse the per-base mappings
    aln.set_sequence(bases);
    aln = simplify(aln);

    { // name the alignment
        string serialized;
        aln.SerializeToString(&serialized);
        int n;
        // the nonce counter is bumped inside a named critical section
#pragma omp critical(nonce)
        n = nonce++;
        serialized += std::to_string(n);
        const string hash = sha1head(serialized, 16);
        aln.set_name(hash);
    }

    aln.set_identity(identity(aln.path()));
    return aln;
}
Пример #19
0
    /**
     * Looks for alignments that change direction over their length.
     * This may happen because of:
     * 1. Mapping artifacts
     * 2. Cycles
     * 3. Highly repetitive regions
     * 4. Inversions (if you're lucky enough)
     *
     * An alignment "reverses" when two consecutive mappings disagree on
     * is_reverse(). Paths with fewer than two mappings never reverse.
     *
     * Default behavior: if the Alignment reverses, return an empty Alignment.
     * inverse behavior: if the Alignment reverses, return the Alignment.
     */
    Alignment Filter::reversing_filter(Alignment& aln){
        const Path& path = aln.path();

        for (int i = 1; i < path.mapping_size(); i++){
            const bool prev_reverse = path.mapping(i - 1).position().is_reverse();
            const bool curr_reverse = path.mapping(i).position().is_reverse();
            if (prev_reverse != curr_reverse){
                return inverse ? aln : Alignment();
            }
        }
        return inverse ? Alignment() : aln;
    }
Пример #20
0
// Return a copy of `aln` with the first `drop` read bases removed.
// Name and score are carried over; the sequence is the suffix starting at
// `drop`; the path (if any) is the second half of cut_path(path, drop).
// With drop == 0 the input is returned unchanged. Aborts via assert when the
// trimmed path's to-length disagrees with the trimmed sequence length.
Alignment strip_from_start(const Alignment& aln, size_t drop) {
    if (drop == 0) {
        return aln;
    }

    Alignment trimmed;
    trimmed.set_name(aln.name());
    trimmed.set_score(aln.score());
    trimmed.set_sequence(aln.sequence().substr(drop));

    if (aln.has_path()) {
        *trimmed.mutable_path() = cut_path(aln.path(), drop).second;
        assert(trimmed.has_path());
        const bool lengths_agree =
            !(alignment_to_length(trimmed) != trimmed.sequence().size());
        if (!lengths_agree) {
            cerr << "failed!!! drop from start 轰" << endl;
            cerr << pb2json(trimmed) << endl << endl;
            assert(false);
        }
    }
    return trimmed;
}
Пример #21
0
    /**
     * Split reads map to two separate paths in the graph OR vastly separated non-consecutive
     * nodes in a single path.
     *
     * They're super important for detecting structural variants, so we may want to
     * filter them out or collect only split reads.
     *
     * Current heuristic: mappings are compared pairwise from the two ends of
     * the path inwards (first with last, second with second-to-last, ...).
     * If any pair's node ids differ by more than 10 the read is considered
     * split.
     *
     * Default behavior: a split read yields an empty Alignment, any other
     * read is returned unchanged.
     * Inverse behavior: the two outcomes are swapped.
     */
    Alignment Filter::split_read_filter(Alignment& aln){

        //TODO binary search for breakpoint in read would be awesome.
        //TODO check whether the nodes are on the same path(s), whether two
        //     mappings are far apart, and whether a single mapping has a
        //     huge indel.
        const Path& path = aln.path();

        int top_side = path.mapping_size() - 1;
        int bottom_side = 0;

        while (top_side > bottom_side){
            const id_t top_id = path.mapping(top_side).position().node_id();
            const id_t bottom_id = path.mapping(bottom_side).position().node_id();

            // TODO USE THE XG
            // Compute the gap by hand so the width of id_t is preserved;
            // an unqualified abs() may resolve to the int overload.
            const id_t id_gap = (top_id > bottom_id) ? (top_id - bottom_id)
                                                     : (bottom_id - top_id);
            if (id_gap > 10){
                return inverse ? aln : Alignment();
            }

            top_side--;
            bottom_side++;
        }

        return inverse ? Alignment() : aln;

    }
Пример #22
0
// Return the reverse of `aln`: a copy whose sequence is reverse-complemented
// and, when a path is present, whose path is replaced by
// reverse_path(aln.path(), node_length). `node_length` is handed to
// reverse_path unchanged; given the name it presumably maps a node id to
// that node's length.
Alignment reverse_alignment(const Alignment& aln, const function<int64_t(int64_t)>& node_length) {
    // TODO: should we/can we do this in place?
    Alignment flipped = aln;
    flipped.set_sequence(reverse_complement(aln.sequence()));

    if (!aln.has_path()) {
        return flipped;
    }

    // reverse_path is expected to invert mapping order, flip is_reverse, and
    // reverse the edits within each mapping.
    *flipped.mutable_path() = reverse_path(aln.path(), node_length);
    return flipped;
}
Пример #23
0
// Return a copy of `aln` with the last `drop` read bases removed.
// Name and score are carried over; the sequence is the prefix of length
// size - drop; the path (if any) is the first half of cut_path() at that
// offset. With drop == 0 the input is returned unchanged. Aborts via assert
// when the trimmed path's to-length disagrees with the trimmed sequence
// length.
Alignment strip_from_end(const Alignment& aln, size_t drop) {
    if (drop == 0) {
        return aln;
    }

    const size_t cut_at = aln.sequence().size()-drop;

    Alignment trimmed;
    trimmed.set_name(aln.name());
    trimmed.set_score(aln.score());
    trimmed.set_sequence(aln.sequence().substr(0, cut_at));

    if (aln.has_path()) {
        *trimmed.mutable_path() = cut_path(aln.path(), cut_at).first;
        assert(trimmed.has_path());
        const bool lengths_agree =
            !(alignment_to_length(trimmed) != trimmed.sequence().size());
        if (!lengths_agree) {
            cerr << "failed!!! drop from end 轰" << endl;
            cerr << pb2json(trimmed) << endl << endl;
            assert(false);
        }
    }
    return trimmed;
}
Пример #24
0
// Compute the SAM FLAG value for an alignment.
//   - score == 0            -> BAM_FUNMAP, otherwise BAM_FPROPER_PAIR
//   - first mapping reverse -> BAM_FREVERSE
//   - is_secondary()        -> BAM_FSECONDARY
int32_t sam_flag(const Alignment& alignment) {
    // accumulate in the same width as the return type
    int32_t flag = 0;

    if (alignment.score() == 0) {
        // unmapped
        flag |= BAM_FUNMAP;
    } else {
        // correctly aligned
        flag |= BAM_FPROPER_PAIR;
    }
    // HACKZ -- you can't determine orientation from a single part of the mapping
    // unless the graph is a DAG
    // (a path can be present yet hold no mappings, so check the count before
    // reading mapping(0))
    if (alignment.has_path()
        && alignment.path().mapping_size() > 0
        && alignment.path().mapping(0).position().is_reverse()) {
        flag |= BAM_FREVERSE;
    }
    if (alignment.is_secondary()) {
        flag |= BAM_FSECONDARY;
    }
    return flag;
}
Пример #25
0
    /**
     *
     * Looks for alignments that transition from one path to another
     * over their length. This may occur for one of several reasons:
     * 1. The read covers a translocation
     * 2. The read looks a lot like two different (but highly-similar paths)
     * 3. The read is shattered (e.g. as in chromothripsis)
     *
     * An alignment is path divergent when, for some pair of consecutive
     * mappings, none of the paths through the earlier node contains the
     * later node.
     *
     * Default behavior: if the Alignment is path divergent, return an empty Alignment, else return aln
     * Inverse behavior: if the Alignment is path divergent, return aln, else return an empty Alignment
     */
    Alignment Filter::path_divergence_filter(Alignment& aln){
        const Path& path = aln.path();
        for (int i = 1; i < path.mapping_size(); i++){
            const id_t current_node = path.mapping(i).position().node_id();
            const id_t prev_node = path.mapping(i - 1).position().node_id();

            bool paths_match = false;
            vector<size_t> paths_of_prev = my_xg_index->paths_of_node(prev_node);
            // Distinct index name: do not shadow the mapping index `i`.
            for (size_t p = 0; p < paths_of_prev.size(); p++){
                string p_name = my_xg_index->path_name(paths_of_prev[p]);
                if (my_xg_index->path_contains_node(p_name, current_node)){
                    paths_match = true;
                    // one shared path is enough; no need to query the rest
                    break;
                }
            }
            if (!paths_match){
                return inverse ? aln : Alignment();
            }

        }
        return inverse ? Alignment() : aln;
    }
Пример #26
0
// Encode an alignment as a bit vector over the xg entity space
// (node_count + edge_count bits, indexed by entity rank - 1).
//
// For every mapping that has a position, the bit of its node is set, and so
// is the bit of the forward edge from the previous mapping's node when the
// index contains such an edge. Mappings without a position are skipped, as
// in the other Vectorizer encoders.
bit_vector Vectorizer::alignment_to_onehot(Alignment a){
    // Make a vector as large as the | |nodes| + |edges| | space
    int64_t entity_size = my_xg->node_count + my_xg->edge_count;
    bit_vector ret(entity_size, 0);
    const Path& path = a.path();
    for (int i = 0; i < path.mapping_size(); i++){
        const Mapping& mapping = path.mapping(i);
        // A mapping without a position has no node to look up.
        if (!mapping.has_position()){
            continue;
        }
        int64_t node_id = mapping.position().node_id();
        // Entity ranks are used 1-based here, hence the "- 1" when indexing.
        // NOTE(review): an older comment mentioned an entity space of
        // |nodes + edges + 1|, which does not match entity_size above --
        // confirm which is correct.
        int64_t key = my_xg->node_rank_as_entity(node_id);

        //Find edge by current / previous node ID
        // we can check the orientation, though it shouldn't **really** matter
        // whether we catch them in the forward or reverse direction.
        if (i > 0){
            int64_t prev_node_id = path.mapping(i - 1).position().node_id();
            if (my_xg->has_edge(prev_node_id, false, node_id, false)){
                int64_t edge_key = my_xg->edge_rank_as_entity(prev_node_id, false, node_id, false);
                ret[edge_key - 1] = 1;
            }
        }

        ret[key - 1] = 1;
    }

    return ret;
}
Пример #27
0
    /**
     * Single-record orientation filter: compares `aln` with its
     * fragment_prev. A read counts as "reversed" when at least one mapping
     * in its path has is_reverse() set.
     *
     * Both reversed: returns `inverse ? Alignment() : aln`.
     * Otherwise:     returns `inverse ? aln : Alignment()`.
     */
    Alignment Filter::orientation_filter(Alignment& aln){
        // True when any mapping of the path lies on the reverse strand.
        auto touches_reverse = [](const Path& p){
            bool seen = false;
            for (int k = 0; k < p.mapping_size(); ++k){
                if (p.mapping(k).position().is_reverse()){
                    seen = true;
                }
            }
            return seen;
        };

        const bool this_reversed = touches_reverse(aln.path());
        const bool prev_reversed = touches_reverse(aln.fragment_prev().path());

        if (this_reversed && prev_reversed){
            return inverse ? Alignment() : aln;
        }
        return inverse ? aln : Alignment();
    }
Пример #28
0
// Reference length (path_from_length) of the first piece produced by
// cutting the alignment's path at `pos`.
size_t from_length_before_pos(const Alignment& aln, const Position& pos) {
    const auto pieces = cut_path(aln.path(), pos);
    return path_from_length(pieces.first);
}
Пример #29
0
// Reference length (path_from_length) of the second piece produced by
// cutting the alignment's path at `pos`.
size_t from_length_after_pos(const Alignment& aln, const Position& pos) {
    const auto pieces = cut_path(aln.path(), pos);
    return path_from_length(pieces.second);
}
Пример #30
0
// Merge a list of alignments that are assumed to line up end-to-end, in
// order, into a single alignment.
//
// An empty list gives a default Alignment; a single element is returned
// as is. Otherwise:
//   1. every alignment without a path gets one position-less mapping with a
//      single edit covering its whole sequence (to_length and sequence set);
//   2. neighbours are merged pairwise (0+1, 2+3, ...) generation after
//      generation until one alignment remains, an odd one out being carried
//      over unchanged;
//   3. the path of the survivor is simplified.
// Both the preparation and each merge generation run under
// `#pragma omp parallel for`.
//
// Note: an earlier per-node length consistency check around the merge was
// disabled in the original and is not reproduced here.
Alignment merge_alignments(const vector<Alignment>& alns, bool debug) {

    if (alns.empty()) {
        return Alignment();
    }
    if (alns.size() == 1) {
        return alns.front();
    }

    vector<Alignment> pending = alns;

    // give path-less alignments a placeholder mapping so they can be
    // concatenated
#pragma omp parallel for
    for (size_t k = 0; k < pending.size(); ++k) {
        Alignment& item = pending[k];
        if (!item.has_path()) {
            Mapping placeholder;
            Edit* whole_read = placeholder.add_edit();
            whole_read->set_to_length(item.sequence().size());
            whole_read->set_sequence(item.sequence());
            *item.mutable_path()->add_mapping() = placeholder;
        }
    }

    // binary reduction: each generation halves the number of alignments
    // (rounded up)
    while (pending.size() > 1) {
        vector<Alignment> merged((pending.size() + 1) / 2);
#pragma omp parallel for
        for (size_t k = 0; k < merged.size(); ++k) {
            const size_t left = 2*k;
            const size_t right = 2*k+1;
            if (right < pending.size()) {
                merged[k] = merge_alignments(pending[left], pending[right], debug);
            } else {
                // odd one out: nothing to pair it with in this generation
                merged[k] = pending[left];
            }
        }
        pending = merged;
    }

    Alignment res = pending.front();
    *res.mutable_path() = simplify(res.path());
    return res;
}