Пример #1
0
/// Length of the soft clip at the start of an alignment.
///
/// Returns the to_length of the first edit of the first mapping when that
/// edit consumes no reference bases (from_length == 0) but does consume read
/// bases (to_length > 0). Returns 0 otherwise, including when the alignment
/// has no mappings or its first mapping has no edits.
int softclip_start(Alignment& alignment) {
    // Use the read-only accessors: this is a pure query and should not touch
    // the alignment's path through the mutable_* interface.
    const Path& path = alignment.path();
    if (path.mapping_size() == 0) {
        return 0;
    }
    const Mapping& first_mapping = path.mapping(0);
    if (first_mapping.edit_size() == 0) {
        // No edits to inspect; asking for edit 0 would be out of range.
        return 0;
    }
    const Edit& first_edit = first_mapping.edit(0);
    if (first_edit.from_length() == 0 && first_edit.to_length() > 0) {
        return first_edit.to_length();
    }
    return 0;
}
Пример #2
0
/// Length of the soft clip at the end of an alignment.
///
/// Returns the to_length of the last edit of the last mapping when that edit
/// consumes no reference bases (from_length == 0) but does consume read bases
/// (to_length > 0). Returns 0 otherwise, including when the alignment has no
/// mappings or its last mapping has no edits.
int softclip_end(Alignment& alignment) {
    // Use the read-only accessors: this is a pure query and should not touch
    // the alignment's path through the mutable_* interface.
    const Path& path = alignment.path();
    if (path.mapping_size() == 0) {
        return 0;
    }
    const Mapping& last_mapping = path.mapping(path.mapping_size() - 1);
    if (last_mapping.edit_size() == 0) {
        // No edits to inspect; edit_size() - 1 would index position -1.
        return 0;
    }
    const Edit& last_edit = last_mapping.edit(last_mapping.edit_size() - 1);
    if (last_edit.from_length() == 0 && last_edit.to_length() > 0) {
        return last_edit.to_length();
    }
    return 0;
}
Пример #3
0
/// Builds a mutated copy of an alignment.
///
/// Every edit of every mapping in aln is passed through mutate_edit() with
/// the given base and indel error rates; the edits it returns are appended to
/// a new mapping placed at the same position as the original. The result is
/// simplified, its sequence is regenerated from the path, and the name of aln
/// is copied over. When both error rates are zero, aln is returned unchanged.
Alignment Sampler::mutate(const Alignment& aln,
                          double base_error,
                          double indel_error) {

    const bool error_free = (base_error == 0 && indel_error == 0);
    if (error_free) {
        return aln;
    }

    // alphabet and random distributions handed to mutate_edit
    string bases = "ATGC";
    uniform_real_distribution<double> rprob(0, 1);
    uniform_int_distribution<int> rbase(0, 3);

    Alignment mutaln;
    for (const auto& source_mapping : aln.path().mapping()) {
        // mirror the mapping, keeping its original position
        Mapping* mutated_mapping = mutaln.mutable_path()->add_mapping();
        *mutated_mapping->mutable_position() = source_mapping.position();

        for (const auto& source_edit : source_mapping.edit()) {
            // one input edit may turn into several output edits
            auto replacement_edits = mutate_edit(source_edit,
                                                 make_pos_t(source_mapping.position()),
                                                 base_error, indel_error,
                                                 bases, rprob, rbase);
            for (auto& replacement : replacement_edits) {
                *mutated_mapping->add_edit() = replacement;
            }
        }
    }

    // the edits changed, so the read sequence has to be rebuilt from the path
    mutaln = simplify(mutaln);
    mutaln.set_sequence(alignment_seq(mutaln));
    mutaln.set_name(aln.name());
    return mutaln;
}
Пример #4
0
/// Reverses, in place, every mapping of the alignment's path whose node id is
/// in ids. Mappings on other nodes are left untouched.
///
/// node_length is forwarded to reverse_mapping() for each flipped mapping.
void flip_nodes(Alignment& a, set<int64_t> ids, const std::function<size_t(int64_t)>& node_length) {
    for (Mapping& candidate : *a.mutable_path()->mutable_mapping()) {
        const bool selected = ids.count(candidate.position().node_id()) != 0;
        if (!selected) {
            continue;
        }
        // replace the mapping with its reversed counterpart
        candidate = reverse_mapping(candidate, node_length);
    }
}
Пример #5
0
// Samples a perfect alignment by walking the graph from a starting position.
//
// The walk emits one single-base match mapping per step, choosing uniformly
// among the successor positions, and stops once `length` bases have been
// collected or no successor exists. Because the first base is emitted before
// the length is checked, the result always holds at least one base. The path
// is simplified afterwards, the alignment is named with a hash of its
// serialized form plus a nonce, and its identity is filled in.
Alignment Sampler::alignment(size_t length) {
    Alignment aln;
    Path* walk = aln.mutable_path();
    string seq;
    pos_t pos = position();
    char c = pos_char(pos);

    // Deliberately simple rather than efficient: one mapping per base now,
    // redundant mappings are merged by simplify() below.
    for (;;) {
        // record the base at the current position as a 1bp match
        seq += c;
        Mapping* step = walk->add_mapping();
        *step->mutable_position() = make_position(pos);
        Edit* match = step->add_edit();
        match->set_from_length(1);
        match->set_to_length(1);

        // where can we go from here?
        auto successors = next_pos_chars(pos);
        if (successors.empty()) {
            // nowhere left to go: we ran off the end of the graph
            break;
        }
        vector<pos_t> candidates;
        for (auto& successor : successors) {
            candidates.push_back(successor.first);
        }

        // choose one successor uniformly at random and move there
        uniform_int_distribution<int> pick(0, successors.size()-1);
        pos = candidates.at(pick(rng));
        c = successors[pos];

        if (seq.size() >= length) {
            break;
        }
    }

    // store the sampled bases, then collapse the per-base mappings
    aln.set_sequence(seq);
    aln = simplify(aln);

    { // derive a name from the serialized alignment plus a nonce
        string data;
        aln.SerializeToString(&data);
        int n;
        // the nonce counter is only touched inside this named critical section
#pragma omp critical(nonce)
        n = nonce++;
        data += std::to_string(n);
        aln.set_name(sha1head(data, 16));
    }

    // record the identity of the final path
    aln.set_identity(identity(aln.path()));
    return aln;
}
Пример #6
0
/// Returns a copy of aln with the first `drop` read bases removed.
///
/// Only the name, score, trimmed sequence and (when present) the trimmed path
/// are carried over; the score is copied as-is, not recomputed. A drop of 0
/// returns aln unchanged. A drop larger than the sequence is clamped to the
/// sequence length, so the result has an empty sequence instead of substr()
/// throwing std::out_of_range.
///
/// Reports to stderr and asserts if the trimmed path's to_length disagrees
/// with the trimmed sequence length.
Alignment strip_from_start(const Alignment& aln, size_t drop) {
    if (!drop) return aln;
    // never try to remove more bases than the read has
    const size_t seq_len = aln.sequence().size();
    if (drop > seq_len) drop = seq_len;
    Alignment res;
    res.set_name(aln.name());
    res.set_score(aln.score());
    //cerr << "drop " << drop << " from start" << endl;
    res.set_sequence(aln.sequence().substr(drop));
    if (!aln.has_path()) return res;
    // keep the part of the path after the cut point
    *res.mutable_path() = cut_path(aln.path(), drop).second;
    assert(res.has_path());
    // sanity check: path and sequence must describe the same number of bases
    if (alignment_to_length(res) != res.sequence().size()) {
        cerr << "failed!!! drop from start 轰" << endl;
        cerr << pb2json(res) << endl << endl;
        assert(false);
    }
    return res;
}
Пример #7
0
/// Returns the reverse-strand version of an alignment.
///
/// The result starts as a copy of aln, so every field not mentioned here is
/// preserved. Its sequence is replaced by the reverse complement, and, when
/// aln has a path, the path is replaced by reverse_path(aln.path(),
/// node_length).
// TODO: should we/can we do this in place?
Alignment reverse_alignment(const Alignment& aln, const function<int64_t(int64_t)>& node_length) {
    Alignment flipped = aln;
    flipped.set_sequence(reverse_complement(aln.sequence()));

    if (!aln.has_path()) {
        // nothing but the sequence to turn around
        return flipped;
    }

    // reverse_path is relied on to reorder the mappings and edits and to
    // fix up their orientation, positions and sequences.
    *flipped.mutable_path() = reverse_path(aln.path(), node_length);
    return flipped;
}
Пример #8
0
/// Returns a copy of aln with the last `drop` read bases removed.
///
/// Only the name, score, trimmed sequence and (when present) the trimmed path
/// are carried over; the score is copied as-is, not recomputed. A drop of 0
/// returns aln unchanged. A drop larger than the sequence is clamped: the cut
/// point becomes 0 rather than wrapping around as an unsigned subtraction,
/// which previously left the alignment effectively unstripped.
///
/// Reports to stderr and asserts if the trimmed path's to_length disagrees
/// with the trimmed sequence length.
Alignment strip_from_end(const Alignment& aln, size_t drop) {
    if (!drop) return aln;
    // compute the cut point without letting size() - drop underflow
    const size_t seq_len = aln.sequence().size();
    const size_t cut_at = (drop < seq_len) ? (seq_len - drop) : 0;
    Alignment res;
    res.set_name(aln.name());
    res.set_score(aln.score());
    //cerr << "drop " << drop << " from end" << endl;
    //cerr << "Cut at " << cut_at << endl;
    res.set_sequence(aln.sequence().substr(0, cut_at));
    if (!aln.has_path()) return res;
    // keep the part of the path before the cut point
    *res.mutable_path() = cut_path(aln.path(), cut_at).first;
    assert(res.has_path());
    // sanity check: path and sequence must describe the same number of bases
    if (alignment_to_length(res) != res.sequence().size()) {
        cerr << "failed!!! drop from end 轰" << endl;
        cerr << pb2json(res) << endl << endl;
        assert(false);
    }
    return res;
}
Пример #9
0
// Merges a series of alignments into one; handles long indels properly.
// Assumes that the alignments line up end-to-end, in the order given.
//
// An empty input yields a default-constructed Alignment and a single input is
// returned as-is. Otherwise neighbouring alignments are merged pairwise with
// the two-alignment merge_alignments overload, generation after generation,
// until one remains; its path is simplified before it is returned. `debug`
// is passed through to the pairwise merge.
Alignment merge_alignments(const vector<Alignment>& alns, bool debug) {

    if (alns.size() == 0) {
        Alignment aln;
        return aln;
    } else if (alns.size() == 1) {
        return alns.front();
    }

    // Disabled input validation, kept for reference:
    // where possible get node and target lengths
    // to validate after merge
    /*
    map<int64_t, map<size_t, set<const Alignment*> > > node_lengths;
    map<int64_t, map<size_t, set<const Alignment*> > > to_lengths;
    for (auto& aln : alns) {
        auto& path = aln.path();
        // find a mapping that overlaps the whole node
        // note that edits aren't simplified
        // so deletions are intact
        for (size_t i = 0; i < path.mapping_size(); ++i) {
            auto& m = path.mapping(i);
            if (m.position().offset() == 0) {
                // can we see if the next mapping is on the following node
                if (i < path.mapping_size()-1 && path.mapping(i+1).position().offset() == 0
                    && mapping_from_length(path.mapping(i+1)) && mapping_from_length(m)) {
                    // we cover the node, record the to_length and from_length
                    set<const Alignment*>& n = node_lengths[m.position().node_id()][from_length(m)];
                    n.insert(&aln);
                    set<const Alignment*>& t = to_lengths[m.position().node_id()][to_length(m)];
                    t.insert(&aln);
                }
            }
        }
    }
    // verify our input by checking for disagreements
    for (auto& n : node_lengths) {
        auto& node_id = n.first;
        if (n.second.size() > 1) {
            cerr << "disagreement in node lengths for " << node_id << endl;
            for (auto& l : n.second) {
                cerr << "alignments that report length of " << l.first << endl;
                for (auto& a : l.second) {
                    cerr << pb2json(*a) << endl;
                }
            }
        } else {
            //cerr << n.second.begin()->second.size() << " alignments support "
            //     << n.second.begin()->first << " as length for " << node_id << endl;
        }
    }
    */
    
    // parallel merge algorithm
    // for each generation
    // merge 0<-0+1, 1<-2+3, ...
    // until there is only one alignment
    vector<Alignment> last = alns;

    // get the alignments ready for merge: an alignment without a path gets a
    // single position-less mapping whose one edit carries the whole sequence
#pragma omp parallel for
    for (size_t i = 0; i < last.size(); ++i) {
        Alignment& aln = last[i];
        //cerr << "on " << i << "th aln" << endl
        //     << pb2json(aln) << endl;
        if (!aln.has_path()) {
            // build the placeholder edit directly inside the new mapping
            Edit* e = aln.mutable_path()->add_mapping()->add_edit();
            e->set_to_length(aln.sequence().size());
            e->set_sequence(aln.sequence());
        }
    }

    while (last.size() > 1) {
        //cerr << "last size " << last.size() << endl;
        // one output per input pair, plus one for an unpaired trailing input
        size_t new_count = last.size()/2;
        //cerr << "new count b4 " << new_count << endl;
        new_count += last.size() % 2; // force binary
        //cerr << "New count = " << new_count << endl;
        vector<Alignment> curr; curr.resize(new_count);
#pragma omp parallel for
        for (size_t i = 0; i < curr.size(); ++i) {
            //cerr << "merging " << 2*i << " and " << 2*i+1 << endl;
            // take a pair from the old alignments
            // merge them into this one
            if (2*i+1 < last.size()) {
                auto& a1 = last[2*i];
                auto& a2 = last[2*i+1];
                curr[i] = merge_alignments(a1, a2, debug);
                // Disabled post-merge check, kept for reference:
                // check that the merge did the right thing
                /*
                auto& a3 = curr[i];
                for (size_t j = 0; j < a3.path().mapping_size()-1; ++j) {
                    // look up reported node length
                    // and compare to what we saw
                    // skips last mapping
                    auto& m = a3.path().mapping(j);
                    if (from_length(m) == to_length(m)
                        && m.has_position()
                        && m.position().offset()==0
                        && a3.path().mapping(j+1).has_position()
                        && a3.path().mapping(j+1).position().offset()==0) {
                        auto nl = node_lengths.find(m.position().node_id());
                        if (nl != node_lengths.end()) {
                            if (nl->second.find(from_length(m)) == nl->second.end()) {
                                cerr << "node length is not consistent for " << m.position().node_id() << endl;
                                cerr << "expected " << nl->second.begin()->first << endl;
                                cerr << "got " << from_length(m) << endl;
                                cerr << "inputs:" << endl << pb2json(a1) << endl << pb2json(a2)
                                     << endl << "output: " << endl << pb2json(a3) << endl;
                                //exit(1);
                            }
                        }
                    }
                }
                */
            } else {
                // odd one out: carried into the next generation unchanged
                auto& a1 = last[2*i];
                //cerr << "no need to merge" << endl;
                curr[i] = a1;
            }
        }
        // hand the new generation over without deep-copying every alignment;
        // the previous generation is released when curr goes out of scope
        last.swap(curr);
    }
    Alignment res = last.front();
    *res.mutable_path() = simplify(res.path());
    return res;
}
Пример #10
0
/// Aligns alignment.sequence() to graph g with gssw and writes the result
/// back into `alignment`.
///
/// @param alignment           Read to align; receives the primary alignment.
/// @param multi_alignments    When non-null, also receives the primary
///                            alignment followed by the alternates that have a
///                            positive score. Only valid for pinned alignment.
/// @param g                   Graph to align against.
/// @param pinned_node_id      Node to pin the alignment to; 0 means unpinned
///                            (local alignment).
/// @param pin_left            Pin at the left end instead of the right. This
///                            is done by aligning the reversed sequence to
///                            the reversed graph and translating back.
/// @param max_alt_alns        Number of tracebacks requested from gssw; must
///                            be 1 when multi_alignments is null.
/// @param print_score_matrices Forwarded to gssw_mapping_to_alignment.
///
/// Inconsistent arguments, or a pinned node id that is not found in the
/// graph, print an error to stderr and terminate the process.
void Aligner::align_internal(Alignment& alignment, vector<Alignment>* multi_alignments, Graph& g,
                             int64_t pinned_node_id, bool pin_left, int32_t max_alt_alns, bool print_score_matrices) {

    // check input integrity
    if (pin_left && !pinned_node_id) {
        cerr << "error:[Aligner] cannot choose pinned end in non-pinned alignment" << endl;
        exit(EXIT_FAILURE);
    }
    if (multi_alignments && !pinned_node_id) {
        cerr << "error:[Aligner] multiple traceback is not valid in local alignment, only pinned and global" << endl;
        exit(EXIT_FAILURE);
    }
    if (!(multi_alignments) && max_alt_alns != 1) {
        cerr << "error:[Aligner] cannot specify maximum number of alignments in single alignment" << endl;
        exit(EXIT_FAILURE);
    }


    // alignment pinning algorithm is based on pinning in bottom right corner, if pinning in top
    // left we need to reverse all the sequences first and translate the alignment back later

    // create reversed objects if necessary
    Graph reversed_graph;
    string reversed_sequence;
    if (pin_left) {
        reversed_sequence.resize(alignment.sequence().length());

        reverse_copy(alignment.sequence().begin(), alignment.sequence().end(), reversed_sequence.begin());
        reverse_graph(g, reversed_graph);
    }

    // choose forward or reversed objects
    Graph* align_graph;
    string* align_sequence;
    if (pin_left) {
        align_graph = &reversed_graph;
        align_sequence = &reversed_sequence;
    }
    else {
        align_graph = &g;
        align_sequence = alignment.mutable_sequence();
    }

    // convert into gssw graph and get the counterpart to pinned node (if pinning)
    gssw_node* pinned_node = nullptr;
    gssw_graph* graph = create_gssw_graph(*align_graph, pinned_node_id, &pinned_node);

    // A pinned alignment was requested but the node was not found. This must
    // be a logical AND: a bitwise AND against the bool only tests the lowest
    // bit of the id and so would miss every even node id.
    if (pinned_node_id && !pinned_node) {
        cerr << "error:[Aligner] pinned node for pinned alignment is not in graph" << endl;
        exit(EXIT_FAILURE);
    }

    // perform dynamic programming
    // NOTE(review): the trailing 15 and 2 are passed straight to gssw; their
    // meaning is not visible here - confirm against the gssw API.
    gssw_graph_fill(graph, align_sequence->c_str(),
                    nt_table, score_matrix,
                    gap_open, gap_extension, 15, 2);

    // traceback either from pinned position or optimal local alignment
    if (pinned_node) {
        // trace back pinned alignment
        gssw_graph_mapping** gms = gssw_graph_trace_back_pinned_multi (graph,
                                   pinned_node,
                                   max_alt_alns,
                                   align_sequence->c_str(),
                                   align_sequence->size(),
                                   nt_table,
                                   score_matrix,
                                   gap_open,
                                   gap_extension);

        if (pin_left) {
            // translate graph and mappings into original node space
            unreverse_graph(reversed_graph);
            for (int32_t i = 0; i < max_alt_alns; i++) {
                unreverse_graph_mapping(gms[i]);
            }
        }

        // convert optimal alignment and store it in the input Alignment object (in the multi alignment,
        // this will have been set to the first in the vector)
        if (gms[0]->score > 0) {
            // have a mapping, can just convert normally
            gssw_mapping_to_alignment(graph, gms[0], alignment, print_score_matrices);
        }
        else {
            // gssw will not identify mappings with 0 score, infer location based on pinning

            Mapping* mapping = alignment.mutable_path()->add_mapping();
            mapping->set_rank(1);

            // locate at the pinned end of the node: offset 0 when pinned on
            // the left, the node length when pinned on the right
            Position* position = mapping->mutable_position();
            position->set_node_id(pinned_node_id);
            position->set_offset(pin_left ? 0 : pinned_node->len);

            // soft clip the entire read
            Edit* edit = mapping->add_edit();
            edit->set_to_length(alignment.sequence().length());
            edit->set_sequence(alignment.sequence());
        }


        if (multi_alignments) {
            // determine how many non-null alignments were returned: the first
            // alternate with a non-positive score ends the usable list
            int32_t num_non_null = max_alt_alns;
            for (int32_t i = 1; i < max_alt_alns; i++) {
                if (gms[i]->score <= 0) {
                    num_non_null = i;
                    break;
                }
            }

            // reserve to avoid illegal access errors that occur when the vector reallocates
            multi_alignments->reserve(num_non_null);

            // copy the primary alignment
            multi_alignments->emplace_back(alignment);

            // convert the alternate alignments and store them at the back of the vector (this will not
            // execute if we are doing single alignment)
            for (int32_t i = 1; i < num_non_null; i++) {
                gssw_graph_mapping* gm = gms[i];

                // make new alignment object
                multi_alignments->emplace_back();
                Alignment& next_alignment = multi_alignments->back();

                // copy over sequence information from the primary alignment
                next_alignment.set_sequence(alignment.sequence());
                next_alignment.set_quality(alignment.quality());

                // get path of the alternate alignment
                gssw_mapping_to_alignment(graph, gm, next_alignment, print_score_matrices);

            }
        }

        // release every traceback, then the array that held them
        for (int32_t i = 0; i < max_alt_alns; i++) {
            gssw_graph_mapping_destroy(gms[i]);
        }
        free(gms);
    }
    else {
        // trace back local alignment
        gssw_graph_mapping* gm = gssw_graph_trace_back (graph,
                                 align_sequence->c_str(),
                                 align_sequence->size(),
                                 nt_table,
                                 score_matrix,
                                 gap_open,
                                 gap_extension);

        gssw_mapping_to_alignment(graph, gm, alignment, print_score_matrices);
        gssw_graph_mapping_destroy(gm);
    }

    //gssw_graph_print_score_matrices(graph, sequence.c_str(), sequence.size(), stderr);

    gssw_graph_destroy(graph);

}
Пример #11
0
/// Converts a gssw traceback into the path of an Alignment.
///
/// The alignment's existing path is cleared, its score is set from the
/// traceback and its query position to 0. Each node cigar with at least one
/// element becomes one mapping: the first starts at offset gm->position, the
/// following ones at offset 0, and ranks count up from 1. Cigar operations
/// become edits as follows:
///   - M / X / N: compared base by base against the node sequence, giving
///     runs of matches interleaved with single-base substitutions;
///   - D: a deletion (to_length 0);
///   - I / S: an insertion or soft clip (from_length 0) carrying the read
///     bases it covers.
/// Any other operation prints an error and exits. The identity of the
/// finished path is stored on the alignment.
///
/// When print_score_matrices is set, the gssw score matrices are printed to
/// stderr before conversion.
void Aligner::gssw_mapping_to_alignment(gssw_graph* graph,
                                        gssw_graph_mapping* gm,
                                        Alignment& alignment,
                                        bool print_score_matrices) {
    alignment.clear_path();
    alignment.set_score(gm->score);
    alignment.set_query_position(0);
    Path* path = alignment.mutable_path();
    //alignment.set_cigar(graph_cigar(gm));

    string& read_seq = *alignment.mutable_sequence();

    if (print_score_matrices) {
        gssw_graph_print_score_matrices(graph, read_seq.c_str(), read_seq.size(), stderr);
        //cerr << alignment.DebugString() << endl;
    }

    // appends an edit with the given lengths to a mapping and hands it back
    auto append_edit = [](Mapping* target, int from_len, int to_len) {
        Edit* added = target->add_edit();
        added->set_from_length(from_len);
        added->set_to_length(to_len);
        return added;
    };

    gssw_graph_cigar* gcigar = &gm->cigar;
    int read_pos = 0;             // cursor in the read
    int node_pos = gm->position;  // cursor in the current node

    for (int i = 0; i < gcigar->length; ++i) {
        gssw_node_cigar* nc = gcigar->elements + i;

        // only the first node starts mid-node
        if (i > 0) node_pos = 0;

        // nodes with an empty cigar contribute no mapping
        gssw_cigar* c = nc->cigar;
        const int op_count = c->length;
        if (op_count == 0) continue;

        Node* from_node = (Node*) nc->node->data;
        string& node_seq = *from_node->mutable_sequence();

        Mapping* mapping = path->add_mapping();
        Position* start = mapping->mutable_position();
        start->set_node_id(nc->node->id);
        start->set_offset(node_pos);
        // rank is 1-based: the mapping was already added to the path
        mapping->set_rank(path->mapping_size());

        for (int j = 0; j < op_count; ++j) {
            gssw_cigar_element* op = c->elements + j;
            const int32_t length = op->length;

            switch (op->type) {
            case 'M':
            case 'X':
            case 'N': {
                // walk both sequences in lockstep, emitting match runs
                // broken up by single-base "SNP" edits
                int run_start = node_pos;
                for (int step = 0; step < length; ++step) {
                    const int h = node_pos + step;
                    const int k = read_pos + step;
                    if (node_seq[h] == read_seq[k]) continue;
                    // flush the match run that ends just before this base
                    if (h - run_start > 0) {
                        append_edit(mapping, h - run_start, h - run_start);
                    }
                    // the mismatching base itself
                    append_edit(mapping, 1, 1)->set_sequence(read_seq.substr(k, 1));
                    run_start = h + 1;
                }
                // trailing match run, or the whole element if nothing mismatched
                const int run_end = node_pos + length;
                if (run_end - run_start > 0) {
                    append_edit(mapping, run_end - run_start, run_end - run_start);
                }
                read_pos += length;
                node_pos += length;
                break;
            }
            case 'D':
                // deletion: advances in the node only
                append_edit(mapping, length, 0);
                node_pos += length;
                break;
            case 'I':
            case 'S':
                // note that soft clips and insertions are semantically equivalent
                // and can only be differentiated by their position in the read
                // with soft clips coming at the start or end
                append_edit(mapping, 0, length)->set_sequence(read_seq.substr(read_pos, length));
                read_pos += length;
                break;
            default:
                cerr << "error:[Aligner::gssw_mapping_to_alignment] "
                     << "unsupported cigar op type " << op->type << endl;
                exit(1);
                break;
            }
        }
        //cerr << "path to_length " << path_to_length(*path) << endl;
    }

    // set identity
    alignment.set_identity(identity(alignment.path()));
}