Ejemplo n.º 1
0
/// Build a mutated copy of an alignment.
///
/// Every edit of every mapping in `aln`'s path is passed through
/// mutate_edit() together with the two error rates, and the edits it returns
/// are appended to a fresh alignment. The result is then run through
/// simplify(), its sequence is re-derived with alignment_seq(), and it takes
/// the name of the input.
///
/// @param aln          alignment to copy and mutate
/// @param base_error   error rate forwarded to mutate_edit()
/// @param indel_error  error rate forwarded to mutate_edit()
/// @return `aln` itself (copied) when both rates are zero, otherwise the
///         newly built alignment
Alignment Sampler::mutate(const Alignment& aln,
                          double base_error,
                          double indel_error) {

    // Nothing to mutate when both error rates are zero.
    if (base_error == 0 && indel_error == 0) {
        return aln;
    }

    // Alphabet and random distributions handed to mutate_edit().
    string alphabet = "ATGC";
    uniform_real_distribution<double> unit_dist(0, 1);
    uniform_int_distribution<int> base_index_dist(0, 3);

    Alignment result;
    auto* result_path = result.mutable_path();
    const auto& source_path = aln.path();

    for (size_t m = 0; m < source_path.mapping_size(); ++m) {
        const auto& source_mapping = source_path.mapping(m);

        // Each output mapping starts at the same position as its source.
        Mapping* target_mapping = result_path->add_mapping();
        *target_mapping->mutable_position() = source_mapping.position();

        // Every edit of this mapping is mutated relative to the mapping's position.
        const auto mapping_pos = make_pos_t(source_mapping.position());
        for (size_t e = 0; e < source_mapping.edit_size(); ++e) {
            auto replacement_edits = mutate_edit(source_mapping.edit(e), mapping_pos,
                                                 base_error, indel_error,
                                                 alphabet, unit_dist, base_index_dist);
            for (auto& replacement : replacement_edits) {
                *target_mapping->add_edit() = replacement;
            }
        }
    }

    // Simplify the path, then rebuild the sequence from it and carry over the name.
    result = simplify(result);
    result.set_sequence(alignment_seq(result));
    result.set_name(aln.name());
    return result;
}
Ejemplo n.º 2
0
/// Enumerate the positions that directly follow `pos`, each paired with the
/// character pos_char() reports for it.
///
/// If `pos` is not on the last base of its node, the only successor is the
/// next offset in the same node. Otherwise the successors are offset 0 of the
/// nodes reached over the edges on the node's end (forward strand) or on its
/// start (reverse strand).
///
/// @param pos  position to step from (taken by value; modified locally)
/// @return map from each successor position to its character; empty when no
///         edge leads onward
map<pos_t, char> Sampler::next_pos_chars(pos_t pos) {
    map<pos_t, char> nexts;

    // See if the node is cached (did we just visit it?)
    pair<Node, bool> cached = node_cache.retrieve(id(pos));

    if (!cached.second) {
        // If it's not in the cache, fetch it from the index and put it in
        cached.first = xgidx->node(id(pos));
        node_cache.put(id(pos), cached.first);
    }

    Node& node = cached.first;
    // If we are still inside the node, return the next position and character.
    // Written as `offset + 1 < size` rather than `offset < size - 1` so that an
    // empty node sequence cannot wrap the unsigned size around to a huge value.
    if (offset(pos) + 1 < node.sequence().size()) {
        ++get_offset(pos);
        nexts[pos] = pos_char(pos);
    } else {
        // look at the next positions we could reach
        if (!is_rev(pos)) {
            // we are on the forward strand, the next things from this node come off the end
            for (auto& edge : xgidx->edges_on_end(id(pos))) {
                if (edge.from() == id(pos)) {
                    pos_t p = make_pos_t(edge.to(), edge.to_end(), 0);
                    nexts[p] = pos_char(p);
                } else if (edge.from_start() && edge.to_end() && edge.to() == id(pos)) {
                    // doubly inverted, should be normalized to forward but we handle here for safety
                    pos_t p = make_pos_t(edge.from(), false, 0);
                    nexts[p] = pos_char(p);
                }
            }
        } else {
            // we are on the reverse strand, the next things from this node come off the start
            for (auto& edge : xgidx->edges_on_start(id(pos))) {
                if (edge.to() == id(pos)) {
                    pos_t p = make_pos_t(edge.from(), !edge.from_start(), 0);
                    nexts[p] = pos_char(p);
                } else if (edge.from_start() && edge.to_end() && edge.from() == id(pos)) {
                    // doubly inverted, should be normalized to forward but we handle here for safety
                    pos_t p = make_pos_t(edge.to(), true, 0);
                    nexts[p] = pos_char(p);
                }
            }
        }
    }
    return nexts;
}
Ejemplo n.º 3
0
/// Pick a random position in the graph.
///
/// A value in [1, xgidx->seq_length] is drawn from `rng` and mapped to a node
/// with node_at_seq_pos(). Unless `forward_only` is set, a second draw chooses
/// the strand.
///
/// @return position made of the chosen node id, strand flag and offset
///         within the node
pos_t Sampler::position(void) {
    // Draw a location along the whole sequence, counting from 1.
    uniform_int_distribution<size_t> seq_pos_dist(1, xgidx->seq_length);
    const size_t seq_pos = seq_pos_dist(rng);
    const id_t node_id = xgidx->node_at_seq_pos(seq_pos);

    // The strand is only randomized (and the generator only consulted a second
    // time) when reverse-strand positions are allowed.
    uniform_int_distribution<size_t> strand_dist(0, 1);
    bool on_reverse = false;
    if (!forward_only) {
        on_reverse = strand_dist(rng);
    }

    // 1-0 base conversion relative to the start of the chosen node
    const size_t offset_in_node = seq_pos - xgidx->node_start(node_id) - 1;
    return make_pos_t(node_id, on_reverse, offset_in_node);
}
Ejemplo n.º 4
0
/// Collect the edges at which a walk of up to `k` bases would exceed the
/// allowed number of branching steps.
///
/// For both orientations of every handle, walks are started and extended
/// across edges until they reach length `k`. Whenever a walk stands on a
/// handle with more than one outgoing edge and has already taken `edge_max`
/// branching steps, each of those outgoing edges is reported instead of being
/// followed.
///
/// @param graph     graph to scan; visited through for_each_handle() with its
///                  parallel flag set
/// @param k         walk length to reach
/// @param edge_max  number of branching steps a walk may take
/// @return all reported edges, concatenated from the per-thread buffers;
///         duplicates are not removed
vector<edge_t> find_edges_to_prune(const HandleGraph& graph, size_t k, size_t edge_max) {
    // One output buffer per thread, indexed by omp_get_thread_num().
    vector<vector<edge_t> > edges_to_prune;
    edges_to_prune.resize(get_thread_count());
    graph.for_each_handle([&](const handle_t& h) {
            // for the forward and reverse of this handle
            // walk k bases from the end, so that any kmer starting on the node will be represented in the tree we build
            for (auto handle_is_rev : { false, true }) {
                handle_t handle = handle_is_rev ? graph.flip(h) : h;
                list<walk_t> walks;

                // Step `walk` over every edge leaving walk.curr: either report the
                // edge for pruning or queue a copy of the walk that continues on
                // the neighbouring handle.
                auto branch_or_prune = [&](const walk_t& walk) {
                    // are we branching over more than one edge?
                    size_t next_count = 0;
                    graph.follow_edges(walk.curr, false, [&](const handle_t& next) { ++next_count; });
                    graph.follow_edges(walk.curr, false, [&](const handle_t& next) {
                            if (next_count > 1 && edge_max == walk.forks) { // our next step takes us over the max
                                int tid = omp_get_thread_num();
                                edges_to_prune[tid].push_back(graph.edge_handle(walk.curr, next));
                            } else {
                                walks.push_back(walk);
                                auto& todo = walks.back();
                                todo.curr = next;
                                if (next_count > 1) {
                                    ++todo.forks;
                                }
                            }
                        });
                };

                // for each position in the node, set up a walk with that start position and the node end or kmer length as the end position
                id_t handle_id = graph.get_id(handle);
                size_t handle_length = graph.get_length(handle);
                for (size_t i = 0; i < handle_length;  ++i) {
                    // NOTE(review): begin sits at handle_length rather than at i, so
                    // offset(end) - offset(begin) is zero once i + k reaches the node
                    // end and otherwise subtracts a larger offset from a smaller one.
                    // Presumably this implements "walk k bases from the end" - confirm
                    // the intent and the arithmetic type of offset().
                    pos_t begin = make_pos_t(handle_id, handle_is_rev, handle_length);
                    pos_t end = make_pos_t(handle_id, handle_is_rev, min(handle_length, i+k));
                    walk_t walk = walk_t(offset(end)-offset(begin), begin, end, handle, 0);
                    if (walk.length < k) {
                        branch_or_prune(walk);
                    } else {
                        walks.push_back(walk);
                    }
                }

                // Now expand the walks until they reach k. The list is drained as a
                // worklist: each walk is removed before it is processed, so no
                // iterator is ever advanced after an erase.
                while (!walks.empty()) {
                    walk_t walk = walks.front();
                    walks.pop_front();
                    // did we reach our target length?
                    if (walk.length >= k) {
                        continue;
                    }
                    id_t curr_id = graph.get_id(walk.curr);
                    size_t curr_length = graph.get_length(walk.curr);
                    bool curr_is_rev = graph.get_is_reverse(walk.curr);
                    size_t take = min(curr_length, k-walk.length);
                    walk.end = make_pos_t(curr_id, curr_is_rev, take);
                    walk.length += take;
                    if (walk.length < k) {
                        // still short: we went through the whole node, so follow on
                        branch_or_prune(walk);
                    }
                }
            }
        }, true);

    // Concatenate the per-thread buffers into one vector.
    uint64_t total_edges = 0;
    for (auto& v : edges_to_prune) total_edges += v.size();
    vector<edge_t> merged; merged.reserve(total_edges);
    for (auto& v : edges_to_prune) {
        merged.insert(merged.end(), v.begin(), v.end());
    }
    // duplicates are assumed to be dealt with externally
    return merged;
}
Ejemplo n.º 5
0
/// Extend every seed of a cluster into a maximal unambiguous match and return
/// the distinct matches as GaplessExtension objects.
///
/// Each seed is matched inside its initial node with match_forward() and
/// match_backward(). The match is then grown node by node in both directions
/// for as long as exactly one successor reported by follow_paths() continues
/// with the next read character.
///
/// @param cluster            seeds as (read offset, graph position) pairs;
///                           sorted in place unless `cluster_is_sorted`
/// @param sequence           read sequence the seeds refer to
/// @param cluster_is_sorted  true if `cluster` is already sorted
/// @return one extension per distinct non-empty match; empty if no graph is
///         attached
std::vector<GaplessExtension> GaplessExtender::maximal_extensions(cluster_type& cluster, const std::string& sequence, bool cluster_is_sorted) const {

    // Without a graph there is nothing to match against.
    if (this->graph == nullptr) {
        return std::vector<GaplessExtension>();
    }

    // Process the seeds in sorted order.
    seed_type prev(0, make_pos_t(0, false, 0));
    size_t prev_limit = 0; // Limit in the initial node.
    if (!cluster_is_sorted) {
        std::sort(cluster.begin(), cluster.end());
    }
    std::set<UnambiguousMatch> matches;
    for (size_t i = 0; i < cluster.size(); i++) {

        // Skip redundant seeds: shift the seed back as far as both the node
        // offset and the read offset allow, and compare with the previous one.
        seed_type normalized = cluster[i];
        size_t adjustment = std::min(static_cast<size_t>(offset(normalized.second)), normalized.first);
        normalized.first -= adjustment;
        get_offset(normalized.second) -= adjustment;
        if (normalized == prev && offset(cluster[i].second) < prev_limit) {
            continue;
        }
        prev = normalized;
        // prev_limit is updated later when we match the first node.

        // Match the initial node.
        handle_t handle = GBWTGraph::node_to_handle(pos_to_gbwt(cluster[i].second));
        UnambiguousMatch match {
            cluster[i].first, cluster[i].first,
            static_cast<size_t>(offset(cluster[i].second)), static_cast<size_t>(offset(cluster[i].second)),
            this->graph->get_bd_state(handle),
            { handle }
        };
        std::pair<const char*, size_t> node_view = this->graph->get_sequence_view(handle);
        match_forward(sequence, node_view, match);
        prev_limit = match.node_limit;
        match_backward(sequence, node_view, match);

        // Match forward while the match reaches the end of the current node
        // and the read is not exhausted.
        while (match.node_limit >= node_view.second && match.seq_limit < sequence.length()) {
            bool extension = false, ambiguous = false;
            gbwt::BidirectionalState successor;
            // NOTE(review): the callback's bool result presumably tells
            // follow_paths() whether to keep iterating - confirm.
            this->graph->follow_paths(match.state, false, [&](const gbwt::BidirectionalState& next_state) -> bool {
                if (ambiguous) {
                    return false;
                }
                if (next_state.empty()) {
                    return true;
                }
                handle_t next_handle = GBWTGraph::node_to_handle(next_state.forward.node);
                if (this->graph->starts_with(next_handle, sequence[match.seq_limit])) {
                    if (extension) {
                        // A second successor also fits: the extension is ambiguous.
                        ambiguous = true;
                        return false;
                    } else {
                        extension = true;
                        successor = next_state;
                        handle = next_handle;
                        return true;
                    }
                }
                return true;
            });
            if (extension && !ambiguous) {
                // starts_with() already checked the first base of the new node.
                node_view = this->graph->get_sequence_view(handle);
                match.seq_limit++;
                match.node_limit = 1;
                match.state = successor;
                match.path.push_back(handle);
                match_forward(sequence, node_view, match);
            } else {
                break;
            }
        }

        // Match backward while the match starts at the beginning of the
        // first node and there is read sequence left before it.
        while (match.node_start == 0 && match.seq_start > 0) {
            bool extension = false, ambiguous = false;
            gbwt::BidirectionalState successor;
            this->graph->follow_paths(match.state, true, [&](const gbwt::BidirectionalState& next_state) -> bool {
                if (ambiguous) {
                    return false;
                }
                if (next_state.empty()) {
                    return true;
                }
                handle_t next_handle = GBWTGraph::node_to_handle(gbwt::Node::reverse(next_state.backward.node));
                if (this->graph->ends_with(next_handle, sequence[match.seq_start - 1])) {
                    if (extension) {
                        // A second predecessor also fits: the extension is ambiguous.
                        ambiguous = true;
                        return false;
                    } else {
                        extension = true;
                        successor = next_state;
                        handle = next_handle;
                        return true;
                    }
                }
                return true;
            });
            if (extension && !ambiguous) {
                // ends_with() already checked the last base of the new node.
                node_view = this->graph->get_sequence_view(handle);
                match.seq_start--;
                match.node_start = node_view.second - 1;
                match.state = successor;
                match.path.insert(match.path.begin(), handle);
                match_backward(sequence, node_view, match);
            } else {
                break;
            }
        }

        // The set keeps one copy of each distinct match.
        if (!match.empty()) {
            matches.insert(match);
        }
    }

    // Convert the matches to GaplessExtension objects.
    std::vector<GaplessExtension> result;
    result.reserve(matches.size());
    for(const UnambiguousMatch& match : matches) {
        result.emplace_back(unambiguous_match_to_extension(match, *(this->graph), sequence));
    }
    return result;
}
Ejemplo n.º 6
0
/// Search for the best gapless match of the read that starts from one of the
/// seeds in `cluster`, and return it converted with match_to_extension().
///
/// The running best match starts with score `max_mismatches + 1`, and any
/// candidate whose score is not strictly below the current best is dropped.
/// Candidates are extended forward over all paths until they cover the end of
/// the read, then backward until they start at read offset 0. The function
/// returns as soon as a match with score 0 is found.
///
/// @param cluster            seeds as (read offset, graph position) pairs;
///                           sorted in place unless `cluster_is_sorted`
/// @param sequence           read sequence the seeds refer to
/// @param max_mismatches     bound used to initialise the best score
///                           (presumably the score counts mismatches - confirm)
/// @param cluster_is_sorted  true if `cluster` is already sorted
/// @return the conversion of the best match found, or of the initial
///         placeholder match if none beat it
GaplessExtension GaplessExtender::extend_seeds(cluster_type& cluster, const std::string& sequence, size_t max_mismatches, bool cluster_is_sorted) const {

    // Placeholder best match: empty, with a score one above the allowed maximum.
    GaplessMatch best_match {
        max_mismatches + 1,
        static_cast<size_t>(0), static_cast<size_t>(0),
        static_cast<size_t>(0),
        { },
        { }
    };
    // NOTE(review): this branch dereferences this->graph right after finding it
    // to be nullptr. Whether match_to_extension() touches the graph for an empty
    // match is not visible here - confirm and replace with a graph-free return.
    if (this->graph == nullptr) {
        return match_to_extension(best_match, *(this->graph), sequence);
    }

    // Process the seeds in sorted order.
    seed_type prev(0, make_pos_t(0, false, 0));
    if (!cluster_is_sorted) {
        std::sort(cluster.begin(), cluster.end());
    }
    for (size_t i = 0; i < cluster.size(); i++) {

        // Start matching as early in the initial node as possible.
        seed_type hit = cluster[i];
        size_t adjustment = std::min(static_cast<size_t>(offset(hit.second)), hit.first);
        get_offset(hit.second) -= adjustment;
        hit.first -= adjustment;
        if (hit == prev) {
            continue; // This seed was redundant.
        }
        prev = hit;

        // Match the initial node.
        // Two work queues: matches still to be extended forward, and matches
        // that cover the end of the read but still need extending backward.
        std::priority_queue<GaplessMatch> forward, backward;
        {
            handle_t handle = GBWTGraph::node_to_handle(pos_to_gbwt(hit.second));
            GaplessMatch match {
                static_cast<size_t>(0),
                hit.first, hit.first,
                static_cast<size_t>(offset(hit.second)),
                this->graph->get_bd_state(handle),
                { },
                { }            
            };
            match_forward(sequence, this->graph->get_sequence_view(handle), match.offset, match, best_match.score);
            // Drop the seed if it cannot beat the current best.
            if (match.score >= best_match.score) { 
                continue;
            } else {
                match.path.push_back(handle);
            }
            // Route the match: complete, needs backward extension, or needs
            // forward extension.
            if (match.limit >= sequence.length()) {
                if (match.start == 0) {
                    best_match = match;
                } else {
                    backward.push(match);
                }
            } else {
                forward.push(match);
            }
            // A score of 0 cannot be improved on; stop searching.
            if (best_match.score == 0) {
                return match_to_extension(best_match, *(this->graph), sequence);
            }
        }

        // Match forward over all paths.
        while (!forward.empty()) {
            GaplessMatch curr = forward.top();
            forward.pop();
            this->graph->follow_paths(curr.state, false, [&](const gbwt::BidirectionalState& next_state) -> bool {
                if (next_state.empty()) {
                    return true;
                }
                handle_t handle = GBWTGraph::node_to_handle(next_state.forward.node);
                // Start the successor from curr's score and read interval; its
                // path and mismatches are filled in only if it survives.
                GaplessMatch next {
                    curr.score,
                    curr.start, curr.limit,
                    curr.offset,
                    next_state,
                    { },
                    { }
                };
                match_forward(sequence, this->graph->get_sequence_view(handle), 0, next, best_match.score);
                if (next.score >= best_match.score) {
                    return true;
                } else {
                    // Path is curr's path followed by the new node.
                    next.path.reserve(curr.path.size() + 1);
                    next.path.insert(next.path.end(), curr.path.begin(), curr.path.end());
                    next.path.push_back(handle);
                    next.mismatches.insert(next.mismatches.end(), curr.mismatches.begin(), curr.mismatches.end());
                }
                if (next.limit >= sequence.length()) {
                    if (next.start == 0) {
                        best_match = next;
                    } else {
                        backward.push(next);
                    }
                } else {
                    forward.push(next);
                }
                return true;
            });
            if (best_match.score == 0) {
                return match_to_extension(best_match, *(this->graph), sequence);
            }
        }

        // Match backward over all paths.
        while (!backward.empty()) {
            GaplessMatch curr = backward.top();
            backward.pop();
            this->graph->follow_paths(curr.state, true, [&](const gbwt::BidirectionalState& next_state) -> bool {
                if (next_state.empty()) {
                    return true;
                }
                handle_t handle = GBWTGraph::node_to_handle(gbwt::Node::reverse(next_state.backward.node));
                GaplessMatch next {
                    curr.score,
                    curr.start, curr.limit,
                    curr.offset, // This will be replaced in match_backward().
                    next_state,
                    { },
                    { }
                };
                match_backward(sequence, this->graph->get_sequence_view(handle), next, best_match.score);
                if (next.score >= best_match.score) {
                    return true;
                } else {
                    // Path is the new node followed by curr's path.
                    next.path.reserve(curr.path.size() + 1);
                    next.path.push_back(handle);
                    next.path.insert(next.path.end(), curr.path.begin(), curr.path.end());
                    next.mismatches.insert(next.mismatches.end(), curr.mismatches.begin(), curr.mismatches.end());
                }
                // A match starting at read offset 0 is complete.
                if (next.start == 0) {
                    best_match = next;
                } else {
                    backward.push(next);
                }
                return true;
            });
            if (best_match.score == 0) {
                return match_to_extension(best_match, *(this->graph), sequence);
            }
        }
    }

    return match_to_extension(best_match, *(this->graph), sequence);
}