Alignment Sampler::mutate(const Alignment& aln, double base_error, double indel_error) {
    if (base_error == 0 && indel_error == 0) return aln;

    string bases = "ATGC";
    uniform_real_distribution<double> rprob(0, 1);
    uniform_int_distribution<int> rbase(0, 3);

    Alignment mutaln;
    for (size_t i = 0; i < aln.path().mapping_size(); ++i) {
        auto& orig_mapping = aln.path().mapping(i);
        Mapping* new_mapping = mutaln.mutable_path()->add_mapping();
        *new_mapping->mutable_position() = orig_mapping.position();
        // for each edit in the mapping
        for (size_t j = 0; j < orig_mapping.edit_size(); ++j) {
            auto& orig_edit = orig_mapping.edit(j);
            auto new_edits = mutate_edit(orig_edit, make_pos_t(orig_mapping.position()),
                                         base_error, indel_error,
                                         bases, rprob, rbase);
            for (auto& edit : new_edits) {
                *new_mapping->add_edit() = edit;
            }
        }
    }
    // re-derive the alignment's sequence
    mutaln = simplify(mutaln);
    mutaln.set_sequence(alignment_seq(mutaln));
    mutaln.set_name(aln.name());
    return mutaln;
}
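// Illustration only (not part of vg): mutate_edit() is called above but not shown,
// so the sketch below just demonstrates one plausible way such a helper could decide,
// per base, whether to substitute a character at the given error rate. Every name
// here is hypothetical except the distribution types reused from mutate().

#include <random>
#include <string>

static char maybe_substitute(char original, double base_error,
                             std::mt19937& rng,
                             std::uniform_real_distribution<double>& rprob,
                             std::uniform_int_distribution<int>& rbase) {
    static const std::string bases = "ATGC";
    if (rprob(rng) >= base_error) {
        return original;                  // no sequencing error at this base
    }
    char substituted = bases[rbase(rng)];
    while (substituted == original) {     // re-draw so the substitution actually differs
        substituted = bases[rbase(rng)];
    }
    return substituted;
}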
map<pos_t, char> Sampler::next_pos_chars(pos_t pos) {
    map<pos_t, char> nexts;
    // See if the node is cached (did we just visit it?)
    pair<Node, bool> cached = node_cache.retrieve(id(pos));
    if (!cached.second) {
        // If it's not in the cache, put it in
        cached.first = xgidx->node(id(pos));
        node_cache.put(id(pos), cached.first);
    }
    Node& node = cached.first;
    // if we are still in the node, return the next position and character
    if (offset(pos) < node.sequence().size()-1) {
        ++get_offset(pos);
        nexts[pos] = pos_char(pos);
    } else {
        // look at the next positions we could reach
        if (!is_rev(pos)) {
            // we are on the forward strand, the next things from this node come off the end
            for (auto& edge : xgidx->edges_on_end(id(pos))) {
                if (edge.from() == id(pos)) {
                    pos_t p = make_pos_t(edge.to(), edge.to_end(), 0);
                    nexts[p] = pos_char(p);
                } else if (edge.from_start() && edge.to_end() && edge.to() == id(pos)) {
                    // doubly inverted, should be normalized to forward but we handle here for safety
                    pos_t p = make_pos_t(edge.from(), false, 0);
                    nexts[p] = pos_char(p);
                }
            }
        } else {
            // we are on the reverse strand, the next things from this node come off the start
            for (auto& edge : xgidx->edges_on_start(id(pos))) {
                if (edge.to() == id(pos)) {
                    pos_t p = make_pos_t(edge.from(), !edge.from_start(), 0);
                    nexts[p] = pos_char(p);
                } else if (edge.from_start() && edge.to_end() && edge.from() == id(pos)) {
                    // doubly inverted, should be normalized to forward but we handle here for safety
                    pos_t p = make_pos_t(edge.to(), true, 0);
                    nexts[p] = pos_char(p);
                }
            }
        }
    }
    return nexts;
}
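// Usage sketch (not part of vg): next_pos_chars() returns every position reachable
// one base ahead of `pos`, keyed by position with that position's base as the value.
// A random walk can pick one entry uniformly, as below. Passing the rng explicitly
// is an assumption (vg's Sampler keeps its own); requires <iterator> for std::advance.

static bool random_step(Sampler& sampler, mt19937& rng, pos_t& pos, string& seq) {
    map<pos_t, char> nexts = sampler.next_pos_chars(pos);
    if (nexts.empty()) {
        return false;                            // dead end: nothing reachable from pos
    }
    uniform_int_distribution<size_t> pick(0, nexts.size() - 1);
    auto it = nexts.begin();
    std::advance(it, pick(rng));                 // choose one successor uniformly
    pos = it->first;                             // step to the chosen position
    seq += it->second;                           // record the base at that position
    return true;
}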
pos_t Sampler::position(void) {
    uniform_int_distribution<size_t> xdist(1, xgidx->seq_length);
    size_t offset = xdist(rng);
    id_t id = xgidx->node_at_seq_pos(offset);
    uniform_int_distribution<size_t> flip(0, 1);
    bool rev = forward_only ? false : flip(rng);
    // 1-0 base conversion
    size_t node_offset = offset - xgidx->node_start(id) - 1;
    return make_pos_t(id, rev, node_offset);
}
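// Self-contained check (illustration only) of the "1-0 base conversion" in position():
// xdist samples a 1-based offset into the concatenated node sequence, while make_pos_t()
// wants a 0-based offset within the node, hence `offset - node_start - 1`. This assumes
// node_start() is the 0-based start of the node, as the arithmetic above implies; the
// boundary values below are made up.

#include <cassert>
#include <cstddef>

int main() {
    std::size_t node_start = 10;  // 0-based start of the node in the concatenated sequence
    std::size_t offset = 11;      // sampled 1-based position: the node's first base
    std::size_t node_offset = offset - node_start - 1;
    assert(node_offset == 0);     // first base of the node maps to local offset 0
    return 0;
}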
vector<edge_t> find_edges_to_prune(const HandleGraph& graph, size_t k, size_t edge_max) {
    // for each position on the forward and reverse of the graph
    //unordered_set<edge_t> edges_to_prune;
    vector<vector<edge_t> > edges_to_prune;
    edges_to_prune.resize(get_thread_count());
    graph.for_each_handle([&](const handle_t& h) {
        // for the forward and reverse of this handle
        // walk k bases from the end, so that any kmer starting on the node will be represented in the tree we build
        for (auto handle_is_rev : { false, true }) {
            //cerr << "###########################################" << endl;
            handle_t handle = handle_is_rev ? graph.flip(h) : h;
            list<walk_t> walks;
            // for each position in the node, set up a kmer with that start position
            // and the node end or kmer length as the end position
            id_t handle_id = graph.get_id(handle);
            size_t handle_length = graph.get_length(handle);
            string handle_seq = graph.get_sequence(handle);
            for (size_t i = 0; i < handle_length; ++i) {
                // the walk begins at offset i in this node, not at the node end
                pos_t begin = make_pos_t(handle_id, handle_is_rev, i);
                pos_t end = make_pos_t(handle_id, handle_is_rev, min(handle_length, i+k));
                walk_t walk = walk_t(offset(end)-offset(begin), begin, end, handle, 0);
                if (walk.length < k) {
                    // are we branching over more than one edge?
                    size_t next_count = 0;
                    graph.follow_edges(walk.curr, false, [&](const handle_t& next) { ++next_count; });
                    graph.follow_edges(walk.curr, false, [&](const handle_t& next) {
                        if (next_count > 1 && edge_max == walk.forks) {
                            // our next step takes us over the max
                            int tid = omp_get_thread_num();
                            edges_to_prune[tid].push_back(graph.edge_handle(walk.curr, next));
                        } else {
                            walks.push_back(walk);
                            auto& todo = walks.back();
                            todo.curr = next;
                            if (next_count > 1) {
                                ++todo.forks;
                            }
                        }
                    });
                } else {
                    walks.push_back(walk);
                }
            }
            // now expand the kmers until they reach k
            while (!walks.empty()) {
                // first we check which ones have reached length k in the current handle;
                // for each of these we run the lambda and remove them from our list
                auto walks_end = walks.end();
                for (list<walk_t>::iterator q = walks.begin(); q != walks_end; ) {
                    auto& walk = *q;
                    // did we reach our target length?
                    if (walk.length >= k) {
                        q = walks.erase(q);
                    } else {
                        id_t curr_id = graph.get_id(walk.curr);
                        size_t curr_length = graph.get_length(walk.curr);
                        bool curr_is_rev = graph.get_is_reverse(walk.curr);
                        size_t take = min(curr_length, k-walk.length);
                        walk.end = make_pos_t(curr_id, curr_is_rev, take);
                        walk.length += take;
                        if (walk.length < k) {
                            // if not, we need to expand through the node then follow on
                            size_t next_count = 0;
                            graph.follow_edges(walk.curr, false, [&](const handle_t& next) { ++next_count; });
                            graph.follow_edges(walk.curr, false, [&](const handle_t& next) {
                                if (next_count > 1 && edge_max == walk.forks) {
                                    // our next step takes us over the max
                                    int tid = omp_get_thread_num();
                                    edges_to_prune[tid].push_back(graph.edge_handle(walk.curr, next));
                                } else {
                                    walks.push_back(walk);
                                    auto& todo = walks.back();
                                    todo.curr = next;
                                    if (next_count > 1) {
                                        ++todo.forks;
                                    }
                                }
                            });
                            q = walks.erase(q);
                        } else {
                            // reached length k; it will be removed on the next pass
                            ++q;
                        }
                    }
                }
            }
        }
    }, true);
    uint64_t total_edges = 0;
    for (auto& v : edges_to_prune) total_edges += v.size();
    vector<edge_t> merged;
    merged.reserve(total_edges);
    for (auto& v : edges_to_prune) {
        merged.insert(merged.end(), v.begin(), v.end());
    }
    // duplicates are assumed to be dealt with externally
    return merged;
}
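// Sketch (not part of vg): the comment above leaves duplicate edges for the caller
// to handle. One way to drop them, assuming handlegraph's as_integer() is available
// for ordering handle_t values; adjust if a comparator for edge_t already exists.

#include <algorithm>
#include <utility>
#include <vector>

static void deduplicate_edges(std::vector<edge_t>& edges) {
    auto key = [](const edge_t& e) {
        return std::make_pair(handlegraph::as_integer(e.first),
                              handlegraph::as_integer(e.second));
    };
    std::sort(edges.begin(), edges.end(),
              [&](const edge_t& a, const edge_t& b) { return key(a) < key(b); });
    edges.erase(std::unique(edges.begin(), edges.end(),
                            [&](const edge_t& a, const edge_t& b) { return key(a) == key(b); }),
                edges.end());
}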
std::vector<GaplessExtension> GaplessExtender::maximal_extensions(cluster_type& cluster, const std::string& sequence, bool cluster_is_sorted) const {
    // Process the seeds in sorted order.
    seed_type prev(0, make_pos_t(0, false, 0));
    size_t prev_limit = 0; // Limit in the initial node.
    if (!cluster_is_sorted) {
        std::sort(cluster.begin(), cluster.end());
    }
    std::set<UnambiguousMatch> matches;
    for (size_t i = 0; i < cluster.size(); i++) {
        // Skip redundant seeds.
        seed_type normalized = cluster[i];
        size_t adjustment = std::min(static_cast<size_t>(offset(normalized.second)), normalized.first);
        normalized.first -= adjustment;
        get_offset(normalized.second) -= adjustment;
        if (normalized == prev && offset(cluster[i].second) < prev_limit) {
            continue;
        }
        prev = normalized; // prev_limit is updated later when we match the first node.

        // Match the initial node.
        handle_t handle = GBWTGraph::node_to_handle(pos_to_gbwt(cluster[i].second));
        UnambiguousMatch match {
            cluster[i].first, cluster[i].first,
            static_cast<size_t>(offset(cluster[i].second)), static_cast<size_t>(offset(cluster[i].second)),
            this->graph->get_bd_state(handle),
            { handle }
        };
        std::pair<const char*, size_t> node_view = this->graph->get_sequence_view(handle);
        match_forward(sequence, node_view, match);
        prev_limit = match.node_limit;
        match_backward(sequence, node_view, match);

        // Match forward.
        while (match.node_limit >= node_view.second && match.seq_limit < sequence.length()) {
            bool extension = false, ambiguous = false;
            gbwt::BidirectionalState successor;
            this->graph->follow_paths(match.state, false, [&](const gbwt::BidirectionalState& next_state) -> bool {
                if (ambiguous) {
                    return false;
                }
                if (next_state.empty()) {
                    return true;
                }
                handle_t next_handle = GBWTGraph::node_to_handle(next_state.forward.node);
                if (this->graph->starts_with(next_handle, sequence[match.seq_limit])) {
                    if (extension) {
                        ambiguous = true;
                        return false;
                    } else {
                        extension = true;
                        successor = next_state;
                        handle = next_handle;
                        return true;
                    }
                }
                return true;
            });
            if (extension && !ambiguous) {
                node_view = this->graph->get_sequence_view(handle);
                match.seq_limit++;
                match.node_limit = 1;
                match.state = successor;
                match.path.push_back(handle);
                match_forward(sequence, node_view, match);
            } else {
                break;
            }
        }

        // Match backward.
        while (match.node_start == 0 && match.seq_start > 0) {
            bool extension = false, ambiguous = false;
            gbwt::BidirectionalState successor;
            this->graph->follow_paths(match.state, true, [&](const gbwt::BidirectionalState& next_state) -> bool {
                if (ambiguous) {
                    return false;
                }
                if (next_state.empty()) {
                    return true;
                }
                handle_t next_handle = GBWTGraph::node_to_handle(gbwt::Node::reverse(next_state.backward.node));
                if (this->graph->ends_with(next_handle, sequence[match.seq_start - 1])) {
                    if (extension) {
                        ambiguous = true;
                        return false;
                    } else {
                        extension = true;
                        successor = next_state;
                        handle = next_handle;
                        return true;
                    }
                }
                return true;
            });
            if (extension && !ambiguous) {
                node_view = this->graph->get_sequence_view(handle);
                match.seq_start--;
                match.node_start = node_view.second - 1;
                match.state = successor;
                match.path.insert(match.path.begin(), handle);
                match_backward(sequence, node_view, match);
            } else {
                break;
            }
        }

        if (!match.empty()) {
            matches.insert(match);
        }
    }

    // Convert the matches to GaplessExtension objects.
    std::vector<GaplessExtension> result;
    result.reserve(matches.size());
    for (const UnambiguousMatch& match : matches) {
        result.emplace_back(unambiguous_match_to_extension(match, *(this->graph), sequence));
    }
    return result;
}
GaplessExtension GaplessExtender::extend_seeds(cluster_type& cluster, const std::string& sequence, size_t max_mismatches, bool cluster_is_sorted) const {
    GaplessMatch best_match {
        max_mismatches + 1,
        static_cast<size_t>(0), static_cast<size_t>(0), static_cast<size_t>(0),
        { }, { }
    };
    if (this->graph == nullptr) {
        // No graph to search: return an empty (default-constructed) extension
        // instead of dereferencing a null graph pointer.
        return GaplessExtension();
    }

    // Process the seeds in sorted order.
    seed_type prev(0, make_pos_t(0, false, 0));
    if (!cluster_is_sorted) {
        std::sort(cluster.begin(), cluster.end());
    }
    for (size_t i = 0; i < cluster.size(); i++) {
        // Start matching as early in the initial node as possible.
        seed_type hit = cluster[i];
        size_t adjustment = std::min(static_cast<size_t>(offset(hit.second)), hit.first);
        get_offset(hit.second) -= adjustment;
        hit.first -= adjustment;
        if (hit == prev) {
            continue; // This seed was redundant.
        }
        prev = hit;

        // Match the initial node.
        std::priority_queue<GaplessMatch> forward, backward;
        {
            handle_t handle = GBWTGraph::node_to_handle(pos_to_gbwt(hit.second));
            GaplessMatch match {
                static_cast<size_t>(0),
                hit.first, hit.first,
                static_cast<size_t>(offset(hit.second)),
                this->graph->get_bd_state(handle),
                { }, { }
            };
            match_forward(sequence, this->graph->get_sequence_view(handle), match.offset, match, best_match.score);
            if (match.score >= best_match.score) {
                continue;
            } else {
                match.path.push_back(handle);
            }
            if (match.limit >= sequence.length()) {
                if (match.start == 0) {
                    best_match = match;
                } else {
                    backward.push(match);
                }
            } else {
                forward.push(match);
            }
            if (best_match.score == 0) {
                return match_to_extension(best_match, *(this->graph), sequence);
            }
        }

        // Match forward over all paths.
        while (!forward.empty()) {
            GaplessMatch curr = forward.top();
            forward.pop();
            this->graph->follow_paths(curr.state, false, [&](const gbwt::BidirectionalState& next_state) -> bool {
                if (next_state.empty()) {
                    return true;
                }
                handle_t handle = GBWTGraph::node_to_handle(next_state.forward.node);
                GaplessMatch next {
                    curr.score, curr.start, curr.limit, curr.offset,
                    next_state,
                    { }, { }
                };
                match_forward(sequence, this->graph->get_sequence_view(handle), 0, next, best_match.score);
                if (next.score >= best_match.score) {
                    return true;
                } else {
                    next.path.reserve(curr.path.size() + 1);
                    next.path.insert(next.path.end(), curr.path.begin(), curr.path.end());
                    next.path.push_back(handle);
                    next.mismatches.insert(next.mismatches.end(), curr.mismatches.begin(), curr.mismatches.end());
                }
                if (next.limit >= sequence.length()) {
                    if (next.start == 0) {
                        best_match = next;
                    } else {
                        backward.push(next);
                    }
                } else {
                    forward.push(next);
                }
                return true;
            });
            if (best_match.score == 0) {
                return match_to_extension(best_match, *(this->graph), sequence);
            }
        }

        // Match backward over all paths.
        while (!backward.empty()) {
            GaplessMatch curr = backward.top();
            backward.pop();
            this->graph->follow_paths(curr.state, true, [&](const gbwt::BidirectionalState& next_state) -> bool {
                if (next_state.empty()) {
                    return true;
                }
                handle_t handle = GBWTGraph::node_to_handle(gbwt::Node::reverse(next_state.backward.node));
                GaplessMatch next {
                    curr.score, curr.start, curr.limit,
                    curr.offset, // This will be replaced in match_backward().
                    next_state,
                    { }, { }
                };
                match_backward(sequence, this->graph->get_sequence_view(handle), next, best_match.score);
                if (next.score >= best_match.score) {
                    return true;
                } else {
                    next.path.reserve(curr.path.size() + 1);
                    next.path.push_back(handle);
                    next.path.insert(next.path.end(), curr.path.begin(), curr.path.end());
                    next.mismatches.insert(next.mismatches.end(), curr.mismatches.begin(), curr.mismatches.end());
                }
                if (next.start == 0) {
                    best_match = next;
                } else {
                    backward.push(next);
                }
                return true;
            });
            if (best_match.score == 0) {
                return match_to_extension(best_match, *(this->graph), sequence);
            }
        }
    }

    return match_to_extension(best_match, *(this->graph), sequence);
}
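// Usage sketch (not part of vg): how a caller could drive the two extension methods
// above. The seed layout (a read offset paired with a pos_t) follows the code above;
// the already-constructed extender and the concrete cluster_type container are
// assumptions, so treat this as illustrative rather than the library's API.

static void extend_cluster_example(const GaplessExtender& extender,
                                   GaplessExtender::cluster_type& cluster,
                                   const std::string& read_sequence) {
    // Single best full-length extension, allowing up to 4 mismatches.
    GaplessExtension best = extender.extend_seeds(cluster, read_sequence, 4, false);

    // All maximal unambiguous extensions of the same seeds. extend_seeds() has
    // already sorted the cluster in place, so the sort can be skipped here.
    std::vector<GaplessExtension> maximal =
        extender.maximal_extensions(cluster, read_sequence, true);

    (void) best;
    (void) maximal;
}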