/// Look up the handle for the node a Visit refers to, in the orientation the
/// Visit records. Forwards to the (node id, is-reverse) overload of
/// get_handle using the visit's node_id() and backward() fields.
handle_t HandleGraph::get_handle(const Visit& visit) const {
    // Pull the two fields out of the visit first, then delegate.
    const auto visited_id = visit.node_id();
    const bool visited_in_reverse = visit.backward();
    return get_handle(visited_id, visited_in_reverse);
}
/**
 * Run a single simplification pass over the graph.
 *
 * Finds all leaf snarls, and for every non-empty leaf whose internal sequence
 * totals fewer than min_size bases, picks the first traversal the traversal
 * finder reports, rewrites every embedded path that touches the site so that
 * it follows that traversal, and then destroys the edges and nodes of the
 * site that are not on it. BED features are notified of each path edit
 * through features.on_path_edit().
 *
 * The process is terminated with exit(1) if the graph fails validation, if a
 * path is found leaving the site other than through its boundary, or if a
 * path steps between two nodes that have no connecting edge.
 *
 * @param iteration Number of this pass; used only in progress/error messages.
 * @return (number of nodes destroyed, number of edges destroyed) in this pass.
 */
pair<size_t, size_t> Simplifier::simplify_once(size_t iteration) {

    // Set up the deleted node and edge counts
    pair<size_t, size_t> to_return {0, 0};
    auto& deleted_nodes = to_return.first;
    auto& deleted_edges = to_return.second;

    if (!graph.is_valid(true, true, true, true)) {
        // Make sure the graph is valid and not missing nodes or edges
        cerr << "error:[vg::Simplifier] Invalid graph on iteration " << iteration << endl;
        exit(1);
    }

    // Make a list of leaf sites
    list<const Snarl*> leaves;

    if (show_progress) {
        cerr << "Iteration " << iteration << ": Scanning " << graph.node_count() << " nodes and "
            << graph.edge_count() << " edges for sites..." << endl;
    }

    for (const Snarl* top_level_site : site_manager.top_level_snarls()) {
        // Walk the snarl tree under each top-level snarl breadth-first
        list<const Snarl*> queue {top_level_site};

        while (!queue.empty()) {
            const Snarl* site = queue.front();
            queue.pop_front();

            if (site_manager.is_leaf(site)) {
                // It's a leaf. Filter it out if it is trivial
                if (site->type() == ULTRABUBBLE) {
                    auto contents = site_manager.shallow_contents(site, graph, false);
                    if (contents.first.empty()) {
                        // Nothing but the boundary nodes in this snarl
                        continue;
                    }
                }

                // Not trivial. Keep it.
                leaves.push_back(site);
            } else {
                for (const Snarl* child_site : site_manager.children_of(site)) {
                    queue.push_back(child_site);
                }
            }
        }
    }

    if (show_progress) {
        cerr << "Found " << leaves.size() << " leaves" << endl;
    }

    // Index all the graph paths
    map<string, unique_ptr<PathIndex>> path_indexes;
    graph.paths.for_each_name([&](const string& name) {
        // For every path name, go index it and put it in this collection
        path_indexes.insert(make_pair(name, move(unique_ptr<PathIndex>(new PathIndex(graph, name)))));
    });

    // Now we have a list of all the leaf sites.
    create_progress("simplifying leaves", leaves.size());

    // We can't use the SnarlManager after we modify the graph, so we load the
    // contents of all the leaves we're going to modify first.
    map<const Snarl*, pair<unordered_set<Node*>, unordered_set<Edge*>>> leaf_contents;

    // How big is each leaf in bp
    map<const Snarl*, size_t> leaf_sizes;

    // We also need to pre-calculate the traversals for the snarls that are the
    // right size, since the traversal finder uses the snarl manager and might
    // not work if we modify the graph.
    map<const Snarl*, vector<SnarlTraversal>> leaf_traversals;

    for (const Snarl* leaf : leaves) {
        // Look at all the leaves

        // Get the contents of the bubble, excluding the boundary nodes
        leaf_contents[leaf] = site_manager.deep_contents(leaf, graph, false);

        // For each leaf, calculate its total size.
        unordered_set<Node*>& nodes = leaf_contents[leaf].first;
        size_t& total_size = leaf_sizes[leaf];
        for (Node* node : nodes) {
            // For each node include it in the size figure
            total_size += node->sequence().size();
        }

        if (total_size == 0) {
            // This site is just the start and end nodes, so it doesn't make
            // sense to try and remove it.
            continue;
        }

        if (total_size >= min_size) {
            // This site is too big to remove
            continue;
        }

        // Identify the replacement traversal for the bubble if it's the right size.
        // We can't necessarily do this after we've modified the graph.
        vector<SnarlTraversal>& traversals = leaf_traversals[leaf];
        traversals = traversal_finder.find_traversals(*leaf);
    }

    for (const Snarl* leaf : leaves) {
        // Look at all the leaves

        // Get the contents of the bubble, excluding the boundary nodes
        unordered_set<Node*>& nodes = leaf_contents[leaf].first;
        unordered_set<Edge*>& edges = leaf_contents[leaf].second;

        // For each leaf, grab its total size.
        size_t& total_size = leaf_sizes[leaf];

        if (total_size == 0) {
            // This site is just the start and end nodes, so it doesn't make
            // sense to try and remove it.
            continue;
        }

        if (total_size >= min_size) {
            // This site is too big to remove
            continue;
        }

#ifdef debug
        cerr << "Found " << total_size << " bp leaf" << endl;
        for (auto* node : nodes) {
            cerr << "\t" << node->id() << ": " << node->sequence() << endl;
        }
#endif

        // Otherwise we want to simplify this site away

        // Grab the replacement traversal for the bubble
        vector<SnarlTraversal>& traversals = leaf_traversals[leaf];

        if (traversals.empty()) {
            // We couldn't find any paths through the site.
            continue;
        }

        // Get the traversal out of the vector
        SnarlTraversal& traversal = traversals.front();

        // Number of visits as an unsigned value, so that the "all but the
        // last" bounds below can be written without subtracting from it. A
        // traversal with no visits is possible (it is handled as a deletion
        // further down), and "size - 1" would wrap around when compared
        // against an unsigned index.
        const size_t visit_count = static_cast<size_t>(traversal.visit_size());

        // Determine the length of the new traversal
        size_t new_site_length = 0;
        for (size_t i = 1; i + 1 < visit_count; i++) {
            // For every non-anchoring node
            const Visit& visit = traversal.visit(i);

            // Total up the lengths of all the nodes that are newly visited.
            assert(visit.node_id());
            new_site_length += graph.get_node(visit.node_id())->sequence().size();
        }

#ifdef debug
        cerr << "Chosen traversal is " << new_site_length << " bp" << endl;
#endif

        // Now we have to rewrite paths that visit nodes/edges not on this
        // traversal, or in a different order, or whatever. To be safe we'll
        // just rewrite all paths.

        // Find all the paths that traverse this region.

        // We start at the start node. Copy out all the mapping pointers on that
        // node, so we can go through them while tampering with them.
        map<string, set<Mapping*> > mappings_by_path = graph.paths.get_node_mapping(graph.get_node(leaf->start().node_id()));

        // It's possible a path can enter the site through the end node and
        // never hit the start. So we're going to trim those back before we
        // delete nodes and edges.
        map<string, set<Mapping*> > end_mappings_by_path = graph.paths.get_node_mapping(graph.get_node(leaf->end().node_id()));

        if (!drop_hairpin_paths) {
            // We shouldn't drop paths if they hairpin and can't be represented
            // in a simplified bubble. So we instead have to not simplify
            // bubbles that would have that problem.
            bool found_hairpin = false;

            for (auto& kv : mappings_by_path) {
                // For each path that hits the start node

                if (found_hairpin) {
                    // We only care if there are 1 or more hairpins, not how many
                    break;
                }

                // Unpack the name
                auto& path_name = kv.first;

                for (Mapping* start_mapping : kv.second) {
                    // For each visit to the start node

                    if (found_hairpin) {
                        // We only care if there are 1 or more hairpins, not how many
                        break;
                    }

                    // Determine what orientation we're going to scan in
                    bool backward = start_mapping->position().is_reverse();

                    // Start at the start node
                    Mapping* here = start_mapping;

                    while (here) {
                        // Until we hit the start/end of the path or the mapping we want

                        if (here->position().node_id() == leaf->end().node_id() &&
                            here->position().is_reverse() == (leaf->end().backward() != backward)) {
                            // We made it out.

                            // Stop scanning!
                            break;
                        }

                        if (here->position().node_id() == leaf->start().node_id() &&
                            here->position().is_reverse() != (leaf->start().backward() != backward)) {
                            // We have encountered the start node with an incorrect orientation.
                            cerr << "warning:[vg simplify] Path " << path_name
                                << " doubles back through start of site "
                                << to_node_traversal(leaf->start(), graph) << " - "
                                << to_node_traversal(leaf->end(), graph) << "; skipping site!" << endl;

                            found_hairpin = true;
                            break;
                        }

                        // Scan left along the path if we found the site start backwards, and right if we found it forwards.
                        here = backward ? graph.paths.traverse_left(here) : graph.paths.traverse_right(here);
                    }
                }
            }

            for (auto& kv : end_mappings_by_path) {
                // For each path that hits the end node

                if (found_hairpin) {
                    // We only care if there are 1 or more hairpins, not how many
                    break;
                }

                // Unpack the name
                auto& path_name = kv.first;

                for (Mapping* end_mapping : kv.second) {

                    if (found_hairpin) {
                        // We only care if there are 1 or more hairpins, not how many
                        break;
                    }

                    // Determine what orientation we're going to scan in
                    bool backward = end_mapping->position().is_reverse();

                    // Start at the end
                    Mapping* here = end_mapping;

                    while (here) {

                        if (here->position().node_id() == leaf->start().node_id() &&
                            here->position().is_reverse() == (leaf->start().backward() != backward)) {
                            // We made it out.

                            // Stop scanning!
                            break;
                        }

                        if (here->position().node_id() == leaf->end().node_id() &&
                            here->position().is_reverse() != (leaf->end().backward() != backward)) {
                            // We have encountered the end node with an incorrect orientation.
                            cerr << "warning:[vg simplify] Path " << path_name
                                << " doubles back through end of site "
                                << to_node_traversal(leaf->start(), graph) << " - "
                                << to_node_traversal(leaf->end(), graph) << "; dropping site!" << endl;

                            found_hairpin = true;
                            break;
                        }

                        // Scan right along the path if we found the site end backwards, and left if we found it forwards.
                        here = backward ? graph.paths.traverse_right(here) : graph.paths.traverse_left(here);
                    }
                }
            }

            if (found_hairpin) {
                // We found a hairpin, so we want to skip the site.
                cerr << "warning:[vg simplify] Site " << to_node_traversal(leaf->start(), graph)
                    << " - " << to_node_traversal(leaf->end(), graph) << " skipped due to hairpin path." << endl;
                continue;
            }
        }

        // We'll keep a set of the end mappings we managed to find, starting from the start
        set<Mapping*> found_end_mappings;

        for (auto& kv : mappings_by_path) {
            // For each path that hits the start node

            // Unpack the name
            auto& path_name = kv.first;

            // If a path can't be represented after a bubble is popped
            // (because the path reversed and came out the same side as it
            // went in), we just clobber the path entirely. TODO: handle
            // out-the-same-side traversals as valid genotypes somehow..
            bool kill_path = false;

            for (Mapping* start_mapping : kv.second) {
                // For each visit to the start node

                // Determine what orientation we're going to scan in
                bool backward = start_mapping->position().is_reverse();

                // We're going to fill this list with the mappings we need to
                // remove and replace in this path for this traversal. Initially
                // runs from start of site to end of site, but later gets
                // flipped into path-local orientation.
                list<Mapping*> existing_mappings;

                // Tracing along forward/backward from each as appropriate, see
                // if the end of the site is found in the expected orientation
                // (or if the path ends first).
                bool found_end = false;
                Mapping* here = start_mapping;

                // We want to remember the end mapping when we find it
                Mapping* end_mapping = nullptr;

#ifdef debug
                cerr << "Scanning " << path_name << " from " << pb2json(*here)
                    << " for " << to_node_traversal(leaf->end(), graph) << " orientation " << backward << endl;
#endif

                while (here) {
                    // Until we hit the start/end of the path or the mapping we want

#ifdef debug
                    cerr << "\tat " << pb2json(*here) << endl;
#endif

                    if (here->position().node_id() == leaf->end().node_id() &&
                        here->position().is_reverse() == (leaf->end().backward() != backward)) {
                        // We have encountered the end of the site in the
                        // orientation we expect, given the orientation we saw
                        // for the start.

                        found_end = true;
                        end_mapping = here;

                        // Know we got to this mapping at the end from the
                        // start, so we don't need to clobber everything
                        // before it.
                        found_end_mappings.insert(here);

                        // Stop scanning!
                        break;
                    }

                    if (here->position().node_id() == leaf->start().node_id() &&
                        here->position().is_reverse() != (leaf->start().backward() != backward)) {
                        // We have encountered the start node with an incorrect orientation.
                        cerr << "warning:[vg simplify] Path " << path_name
                            << " doubles back through start of site "
                            << to_node_traversal(leaf->start(), graph) << " - "
                            << to_node_traversal(leaf->end(), graph) << "; dropping!" << endl;

                        assert(drop_hairpin_paths);
                        kill_path = true;
                        break;
                    }

                    // NOTE(review): `nodes` is described above as excluding the
                    // boundary nodes, yet on the first iteration `here` is the
                    // mapping to the start node itself. Confirm whether this
                    // check is meant to apply to the start mapping.
                    if (!nodes.count(graph.get_node(here->position().node_id()))) {
                        // We really should stay inside the site!
                        cerr << "error:[vg simplify] Path " << path_name
                            << " somehow escapes site " << to_node_traversal(leaf->start(), graph)
                            << " - " << to_node_traversal(leaf->end(), graph) << endl;

                        exit(1);
                    }

                    if (here != start_mapping) {
                        // Remember the mappings that aren't to the start or
                        // end of the site, so we can remove them later.
                        existing_mappings.push_back(here);
                    }

                    // Scan left along the path if we found the site start backwards, and right if we found it forwards.
                    Mapping* next = backward ? graph.paths.traverse_left(here) : graph.paths.traverse_right(here);

                    if (next == nullptr) {
                        // We hit the end of the path without finding the end of the site.
                        // We've found all the existing mappings, so we can stop.
                        break;
                    }

                    // Make into NodeTraversals
                    NodeTraversal here_traversal(graph.get_node(here->position().node_id()), here->position().is_reverse());
                    NodeTraversal next_traversal(graph.get_node(next->position().node_id()), next->position().is_reverse());

                    if (backward) {
                        // We're scanning the other way
                        std::swap(here_traversal, next_traversal);
                    }

                    // Make sure we have an edge so we can traverse this node and then the node we're going to.
                    if (graph.get_edge(here_traversal, next_traversal) == nullptr) {
                        cerr << "error:[vg::Simplifier] No edge " << here_traversal << " to " << next_traversal << endl;
                        exit(1);
                    }

                    here = next;
                }

                if (kill_path) {
                    // This path can't exist after we pop this bubble.
                    break;
                }

                if (!found_end) {
                    // This path only partly traverses the site, and is
                    // anchored at the start. Remove the part inside the site.
                    // TODO: let it stay if it matches the one true traversal.

                    for (auto* mapping : existing_mappings) {
                        // Trim the path out of the site
                        graph.paths.remove_mapping(mapping);
                    }

                    // TODO: update feature positions if we trim off the start of a path

                    // Maybe the next time the path visits the site it will go
                    // all the way through.
                    continue;
                }

                // If we found the end, remove all the mappings encountered, in
                // order so that the last one removed is the last one along the
                // path.
                if (backward) {
                    // Make sure the last mapping in the list is the last
                    // mapping to occur along the path.
                    existing_mappings.reverse();
                }

                // Where does the variable region of the site start for this
                // traversal of the path? If there are no existing mappings,
                // it's the start mapping's position if we traverse the site
                // backwards and the end mapping's position if we traverse
                // the site forwards. If there are existing mappings, it's
                // the first existing mapping's position in the path. TODO:
                // This is super ugly. Can we view the site in path
                // coordinates or something?
                PathIndex& path_index = *path_indexes.at(path_name).get();
                Mapping* mapping_after_first = existing_mappings.empty() ?
                    (backward ? start_mapping : end_mapping) : existing_mappings.front();
                assert(path_index.mapping_positions.count(mapping_after_first));
                size_t variable_start = path_index.mapping_positions.at(mapping_after_first);

                // Determine the total length of the old traversal of the site
                size_t old_site_length = 0;
                for (auto* mapping : existing_mappings) {
                    // Add in the lengths of all the mappings that will get
                    // removed.
                    old_site_length += mapping_from_length(*mapping);
                }

#ifdef debug
                cerr << "Replacing " << old_site_length << " bp at " << variable_start
                    << " with " << new_site_length << " bp" << endl;
#endif

                // Actually update any BED features
                features.on_path_edit(path_name, variable_start, old_site_length, new_site_length);

                // Where will we insert the new site traversal into the path?
                list<Mapping>::iterator insert_position;

                if (!existing_mappings.empty()) {
                    // If there are existing internal mappings, we'll insert right where they were

                    for (auto* mapping : existing_mappings) {
                        // Remove each mapping from left to right along the
                        // path, saving the position after the mapping we just
                        // removed. At the end we'll have the position of the
                        // mapping to the end of the site.

#ifdef debug
                        cerr << path_name << ": Drop mapping " << pb2json(*mapping) << endl;
#endif

                        insert_position = graph.paths.remove_mapping(mapping);
                    }
                } else {
                    // Otherwise we'll insert right before the mapping to
                    // the start or end of the site (whichever occurs last
                    // along the path)
                    insert_position = graph.paths.find_mapping(backward ? start_mapping : here);
                }

                // Make sure we're going to insert starting from the correct end of the site.
                if (backward) {
                    assert(insert_position->position().node_id() == leaf->start().node_id());
                } else {
                    assert(insert_position->position().node_id() == leaf->end().node_id());
                }

                // Loop through the internal visits in the canonical
                // traversal backwards along the path we are splicing. If
                // it's a forward path this is just right to left, but if
                // it's a reverse path it has to be left to right.
                // NOTE(review): this loop splices in every visit of the
                // traversal, whereas the length computation above skips the
                // first and last visit as anchors. Confirm whether the
                // traversal is expected to contain the boundary visits.
                for (size_t i = 0; i < visit_count; i++) {
                    // Find the visit we need next, as a function of which
                    // way we need to insert this run of visits. Normally we
                    // go through the visits right to left, but when we have
                    // a backward path we go left to right.
                    const Visit& visit = backward ? traversal.visit(i)
                                                  : traversal.visit(visit_count - i - 1);

                    // Make a Mapping to represent it
                    Mapping new_mapping;
                    new_mapping.mutable_position()->set_node_id(visit.node_id());
                    // We hit this node backward if it's backward along the
                    // traversal, xor if we are traversing the traversal
                    // backward
                    new_mapping.mutable_position()->set_is_reverse(visit.backward() != backward);

                    // Add an edit
                    Edit* edit = new_mapping.add_edit();
                    size_t node_seq_length = graph.get_node(visit.node_id())->sequence().size();
                    edit->set_from_length(node_seq_length);
                    edit->set_to_length(node_seq_length);

#ifdef debug
                    cerr << path_name << ": Add mapping " << pb2json(new_mapping) << endl;
#endif

                    // Insert the mapping in the path, moving right to left
                    insert_position = graph.paths.insert_mapping(insert_position, path_name, new_mapping);
                }

                // Now we've corrected this site on this path. Update its index.
                // TODO: right now this means retracing the entire path.
                path_indexes[path_name].get()->update_mapping_positions(graph, path_name);
            }

            if (kill_path) {
                // Destroy the path completely, because it needs to reverse
                // inside a site that we have popped.
                graph.paths.remove_path(path_name);
            }
        }

        for (auto& kv : end_mappings_by_path) {
            // Now we handle the end mappings not reachable from the start. For each path that touches the end...

            // Unpack the name
            auto& path_name = kv.first;

            // We might have to kill the path, if it reverses inside a
            // bubble we're popping
            bool kill_path = false;

            for (Mapping* end_mapping : kv.second) {

                if (found_end_mappings.count(end_mapping)) {
                    // Skip the traversals of the site that we handled.
                    continue;
                }

                // Now we're left with paths that leave the site but don't
                // enter. We're going to clobber everything before the path
                // leaves the site.

                // Determine what orientation we're going to scan in
                bool backward = end_mapping->position().is_reverse();

                // Start at the end
                Mapping* here = end_mapping;

                // Keep a list of mappings we need to remove
                list<Mapping*> to_remove;

                while (here) {

                    if (here->position().node_id() == leaf->end().node_id() &&
                        here->position().is_reverse() != (leaf->end().backward() != backward)) {
                        // We have encountered the end node with an incorrect orientation.
                        cerr << "warning:[vg simplify] Path " << path_name
                            << " doubles back through end of site "
                            << to_node_traversal(leaf->start(), graph) << " - "
                            << to_node_traversal(leaf->end(), graph) << "; dropping!" << endl;

                        assert(drop_hairpin_paths);
                        kill_path = true;
                        break;
                    }

                    // Say we should remove the mapping.
                    to_remove.push_back(here);

                    // Scan right along the path if we found the site end backwards, and left if we found it forwards.
                    here = backward ? graph.paths.traverse_right(here) : graph.paths.traverse_left(here);

                    // Eventually we should hit the end of the path, or the
                    // end of the site, since we don't hit the start.
                }

                if (kill_path) {
                    // Just go kill the whole path
                    break;
                }

                for (auto* mapping : to_remove) {
                    // Get rid of all the mappings once we're done tracing them out.
                    graph.paths.remove_mapping(mapping);
                }
            }

            if (kill_path) {
                // Destroy the path completely, because it needs to reverse
                // inside a site that we have popped.
                graph.paths.remove_path(path_name);
            }
        }

        // Now delete all edges that aren't connecting adjacent nodes on the
        // blessed traversal (before we delete their nodes).
        set<Edge*> blessed_edges;
        for (int i = 0; i + 1 < traversal.visit_size(); ++i) {
            // For each visit and the visit that follows it along the traversal

            const Visit visit = traversal.visit(i);
            // The successor is at i + 1; reading index i again would look up
            // an edge from the node to itself instead of to the next node.
            const Visit next = traversal.visit(i + 1);

            // Find the edge between them
            NodeTraversal here(graph.get_node(visit.node_id()), visit.backward());
            NodeTraversal next_traversal(graph.get_node(next.node_id()), next.backward());
            Edge* edge = graph.get_edge(here, next_traversal);
            assert(edge != nullptr);

            // Remember we need it
            blessed_edges.insert(edge);
        }

        // Also get the edges from the boundary nodes into the traversal
        if (traversal.visit_size() > 0) {
            NodeTraversal first_visit = to_node_traversal(traversal.visit(0), graph);
            NodeTraversal last_visit = to_node_traversal(traversal.visit(traversal.visit_size() - 1), graph);
            blessed_edges.insert(graph.get_edge(to_node_traversal(leaf->start(), graph), first_visit));
            blessed_edges.insert(graph.get_edge(last_visit, to_node_traversal(leaf->end(), graph)));
        } else {
            // This is a deletion traversal, so get the edge from the start to end of the site
            blessed_edges.insert(graph.get_edge(to_node_traversal(leaf->start(), graph),
                                                to_node_traversal(leaf->end(), graph)));
        }

        for (auto* edge : edges) {
            if (!blessed_edges.count(edge)) {
                // Get rid of all the edges not needed for the one true traversal
#ifdef debug
                cerr << to_node_traversal(leaf->start(), graph) << " - "
                    << to_node_traversal(leaf->end(), graph) << ": Delete edge: "
                    << pb2json(*edge) << endl;
#endif
                graph.destroy_edge(edge);
                deleted_edges++;
            }
        }

        // Now delete all the nodes that aren't on the blessed traversal.

        // What nodes are on it?
        set<Node*> blessed_nodes;
        for (int i = 0; i < traversal.visit_size(); i++) {
            const Visit& visit = traversal.visit(i);
            blessed_nodes.insert(graph.get_node(visit.node_id()));
        }

        for (auto* node : nodes) {
            // For every node in the site
            if (!blessed_nodes.count(node)) {
                // If we don't need it for the chosen path, destroy it
#ifdef debug
                cerr << to_node_traversal(leaf->start(), graph) << " - "
                    << to_node_traversal(leaf->end(), graph) << ": Delete node: "
                    << pb2json(*node) << endl;
#endif
                // There may be paths still touching this node, if they
                // managed to get into the site without touching the start
                // node. We'll delete those paths.
                set<string> paths_to_kill;
                for (auto& kv : graph.paths.get_node_mapping(node)) {

                    if (mappings_by_path.count(kv.first)) {
                        // We've already actually updated this path; the
                        // node_mapping data is just out of date.
                        continue;
                    }

                    paths_to_kill.insert(kv.first);
                }
                for (auto& path : paths_to_kill) {
                    graph.paths.remove_path(path);
                    cerr << "warning:[vg simplify] Path " << path << " removed" << endl;
                }

                graph.destroy_node(node);
                deleted_nodes++;
            }
        }

        // OK we finished a leaf
        increment_progress();
    }

    destroy_progress();

    // Reset the ranks in the graph, since we rewrote paths
    graph.paths.clear_mapping_ranks();

    // Return the statistics.
    return to_return;
}