/// Convert a Path into a thread_t: one XG::ThreadMapping, built from the node
/// ID and orientation of each Mapping's position, appended in path order.
thread_t path_to_thread_t(Path& path) {
    thread_t thread;
    const size_t step_count = path.mapping_size();
    for (size_t step = 0; step < step_count; ++step) {
        // Only the position of each mapping is consulted; edits are ignored.
        const auto& where = path.mapping(step).position();
        XG::ThreadMapping visit = {where.node_id(), where.is_reverse()};
        thread.push_back(visit);
    }
    return thread;
}
/// Replace, in place, every Mapping of the Alignment's path whose node ID is
/// in `ids` with the result of reverse_mapping() on it.
///
/// `node_length` is handed straight through to reverse_mapping(); this
/// function never calls it itself.
void flip_nodes(Alignment& a, set<int64_t> ids, const std::function<size_t(int64_t)>& node_length) {
    Path* target_path = a.mutable_path();
    for (size_t rank = 0; rank < target_path->mapping_size(); ++rank) {
        // Grab each mapping (which carries its position)
        Mapping* candidate = target_path->mutable_mapping(rank);
        if (ids.find(candidate->position().node_id()) == ids.end()) {
            // This node was not selected for flipping
            continue;
        }
        // Overwrite the mapping with its reversed form
        *candidate = reverse_mapping(*candidate, node_length);
    }
}
/// Find the region of the Mapping's node used by the Mapping, in forward strand space, as start to past_end. static pair<size_t, size_t> mapping_to_range(const xg::XG* xg_index, const Mapping& mapping) { // How much of the node does it cover? auto mapping_length = mapping_from_length(mapping); // Work out where the start and past-end positions on the node's forward strand are. pair<size_t, size_t> node_range; if (mapping.position().is_reverse()) { // On the reverse strand we need the node length // TODO: getting it can be slow auto node_length = xg_index->node_length(mapping.position().node_id()); node_range.first = node_length - mapping.position().offset() - mapping_length; node_range.second = node_length - mapping.position().offset(); } else { // On the forward strand this is easy node_range.first = mapping.position().offset(); node_range.second = node_range.first + mapping_length; } return node_range; }
/// Build an integer vector with one slot per graph entity (node_count +
/// edge_count slots, initially 0) describing what the alignment touches.
///
/// Entity ranks returned by the XG index are used as 1-based, so an entity of
/// rank r is stored at index r - 1.
///  - A visited node is coded 2 if paths_of_node() reports any path, else 1.
///  - A forward-to-forward edge between consecutive visited nodes is coded 1
///    if paths_of_entity() reports any path, else 2.
/// NOTE(review): the node and edge codes are mirrored; this reproduces the
/// existing behaviour - confirm whether that is intended.
/// Mappings without a position are skipped.
vector<int> Vectorizer::alignment_to_a_hot(Alignment a){
    int64_t entity_size = my_xg->node_count + my_xg->edge_count;
    vector<int> ret(entity_size, 0);

    const Path& walked = a.path();
    for (int step = 0; step < walked.mapping_size(); ++step){
        const Mapping& here = walked.mapping(step);
        if (!here.has_position()){
            continue;
        }
        int64_t node_id = here.position().node_id();
        int64_t node_key = my_xg->node_rank_as_entity(node_id);

        // Look for the edge from the previous mapping's node into this one.
        // Only the forward/forward orientation is queried.
        if (step > 0){
            int64_t prev_node_id = walked.mapping(step - 1).position().node_id();
            if (my_xg->has_edge(prev_node_id, false, node_id, false)){
                int64_t edge_key = my_xg->edge_rank_as_entity(prev_node_id, false, node_id, false);
                vector<size_t> edge_paths = my_xg->paths_of_entity(edge_key);
                ret[edge_key - 1] = edge_paths.empty() ? 2 : 1;
            }
        }

        // Code the node according to whether it lies on any path.
        vector<size_t> node_paths = my_xg->paths_of_node(node_id);
        ret[node_key - 1] = node_paths.empty() ? 1 : 2;
    }
    return ret;
}
/// Build a vector with one slot per graph entity (node_count + edge_count
/// slots, initially 0.0).
///
/// For each mapping that has a position, the slot of its node (entity rank
/// minus one) receives the mapping's fraction of exactly-matched bases:
/// the summed to_length of edits with from_length == to_length and an empty
/// sequence, divided by the summed from_length of all edits. A mapping whose
/// totals are both zero scores 0.0. Forward-to-forward edges between
/// consecutive mappings' nodes are set to 1.0.
vector<double> Vectorizer::alignment_to_identity_hot(Alignment a){
    int64_t entity_size = my_xg->node_count + my_xg->edge_count;
    vector<double> ret(entity_size, 0.0);

    const Path& walked = a.path();
    for (int step = 0; step < walked.mapping_size(); ++step){
        const Mapping& here = walked.mapping(step);
        if (!here.has_position()){
            continue;
        }
        int64_t node_id = here.position().node_id();
        int64_t node_key = my_xg->node_rank_as_entity(node_id);

        // Walk the edits, counting exact-match bases against all node bases.
        double matched = 0.0;
        double consumed = 0.0;
        for (int e = 0; e < here.edit_size(); ++e){
            const Edit& edit = here.edit(e);
            consumed += edit.from_length();
            const bool same_length = edit.from_length() == edit.to_length();
            if (same_length && edit.sequence().empty()){
                matched += (double) edit.to_length();
            }
            // TODO: substitutions (same length, non-empty sequence) and
            // indels currently contribute nothing to the matched count.
        }

        double identity = (matched == 0.0 && consumed == 0.0) ? 0.0 : (matched / consumed);
        ret[node_key - 1] = identity;

        // Mark the edge from the previous mapping's node, if the graph has it.
        if (step > 0){
            int64_t prev_node_id = walked.mapping(step - 1).position().node_id();
            if (my_xg->has_edge(prev_node_id, false, node_id, false)){
                int64_t edge_key = my_xg->edge_rank_as_entity(prev_node_id, false, node_id, false);
                ret[edge_key - 1] = 1.0;
            }
        }
    }
    return ret;
}
/**
 * Looks for alignments that change direction over their length.
 * This may happen because of:
 * 1. Mapping artifacts
 * 2. Cycles
 * 3. Highly repetitive regions
 * 4. Inversions (if you're lucky enough)
 *
 * An alignment "reverses" when two consecutive mappings in its path have
 * different is_reverse flags. Alignments with fewer than two mappings
 * therefore never reverse.
 *
 * Default behavior: if the Alignment reverses, return an empty Alignment,
 * otherwise return a copy of the Alignment.
 * Inverse behavior: if the Alignment reverses, return a copy of the
 * Alignment, otherwise return an empty Alignment.
 */
Alignment Filter::reversing_filter(Alignment& aln){
    // Read the path in place; nothing here needs a private copy of it.
    const Path& path = aln.path();

    for (int i = 1; i < path.mapping_size(); i++){
        const bool prev_reverse = path.mapping(i - 1).position().is_reverse();
        const bool here_reverse = path.mapping(i).position().is_reverse();
        if (prev_reverse != here_reverse){
            // Orientation flips between these two mappings.
            return inverse ? aln : Alignment();
        }
    }
    return inverse ? Alignment() : aln;
}
/**
 * Looks for alignments that transition from one path to another
 * over their length. This may occur for one of several reasons:
 * 1. The read covers a translocation
 * 2. The read looks a lot like two different (but highly-similar paths)
 * 3. The read is shattered (e.g. as in chromothripsis)
 *
 * For every pair of consecutive mappings, the alignment is considered
 * consistent if at least one path through the previous mapping's node also
 * contains the current mapping's node. The first pair for which no such
 * path exists makes the alignment "path divergent".
 *
 * Default behavior: if the Alignment is path divergent, return an empty Alignment, else return aln
 * Inverse behavior: if the Alignment is path divergent, return aln, else return an empty Alignment
 */
Alignment Filter::path_divergence_filter(Alignment& aln){
    // Read the path in place; nothing here needs a private copy of it.
    const Path& path = aln.path();

    for (int i = 1; i < path.mapping_size(); i++){
        id_t current_node = path.mapping(i).position().node_id();
        id_t prev_node = path.mapping(i - 1).position().node_id();

        bool paths_match = false;
        vector<size_t> paths_of_prev = my_xg_index->paths_of_node(prev_node);
        for (size_t path_rank : paths_of_prev){
            string p_name = my_xg_index->path_name(path_rank);
            if (my_xg_index->path_contains_node(p_name, current_node)){
                // One shared path is enough; no need to query the rest.
                paths_match = true;
                break;
            }
        }

        if (!paths_match){
            return inverse ? aln : Alignment();
        }
    }
    return inverse ? Alignment() : aln;
}
/// Build a bit vector with one bit per graph entity (node_count + edge_count
/// bits, initially 0) and set the bit of every node the alignment visits, plus
/// the bit of every forward-to-forward edge joining the nodes of two
/// consecutive mappings.
///
/// Entity ranks returned by the XG index are used as 1-based, so an entity of
/// rank r is stored at index r - 1.
/// Mappings without a position are skipped, matching alignment_to_a_hot() and
/// alignment_to_identity_hot(); without the guard such a mapping would be
/// looked up as node ID 0.
bit_vector Vectorizer::alignment_to_onehot(Alignment a){
    // Make a vector as large as the |nodes| + |edges| space
    int64_t entity_size = my_xg->node_count + my_xg->edge_count;
    bit_vector ret(entity_size, 0);

    const Path& path = a.path();
    for (int i = 0; i < path.mapping_size(); i++){
        const Mapping& mapping = path.mapping(i);
        if (!mapping.has_position()){
            continue;
        }
        int64_t node_id = mapping.position().node_id();
        int64_t key = my_xg->node_rank_as_entity(node_id);

        // Find the edge by current / previous node ID. Only the
        // forward/forward orientation is queried.
        if (i > 0){
            int64_t prev_node_id = path.mapping(i - 1).position().node_id();
            if (my_xg->has_edge(prev_node_id, false, node_id, false)){
                int64_t edge_key = my_xg->edge_rank_as_entity(prev_node_id, false, node_id, false);
                ret[edge_key - 1] = 1;
            }
        }

        // Mark the node itself.
        ret[key - 1] = 1;
    }
    return ret;
}
void Pileups::compute_from_edit(NodePileup& pileup, int64_t& node_offset, int64_t& read_offset, const Node& node, const Alignment& alignment, const Mapping& mapping, const Edit& edit) { string seq = edit.sequence(); bool is_reverse = mapping.position().is_reverse(); // ***** MATCH ***** if (edit.from_length() == edit.to_length()) { assert (edit.from_length() > 0); make_match(seq, edit.from_length(), is_reverse); assert(seq.length() == edit.from_length()); int64_t delta = 1; for (int64_t i = 0; i < edit.from_length(); ++i) { BasePileup* base_pileup = get_create_base_pileup(pileup, node_offset); // reference_base if empty if (base_pileup->num_bases() == 0) { base_pileup->set_ref_base(node.sequence()[node_offset]); } else { assert(base_pileup->ref_base() == node.sequence()[node_offset]); } // add base to bases field (converting to ,. if match) char base = seq[i]; if (!edit.sequence().empty() && base_equal(seq[i], node.sequence()[node_offset], is_reverse)) { base = is_reverse ? ',' : '.'; } *base_pileup->mutable_bases() += base; // add quality if there if (!alignment.quality().empty()) { *base_pileup->mutable_qualities() += alignment.quality()[read_offset]; } // pileup size increases by 1 base_pileup->set_num_bases(base_pileup->num_bases() + 1); // move right along read, and left/right depending on strand on reference node_offset += delta; ++read_offset; } } // ***** INSERT ***** else if (edit.from_length() < edit.to_length()) { make_insert(seq, is_reverse); assert(edit.from_length() == 0); // we define insert (like sam) as insertion between current and next // position (on forward node coordinates). this means an insertion before // offset 0 is invalid! int64_t insert_offset = is_reverse ? 
node_offset : node_offset - 1; if (insert_offset >= 0) { BasePileup* base_pileup = get_create_base_pileup(pileup, insert_offset); // reference_base if empty if (base_pileup->num_bases() == 0) { base_pileup->set_ref_base(node.sequence()[insert_offset]); } else { assert(base_pileup->ref_base() == node.sequence()[insert_offset]); } // add insertion string to bases field // todo: should we reverse complement this if mapping is reversed ??? base_pileup->mutable_bases()->append(seq); if (!alignment.quality().empty()) { *base_pileup->mutable_qualities() += alignment.quality()[read_offset]; } // pileup size increases by 1 base_pileup->set_num_bases(base_pileup->num_bases() + 1); } else { // todo: need to either forget about these, or extend pileup format. // easy solution: change insert to come before position, and just add // optional pileup at n+1st base of node. would like to figure out // how samtools does it first... /* stringstream ss; ss << "Warning: pileup does not support insertions before 0th base in node." << " Offending edit: " << pb2json(edit) << endl; #pragma omp critical(cerr) cerr << ss.str(); */ } // move right along read (and stay put on reference) read_offset += edit.to_length(); } // ***** DELETE ***** else { assert(edit.to_length() == 0); assert(edit.sequence().empty()); int64_t del_start = !is_reverse ? node_offset : node_offset - edit.from_length() + 1; seq = node.sequence().substr(del_start, edit.from_length()); make_delete(seq, is_reverse); BasePileup* base_pileup = get_create_base_pileup(pileup, node_offset); // reference_base if empty if (base_pileup->num_bases() == 0) { base_pileup->set_ref_base(node.sequence()[node_offset]); } else { assert(base_pileup->ref_base() == node.sequence()[node_offset]); } // add deletion string to bases field // todo: should we reverse complement this if mapping is reversed ??? 
base_pileup->mutable_bases()->append(seq); if (!alignment.quality().empty()) { *base_pileup->mutable_qualities() += alignment.quality()[read_offset]; } // pileup size increases by 1 base_pileup->set_num_bases(base_pileup->num_bases() + 1); int64_t delta = edit.from_length(); // stay put on read, move left/right depending on strand on reference node_offset += delta; } }
/**
 * Write one VCF-style record per superbubble in my_sbs to standard output,
 * preceded by a header.
 *
 * For each superbubble, a subgraph is built from its nodes and their
 * outgoing edges, k-paths through it are enumerated and de-duplicated, and
 * each path is classified by its interior mappings (first and last mapping
 * excluded): a path whose interior nodes all have path mappings in my_vg
 * supplies the reference allele (first such path wins); any other path
 * supplies an alternate allele made of its interior node sequences.
 *
 * NOTE(review): `outfile` is not used by the body; everything goes to cout.
 * NOTE(review): node ID 1 is hard-coded as the origin for positions.
 */
void Deconstructor::sb2vcf(string outfile){
    Header h;
    h.set_date();
    h.set_source("VG");
    h.set_reference("");
    h.set_version("VCF4.2");

    cout << h << endl;

    // Placeholders for masking records against an input VCF (not wired up).
    map<id_t, vcflib::Variant> node_to_var;
    vcflib::VariantCallFile mask;
    if (!mask_file.empty()){
        //node_to_var = my_vg->get_node_to_variant(mask);
    }

    for (auto s : my_sbs){
        vcflib::Variant var;

        // Collect the bubble's nodes, plus the outgoing edges of every node
        // except the last one listed, and build a subgraph from them.
        set<Node*> nodes;
        set<Edge*> edges;
        const size_t member_count = s.second.size();
        for (size_t member = 0; member < member_count; ++member){
            id_t n_id = s.second[member];
            Node* n_node = my_vg->get_node(n_id);
            vector<Edge*> e_end = my_vg->edges_from(n_node);
            nodes.insert(n_node);
            if (member + 1 < member_count){
                edges.insert(e_end.begin(), e_end.end());
            }
        }

        vg::VG t_graph = vg::VG(nodes, edges);

        // Enumerate k-paths through the subgraph, keeping every path reported.
        vector<Path> paths;
        std::function<void(NodeTraversal)> no_op = [](NodeTraversal n){};
        std::function<void(size_t, Path&)> extract_path = [&paths](size_t x_size, Path& path){
            paths.push_back(path);
        };
        t_graph.for_each_kpath(10000, false, 100, no_op, no_op, extract_path);

        // The enumeration reports paths repeatedly; keep one path per distinct
        // string form, ordered by that string.
        map<string, Path> distinct_paths;
        for (Path& candidate : paths){
            distinct_paths[path_to_string(candidate)] = candidate;
        }
        vector<Path> deduplicated;
        for (auto& entry : distinct_paths){
            deduplicated.push_back(entry.second);
        }
        paths = deduplicated;

        // Predicate: does every node on the path carry a path mapping?
        // Currently not called anywhere in this function.
        std::function<bool(Path)> all_ref = [&](Path p){
            for (int i = 0; i < p.mapping_size(); i++){
                Mapping m = p.mapping(i);
                Position pos = m.position();
                vg::id_t pos_id = pos.node_id();
                map<string, set<Mapping*> > path_to_mappings = my_vg->paths.get_node_mapping(pos_id);
                if (path_to_mappings.size() <= 0){
                    return false;
                }
            }
            return true;
        };

        /*
         * Next on the agenda: use the get_path_dist thing from vg call / vg stats
         * to get the distance to the head node.
         * Might need an XG index for this.
         *
         * Also need a way to deal with GAMs for this i.e. a way to
         * count the number of times we see something come up in the gam
         */

        int first_len = (my_vg->get_node(1))->sequence().size();

        // Name the record after the last (by name order) path on the bubble's
        // start node, if there is one.
        map<string, set<Mapping*> > p_to_mappings = my_vg->paths.get_node_mapping(s.first.first);
        if (!p_to_mappings.empty()){
            var.sequenceName = p_to_mappings.rbegin()->first;
        }
        var.position = my_xg->approx_path_distance(var.sequenceName, 1, s.first.first) + (s.first.first == 1 ? 0 : first_len);

        for (Path& walk : paths){
            stringstream ref_seq;
            stringstream alt_seq;

            // Once an interior node without path mappings is seen, the walk is
            // no longer reference and ref_seq stops growing.
            bool is_ref = true;
            for (int m_i = 1; m_i < walk.mapping_size() - 1; m_i++){
                Mapping m = walk.mapping(m_i);
                id_t pos_id = m.position().node_id();
                Node* n = my_vg->get_node(pos_id);
                string n_seq = n->sequence();
                map<string, set<Mapping*> > path_to_mappings = my_vg->paths.get_node_mapping(pos_id);
                if (path_to_mappings.size() == 0){
                    is_ref = false;
                }
                if (is_ref){
                    ref_seq << n_seq;
                }
                alt_seq << n_seq;
            }

            if (!is_ref){
                string alt_string = alt_seq.str();
                var.alt.push_back(alt_string);
                var.alleles.push_back(alt_string);
            }
            else if (var.ref.empty()){
                // First reference walk seen: it defines the reference allele,
                // which goes to the front of the allele list.
                string ref_str = ref_seq.str();
                var.ref = ref_str;
                var.alleles.insert(var.alleles.begin(), var.ref);
            }
        }

        // Skip records that ended up with neither a reference nor an alternate.
        if (!(var.ref.empty() && var.alt.empty())){
            cout << var << endl;
        }
    }
}
pair<size_t, size_t> Simplifier::simplify_once(size_t iteration) { // Set up the deleted node and edge counts pair<size_t, size_t> to_return {0, 0}; auto& deleted_nodes = to_return.first; auto& deleted_edges = to_return.second; if(!graph.is_valid(true, true, true, true)) { // Make sure the graph is valid and not missing nodes or edges cerr << "error:[vg::Simplifier] Invalid graph on iteration " << iteration << endl; exit(1); } // Make a list of leaf sites list<const Snarl*> leaves; if (show_progress) { cerr << "Iteration " << iteration << ": Scanning " << graph.node_count() << " nodes and " << graph.edge_count() << " edges for sites..." << endl; } for (const Snarl* top_level_site : site_manager.top_level_snarls()) { list<const Snarl*> queue {top_level_site}; while (queue.size()) { const Snarl* site = queue.front(); queue.pop_front(); if (site_manager.is_leaf(site)) { // It's a leaf. Filter it out if it is trivial if (site->type() == ULTRABUBBLE) { auto contents = site_manager.shallow_contents(site, graph, false); if (contents.first.empty()) { // Nothing but the boundary nodes in this snarl continue; } } // Not trivial. Keep it. leaves.push_back(site); } else { for (const Snarl* child_site : site_manager.children_of(site)) { queue.push_back(child_site); } } } } if (show_progress) { cerr << "Found " << leaves.size() << " leaves" << endl; } // Index all the graph paths map<string, unique_ptr<PathIndex>> path_indexes; graph.paths.for_each_name([&](const string& name) { // For every path name, go index it and put it in this collection path_indexes.insert(make_pair(name, move(unique_ptr<PathIndex>(new PathIndex(graph, name))))); }); // Now we have a list of all the leaf sites. create_progress("simplifying leaves", leaves.size()); // We can't use the SnarlManager after we modify the graph, so we load the // contents of all the leaves we're going to modify first. 
map<const Snarl*, pair<unordered_set<Node*>, unordered_set<Edge*>>> leaf_contents; // How big is each leaf in bp map<const Snarl*, size_t> leaf_sizes; // We also need to pre-calculate the traversals for the snarls that are the // right size, since the traversal finder uses the snarl manager amd might // not work if we modify the graph. map<const Snarl*, vector<SnarlTraversal>> leaf_traversals; for (const Snarl* leaf : leaves) { // Look at all the leaves // Get the contents of the bubble, excluding the boundary nodes leaf_contents[leaf] = site_manager.deep_contents(leaf, graph, false); // For each leaf, calculate its total size. unordered_set<Node*>& nodes = leaf_contents[leaf].first; size_t& total_size = leaf_sizes[leaf]; for (Node* node : nodes) { // For each node include it in the size figure total_size += node->sequence().size(); } if (total_size == 0) { // This site is just the start and end nodes, so it doesn't make // sense to try and remove it. continue; } if (total_size >= min_size) { // This site is too big to remove continue; } // Identify the replacement traversal for the bubble if it's the right size. // We can't necessarily do this after we've modified the graph. vector<SnarlTraversal>& traversals = leaf_traversals[leaf]; traversals = traversal_finder.find_traversals(*leaf); } for (const Snarl* leaf : leaves) { // Look at all the leaves // Get the contents of the bubble, excluding the boundary nodes unordered_set<Node*>& nodes = leaf_contents[leaf].first; unordered_set<Edge*>& edges = leaf_contents[leaf].second; // For each leaf, grab its total size. size_t& total_size = leaf_sizes[leaf]; if (total_size == 0) { // This site is just the start and end nodes, so it doesn't make // sense to try and remove it. 
continue; } if (total_size >= min_size) { // This site is too big to remove continue; } #ifdef debug cerr << "Found " << total_size << " bp leaf" << endl; for (auto* node : nodes) { cerr << "\t" << node->id() << ": " << node->sequence() << endl; } #endif // Otherwise we want to simplify this site away // Grab the replacement traversal for the bubble vector<SnarlTraversal>& traversals = leaf_traversals[leaf]; if (traversals.empty()) { // We couldn't find any paths through the site. continue; } // Get the traversal out of the vector SnarlTraversal& traversal = traversals.front(); // Determine the length of the new traversal size_t new_site_length = 0; for (size_t i = 1; i < traversal.visit_size() - 1; i++) { // For every non-anchoring node const Visit& visit = traversal.visit(i); // Total up the lengths of all the nodes that are newly visited. assert(visit.node_id()); new_site_length += graph.get_node(visit.node_id())->sequence().size(); } #ifdef debug cerr << "Chosen traversal is " << new_site_length << " bp" << endl; #endif // Now we have to rewrite paths that visit nodes/edges not on this // traversal, or in a different order, or whatever. To be safe we'll // just rewrite all paths. // Find all the paths that traverse this region. // We start at the start node. Copy out all the mapping pointers on that // node, so we can go through them while tampering with them. map<string, set<Mapping*> > mappings_by_path = graph.paths.get_node_mapping(graph.get_node(leaf->start().node_id())); // It's possible a path can enter the site through the end node and // never hit the start. So we're going to trim those back before we delete nodes and edges. map<string, set<Mapping*> > end_mappings_by_path = graph.paths.get_node_mapping(graph.get_node(leaf->end().node_id())); if (!drop_hairpin_paths) { // We shouldn't drop paths if they hairpin and can't be represented // in a simplified bubble. So we instead have to not simplify // bubbles that would have that problem. 
bool found_hairpin = false; for (auto& kv : mappings_by_path) { // For each path that hits the start node if (found_hairpin) { // We only care if there are 1 or more hairpins, not how many break; } // Unpack the name auto& path_name = kv.first; for (Mapping* start_mapping : kv.second) { // For each visit to the start node if (found_hairpin) { // We only care if there are 1 or more hairpins, not how many break; } // Determine what orientation we're going to scan in bool backward = start_mapping->position().is_reverse(); // Start at the start node Mapping* here = start_mapping; while (here) { // Until we hit the start/end of the path or the mapping we want if (here->position().node_id() == leaf->end().node_id() && here->position().is_reverse() == (leaf->end().backward() != backward)) { // We made it out. // Stop scanning! break; } if (here->position().node_id() == leaf->start().node_id() && here->position().is_reverse() != (leaf->start().backward() != backward)) { // We have encountered the start node with an incorrect orientation. cerr << "warning:[vg simplify] Path " << path_name << " doubles back through start of site " << to_node_traversal(leaf->start(), graph) << " - " << to_node_traversal(leaf->end(), graph) << "; skipping site!" << endl; found_hairpin = true; break; } // Scan left along ther path if we found the site start backwards, and right if we found it forwards. here = backward ? 
graph.paths.traverse_left(here) : graph.paths.traverse_right(here); } } } for (auto& kv : end_mappings_by_path) { // For each path that hits the end node if (found_hairpin) { // We only care if there are 1 or more hairpins, not how many break; } // Unpack the name auto& path_name = kv.first; for (Mapping* end_mapping : kv.second) { if (found_hairpin) { // We only care if there are 1 or more hairpins, not how many break; } // Determine what orientation we're going to scan in bool backward = end_mapping->position().is_reverse(); // Start at the end Mapping* here = end_mapping; while (here) { if (here->position().node_id() == leaf->start().node_id() && here->position().is_reverse() == (leaf->start().backward() != backward)) { // We made it out. // Stop scanning! break; } if (here->position().node_id() == leaf->end().node_id() && here->position().is_reverse() != (leaf->end().backward() != backward)) { // We have encountered the end node with an incorrect orientation. cerr << "warning:[vg simplify] Path " << path_name << " doubles back through end of site " << to_node_traversal(leaf->start(), graph) << " - " << to_node_traversal(leaf->end(), graph) << "; dropping site!" << endl; found_hairpin = true; break; } // Scan right along the path if we found the site end backwards, and left if we found it forwards. here = backward ? graph.paths.traverse_right(here) : graph.paths.traverse_left(here); } } } if (found_hairpin) { // We found a hairpin, so we want to skip the site. cerr << "warning:[vg simplify] Site " << to_node_traversal(leaf->start(), graph) << " - " << to_node_traversal(leaf->end(), graph) << " skipped due to hairpin path." 
<< endl; continue; } } // We'll keep a set of the end mappings we managed to find, starting from the start set<Mapping*> found_end_mappings; for (auto& kv : mappings_by_path) { // For each path that hits the start node // Unpack the name auto& path_name = kv.first; // If a path can't be represented after a bubble is popped // (because the path reversed and came out the same side as it // went in), we just clobber the path entirely. TODO: handle // out-the-same-side traversals as valid genotypes somehow.. bool kill_path = false; for (Mapping* start_mapping : kv.second) { // For each visit to the start node // Determine what orientation we're going to scan in bool backward = start_mapping->position().is_reverse(); // We're going to fill this list with the mappings we need to // remove and replace in this path for this traversal. Initially // runs from start of site to end of site, but later gets // flipped into path-local orientation. list<Mapping*> existing_mappings; // Tracing along forward/backward from each as appropriate, see // if the end of the site is found in the expected orientation // (or if the path ends first). bool found_end = false; Mapping* here = start_mapping; // We want to remember the end mapping when we find it Mapping* end_mapping = nullptr; #ifdef debug cerr << "Scanning " << path_name << " from " << pb2json(*here) << " for " << to_node_traversal(leaf->end(), graph) << " orientation " << backward << endl; #endif while (here) { // Until we hit the start/end of the path or the mapping we want #ifdef debug cerr << "\tat " << pb2json(*here) << endl; #endif if (here->position().node_id() == leaf->end().node_id() && here->position().is_reverse() == (leaf->end().backward() != backward)) { // We have encountered the end of the site in the // orientation we expect, given the orientation we saw // for the start. 
found_end = true; end_mapping = here; // Know we got to this mapping at the end from the // start, so we don't need to clobber everything // before it. found_end_mappings.insert(here); // Stop scanning! break; } if (here->position().node_id() == leaf->start().node_id() && here->position().is_reverse() != (leaf->start().backward() != backward)) { // We have encountered the start node with an incorrect orientation. cerr << "warning:[vg simplify] Path " << path_name << " doubles back through start of site " << to_node_traversal(leaf->start(), graph) << " - " << to_node_traversal(leaf->end(), graph) << "; dropping!" << endl; assert(drop_hairpin_paths); kill_path = true; break; } if (!nodes.count(graph.get_node(here->position().node_id()))) { // We really should stay inside the site! cerr << "error:[vg simplify] Path " << path_name << " somehow escapes site " << to_node_traversal(leaf->start(), graph) << " - " << to_node_traversal(leaf->end(), graph) << endl; exit(1); } if (here != start_mapping) { // Remember the mappings that aren't to the start or // end of the site, so we can remove them later. existing_mappings.push_back(here); } // Scan left along ther path if we found the site start backwards, and right if we found it forwards. Mapping* next = backward ? graph.paths.traverse_left(here) : graph.paths.traverse_right(here); if (next == nullptr) { // We hit the end of the path without finding the end of the site. // We've found all the existing mappings, so we can stop. break; } // Make into NodeTraversals NodeTraversal here_traversal(graph.get_node(here->position().node_id()), here->position().is_reverse()); NodeTraversal next_traversal(graph.get_node(next->position().node_id()), next->position().is_reverse()); if (backward) { // We're scanning the other way std::swap(here_traversal, next_traversal); } // Make sure we have an edge so we can traverse this node and then the node we're going to. 
if(graph.get_edge(here_traversal, next_traversal) == nullptr) { cerr << "error:[vg::Simplifier] No edge " << here_traversal << " to " << next_traversal << endl; exit(1); } here = next; } if (kill_path) { // This path can't exist after we pop this bubble. break; } if (!found_end) { // This path only partly traverses the site, and is // anchored at the start. Remove the part inside the site. // TODO: let it stay if it matches the one true traversal. for(auto* mapping : existing_mappings) { // Trim the path out of the site graph.paths.remove_mapping(mapping); } // TODO: update feature positions if we trim off the start of a path // Maybe the next time the path visits the site it will go // all the way through. continue; } // If we found the end, remove all the mappings encountered, in // order so that the last one removed is the last one along the // path. if (backward) { // Make sure the last mapping in the list is the last // mapping to occur along the path. existing_mappings.reverse(); } // Where does the variable region of the site start for this // traversal of the path? If there are no existing mappings, // it's the start mapping's position if we traverse the site // backwards and the end mapping's position if we traverse // the site forwards. If there are existing mappings, it's // the first existing mapping's position in the path. TODO: // This is super ugly. Can we view the site in path // coordinates or something? PathIndex& path_index = *path_indexes.at(path_name).get(); Mapping* mapping_after_first = existing_mappings.empty() ? (backward ? start_mapping : end_mapping) : existing_mappings.front(); assert(path_index.mapping_positions.count(mapping_after_first)); size_t variable_start = path_index.mapping_positions.at(mapping_after_first); // Determine the total length of the old traversal of the site size_t old_site_length = 0; for (auto* mapping : existing_mappings) { // Add in the lengths of all the mappings that will get // removed. 
old_site_length += mapping_from_length(*mapping); } #ifdef debug cerr << "Replacing " << old_site_length << " bp at " << variable_start << " with " << new_site_length << " bp" << endl; #endif // Actually update any BED features features.on_path_edit(path_name, variable_start, old_site_length, new_site_length); // Where will we insert the new site traversal into the path? list<Mapping>::iterator insert_position; if (!existing_mappings.empty()) { // If there are existing internal mappings, we'll insert right where they were for (auto* mapping : existing_mappings) { // Remove each mapping from left to right along the // path, saving the position after the mapping we just // removed. At the end we'll have the position of the // mapping to the end of the site. #ifdef debug cerr << path_name << ": Drop mapping " << pb2json(*mapping) << endl; #endif insert_position = graph.paths.remove_mapping(mapping); } } else { // Otherwise we'll insert right before the mapping to // the start or end of the site (whichever occurs last // along the path) insert_position = graph.paths.find_mapping(backward ? start_mapping : here); } // Make sure we're going to insert starting from the correct end of the site. if (backward) { assert(insert_position->position().node_id() == leaf->start().node_id()); } else { assert(insert_position->position().node_id() == leaf->end().node_id()); } // Loop through the internal visits in the canonical // traversal backwards along the path we are splicing. If // it's a forward path this is just right to left, but if // it's a reverse path it has to be left to right. for (size_t i = 0; i < traversal.visit_size(); i++) { // Find the visit we need next, as a function of which // way we need to insert this run of visits. Normally we // go through the visits right to left, but when we have // a backward path we go left to right. const Visit& visit = backward ? 
traversal.visit(i) : traversal.visit(traversal.visit_size() - i - 1); // Make a Mapping to represent it Mapping new_mapping; new_mapping.mutable_position()->set_node_id(visit.node_id()); // We hit this node backward if it's backward along the // traversal, xor if we are traversing the traversal // backward new_mapping.mutable_position()->set_is_reverse(visit.backward() != backward); // Add an edit Edit* edit = new_mapping.add_edit(); size_t node_seq_length = graph.get_node(visit.node_id())->sequence().size(); edit->set_from_length(node_seq_length); edit->set_to_length(node_seq_length); #ifdef debug cerr << path_name << ": Add mapping " << pb2json(new_mapping) << endl; #endif // Insert the mapping in the path, moving right to left insert_position = graph.paths.insert_mapping(insert_position, path_name, new_mapping); } // Now we've corrected this site on this path. Update its index. // TODO: right now this means retracing the entire path. path_indexes[path_name].get()->update_mapping_positions(graph, path_name); } if (kill_path) { // Destroy the path completely, because it needs to reverse // inside a site that we have popped. graph.paths.remove_path(path_name); } } for (auto& kv : end_mappings_by_path) { // Now we handle the end mappings not reachable from the start. For each path that touches the end... // Unpack the name auto& path_name = kv.first; // We might have to kill the path, if it reverses inside a // bubble we're popping bool kill_path = false; for (Mapping* end_mapping : kv.second) { if (found_end_mappings.count(end_mapping)) { // Skip the traversals of the site that we handled. continue; } // Now we're left with paths that leave the site but don't // enter. We're going to clobber everything before the path // leaves the site. 
// Determine what orientation we're going to scan in bool backward = end_mapping->position().is_reverse(); // Start at the end Mapping* here = end_mapping; // Keep a list of mappings we need to remove list<Mapping*> to_remove; while (here) { if (here->position().node_id() == leaf->end().node_id() && here->position().is_reverse() != (leaf->end().backward() != backward)) { // We have encountered the end node with an incorrect orientation. cerr << "warning:[vg simplify] Path " << path_name << " doubles back through end of site " << to_node_traversal(leaf->start(), graph) << " - " << to_node_traversal(leaf->end(), graph) << "; dropping!" << endl; assert(drop_hairpin_paths); kill_path = true; break; } // Say we should remove the mapping. to_remove.push_back(here); // Scan right along the path if we found the site end backwards, and left if we found it forwards. here = backward ? graph.paths.traverse_right(here) : graph.paths.traverse_left(here); // Eventually we should hit the end of the path, or the // end of the site, since we don't hit the start. } if (kill_path) { // Just go kill the whole path break; } for (auto* mapping: to_remove) { // Get rid of all the mappings once we're done tracing them out. graph.paths.remove_mapping(mapping); } } if (kill_path) { // Destroy the path completely, because it needs to reverse // inside a site that we have popped. graph.paths.remove_path(path_name); } } // Now delete all edges that aren't connecting adjacent nodes on the // blessed traversal (before we delete their nodes). 
set<Edge*> blessed_edges; for (int i = 0; i < traversal.visit_size() - 1; ++i) { // For each node and the next node (which won't be the end) const Visit visit = traversal.visit(i); const Visit next = traversal.visit(i); // Find the edge between them NodeTraversal here(graph.get_node(visit.node_id()), visit.backward()); NodeTraversal next_traversal(graph.get_node(next.node_id()), next.backward()); Edge* edge = graph.get_edge(here, next_traversal); assert(edge != nullptr); // Remember we need it blessed_edges.insert(edge); } // Also get the edges from the boundary nodes into the traversal if (traversal.visit_size() > 0) { NodeTraversal first_visit = to_node_traversal(traversal.visit(0), graph); NodeTraversal last_visit = to_node_traversal(traversal.visit(traversal.visit_size() - 1), graph); blessed_edges.insert(graph.get_edge(to_node_traversal(leaf->start(), graph), first_visit)); blessed_edges.insert(graph.get_edge(last_visit, to_node_traversal(leaf->end(), graph))); } else { // This is a deletion traversal, so get the edge from the start to end of the site blessed_edges.insert(graph.get_edge(to_node_traversal(leaf->start(), graph), to_node_traversal(leaf->end(), graph))); } for (auto* edge : edges) { if (!blessed_edges.count(edge)) { // Get rid of all the edges not needed for the one true traversal #ifdef debug cerr << to_node_traversal(leaf->start(), graph) << " - " << to_node_traversal(leaf->end(), graph) << ": Delete edge: " << pb2json(*edge) << endl; #endif graph.destroy_edge(edge); deleted_edges++; } } // Now delete all the nodes that aren't on the blessed traversal. // What nodes are on it? 
set<Node*> blessed_nodes; for (int i = 0; i < traversal.visit_size(); i++) { const Visit& visit = traversal.visit(i); blessed_nodes.insert(graph.get_node(visit.node_id())); } for (auto* node : nodes) { // For every node in the site if (!blessed_nodes.count(node)) { // If we don't need it for the chosen path, destroy it #ifdef debug cerr << to_node_traversal(leaf->start(), graph) << " - " << to_node_traversal(leaf->end(), graph) << ": Delete node: " << pb2json(*node) << endl; #endif // There may be paths still touching this node, if they // managed to get into the site without touching the start // node. We'll delete those paths. set<string> paths_to_kill; for (auto& kv : graph.paths.get_node_mapping(node)) { if (mappings_by_path.count(kv.first)) { // We've already actually updated this path; the // node_mapping data is just out of date. continue; } paths_to_kill.insert(kv.first); } for (auto& path : paths_to_kill) { graph.paths.remove_path(path); cerr << "warning:[vg simplify] Path " << path << " removed" << endl; } graph.destroy_node(node); deleted_nodes++; } } // OK we finished a leaf increment_progress(); } destroy_progress(); // Reset the ranks in the graph, since we rewrote paths graph.paths.clear_mapping_ranks(); // Return the statistics. return to_return; }
/**
 * Filter an alignment by how often each of its edits has been observed.
 *
 * Every edit of every mapping is keyed by "<node id>_<offset>" and
 * "<from_length>_<to_length>_<sequence>", and its counter in
 * pos_to_edit_to_depth is incremented. Edits that are plain matches (equal
 * from/to lengths and an empty sequence) are skipped unless filter_matches
 * is set.
 *
 * As soon as an edit's counter (after the increment) is below min_depth:
 *   - if remove_failing_edits is false, the alignment fails the filter and
 *     we return an empty Alignment (or aln itself when inverse is set);
 *   - otherwise we return a copy of aln in which that one edit has been
 *     replaced by a match to the reference (empty sequence, to_length set
 *     to from_length).
 *
 * If no edit fails, returns aln (or an empty Alignment when inverse is set).
 * This also covers alignments whose path has no mappings.
 *
 * @param aln  the alignment to test; it is not modified.
 * @return     the (possibly edited) alignment, or an empty Alignment.
 */
Alignment Filter::depth_filter(Alignment& aln){
    // TODO: use_avg and window_length are not consulted here yet; averaged
    // and windowed depth filtering are not implemented.

    // Bind by reference: we only read the path, so there is no need to
    // deep-copy the protobuf message.
    const Path& path = aln.path();
    //TODO handle reversing mappings

    for (int i = 0; i < path.mapping_size(); i++){
        const Mapping& mapping = path.mapping(i);
        const int64_t start_node = mapping.position().node_id();
        // Offsets along the node are not tracked yet, so every edit on a
        // node is keyed at offset 0.
        const int64_t curr_offset_in_graph = 0;

        stringstream pst;
        pst << start_node << "_" << curr_offset_in_graph;
        const string p_hash = pst.str();

        for (int j = 0; j < mapping.edit_size(); j++){
            const Edit& ee = mapping.edit(j);

            const bool is_match = ee.from_length() == ee.to_length()
                                  && ee.sequence().empty();
            if (is_match && !filter_matches){
                // Plain matches are only depth-filtered on request.
                continue;
            }

            stringstream est;
            est << ee.from_length() << "_" << ee.to_length() << "_" << ee.sequence();
            const string e_hash = est.str();

            // Bump the counter and compare it against the threshold inside
            // the same critical section, so that the read cannot race with
            // another thread's insertion into the shared table.
            bool too_shallow = false;
#pragma omp critical(write)
            {
                auto& depth = pos_to_edit_to_depth[p_hash][e_hash];
                depth += 1;
                too_shallow = depth < min_depth;
            }

            if (!too_shallow){
                continue;
            }

            if (!remove_failing_edits){
                // The whole alignment fails the filter.
                return inverse ? aln : Alignment();
            }

            // Hand back a copy in which the offending edit has been turned
            // into a match to the reference.
            Alignment edited_aln = Alignment(aln);
            Edit* failing_edit = edited_aln.mutable_path()->mutable_mapping(i)->mutable_edit(j);
            failing_edit->set_sequence("");
            failing_edit->set_from_length(ee.from_length());
            failing_edit->set_to_length(ee.from_length());
            return edited_aln;
        }
    }

    // Every mapping has been examined (or there were none) and nothing
    // failed the depth threshold.
    return inverse ? Alignment() : aln;
}