/// Build a PathIndex for the path called `path_name` stored in `index`.
///
/// @param index            XG index to pull the path from.
/// @param path_name        Name of the path; asserted to have a nonzero rank.
/// @param extract_sequence When true, use the (Path, XG) constructor, which
///                         also receives the index; otherwise use the
///                         Path-only constructor.
PathIndex::PathIndex(const xg::XG& index, const string& path_name, bool extract_sequence) {
    // The path must exist in the index (a rank of 0 fails the check)
    assert(index.path_rank(path_name) != 0);

    // Constructor dispatch hack: build a temporary with the appropriate
    // constructor and assign it over ourselves.
    if (!extract_sequence) {
        *this = PathIndex(index.path(path_name));
        return;
    }

    *this = PathIndex(index.path(path_name), index);
}
/// Move this rectangle's search state to the last node of `extension` and
/// shift its range by the given deltas.
///
/// @param extension   Nodes being stepped over; only the final entry is used.
///                    If it is empty, the current side is left untouched.
/// @param graph       Index used to translate the node id into a rank.
/// @param delta_start Amount subtracted from state.range_start.
/// @param delta_end   Amount subtracted from state.range_end.
void rectangle::simple_extend(thread_t& extension, xg::XG& graph, int delta_start = 0, int delta_end = 0) {
    if (!extension.empty()) {
        // Side number is twice the node rank, plus one if reversed
        const xg::XG::ThreadMapping last_step = extension.back();
        const int64_t side = graph.id_to_rank(last_step.node_id) * 2 + last_step.is_reverse;
        state.current_side = side;
    }

    // The range is shifted whether or not the side changed
    state.range_start -= delta_start;
    state.range_end -= delta_end;
}
bool check_for_edges(int64_t old_node_id, bool old_node_is_reverse, int64_t new_node_id, bool new_node_is_reverse, xg::XG& index) { // What edge are we following Edge edge_taken = make_edge(old_node_id, old_node_is_reverse, new_node_id, new_node_is_reverse); // Make sure we find it bool edge_found = false; vector<Edge> edges = new_node_is_reverse ? index.edges_on_end(new_node_id) : index.edges_on_start(new_node_id); for(auto& edge : edges) { // Look at every edge in order. if(edges_equivalent(edge, edge_taken)) { // If we found the edge we're taking, break. edge_found = true; break; } } if(edge_found == false) { cerr << "did not find edge between" << old_node_id << " and " << new_node_id << endl;} return edge_found; }
/// Fill in the I values of the rectangles in every cross-section after the
/// first, using the rectangles of the preceding cross-section.
///
/// Things which were calculated in the constructor:
/// -- A's
/// -- J for the top continuing and any new rectangle
/// -- I for any new rectangle
///
/// For each cross-section b >= 1, with prevAs = cs[b-1].S and currAs = cs[b].S:
///  - prevAs empty: an error is written to cerr and nothing is changed.
///  - prevAs has one rectangle: I is derived directly from the J values
///    already present in currAs.
///  - prevAs has two or more rectangles: the older rectangles are carried
///    forward. If the top continuing rectangle kept all its threads
///    (deltaJ == 0) they are copied with simple_extend(); otherwise the
///    "seeded" strategy steps every previous rectangle whose I is at least
///    big_cutoff forward individually and calls binaryI() on the intervals
///    between them (binaryI is defined elsewhere; presumably it fills in the
///    remaining rectangles -- confirm). I is then recomputed as the
///    difference of consecutive J values.
///
/// @param graph Index used to look up edges and to extend rectangles.
void haplo_d::seeded_log_calculate_Is(xg::XG& graph) {
    vector<Edge> edges_out;
    vector<Edge> edges_in;

    for(int b = 1; b < cs.size(); b++) {
        vector<rectangle>& prevAs = cs[b-1].S;
        vector<rectangle>& currAs = cs[b].S;

        // Edges leaving the last node of the previous cross-section and
        // edges entering the node of this one, respecting orientation.
        const auto& last_node = cs[b-1].get_last_node();
        edges_out = last_node.is_reverse ? graph.edges_on_start(last_node.node_id)
                                         : graph.edges_on_end(last_node.node_id);
        const auto& this_node = cs[b].get_node();
        edges_in = this_node.is_reverse ? graph.edges_on_end(this_node.node_id)
                                        : graph.edges_on_start(this_node.node_id);

        // NB: prevAs must not be indexed before this emptiness check.
        // make sure that there is at least one rectangle here
        if(prevAs.size() == 0) {
            cerr << "[vg haplo error] no consistent haplotypes at node " << cs[b-1].get_node().node_id << endl;
        } else if(prevAs.size() == 1) {
            currAs.back().I = currAs.back().J;
            // currAs has size at most 2
            if(currAs.size() == 2) {
                currAs[0].I = currAs[0].J - currAs[1].J;
            }
        } else if(prevAs.size() >= 2) {
            // We're going to have to extend, so let's grab the next node
            XG::ThreadMapping next_node = cs[b].get_node();
            // Let's also grab the nodes which we'll skip over between this and the last node
            thread_t extension = cs[b-1].bridge;
            extension.push_back(next_node);

            // if J = 0 for a rectangle, then J must be 0 for all older rectangles
            if(currAs.back().J == 0) {
                currAs.pop_back();
            } else {
                int deltaJ = prevAs[0].J - currAs[prevAs[0].next].J;
                if(deltaJ == 0) {
                    // Nothing left the top rectangle: carry every older
                    // rectangle over, shifting its range by the same amount
                    // the top rectangle's range moved.
                    currAs[prevAs[0].next].I = prevAs[0].I;
                    int delta_start = prevAs[0].state.range_start - currAs[prevAs[0].next].state.range_start;
                    int delta_end = prevAs[0].state.range_end - currAs[prevAs[0].next].state.range_end;
                    for(int a = 1; a < prevAs.size(); a++) {
                        rectangle new_rect = prevAs[a];
                        new_rect.simple_extend(extension, graph, delta_start, delta_end);
                        new_rect.prev = a;
                        currAs.push_back(new_rect);
                        prevAs[a].next = currAs.size()-1;
                    }
                } else {
                    // Collect the indices of previous rectangles whose I is
                    // at least the cutoff; these seed the search.
                    vector<int> previously_big;
                    const int big_cutoff = 400;
                    for(int i = 1; i < prevAs.size(); i++) {
                        if(prevAs[i].I >= big_cutoff) {
                            previously_big.push_back(i);
                        }
                    }

                    // Step each big rectangle forward, stopping at the first
                    // one that becomes empty (J == 0). big_Js may therefore
                    // be shorter than previously_big, and if so its last
                    // entry is 0.
                    vector<rectangle> big_rectangles;
                    vector<int> big_deltas;
                    vector<int> big_Js;
                    for(int i = 0; i < previously_big.size(); i++) {
                        big_rectangles.push_back(prevAs[previously_big[i]]);
                        int Jbig = big_rectangles.back().get_next_J(extension,graph, edges_in, edges_out);
                        big_Js.push_back(Jbig);
                        big_deltas.push_back(prevAs[previously_big[i]].J - Jbig);
                        if(Jbig == 0) {
                            break;
                        }
                    }

                    if(big_Js.size() > 0) {
                        int Aabove = 0;
                        int Jabove = currAs[prevAs[0].next].J;
                        int dJabove = prevAs[0].J - currAs[prevAs[0].next].J;
                        for(int i = 0; i < previously_big.size(); i++) {
                            if(big_Js[i] == Jabove) {
                                // all rectangles between are actually empty
                                prevAs[currAs.back().prev].next = -1;
                                currAs.pop_back();
                            } else {
                                binaryI(graph, extension, b, Aabove, previously_big[i], dJabove, big_deltas[i], Jabove, big_Js[i], 0, edges_in, edges_out);
                            }
                            Aabove = previously_big[i];
                            Jabove = big_Js[i];
                            dJabove = big_deltas[i];
                            if(big_Js[i] == 0) {
                                // Don't build smaller rectangles
                                // Don't add this rectangle
                                break;
                            } else {
                                big_rectangles[i].prev = previously_big[i];
                                currAs.push_back(big_rectangles[i]);
                                prevAs[previously_big[i]].next = currAs.size()-1;
                            }
                        }
                        // If the last big rectangle survived, there may still
                        // be rectangles below it to search.
                        if(big_Js.back() != 0) {
                            binaryI(graph, extension, b, previously_big.back(), prevAs.size(), big_deltas.back(), 0, big_Js.back(), 0, 0, edges_in, edges_out);
                        }
                    } else {
                        // No seeds: search the whole range at once
                        binaryI(graph, extension, b, 0, prevAs.size(), deltaJ, 0, currAs[prevAs[0].next].J, 0, 0, edges_in, edges_out);
                    }

                    // I is the drop in J between consecutive rectangles; the
                    // last rectangle's I is its own J.
                    for(int a = 0; a < currAs.size() - 1; a++) {
                        currAs[a].I = currAs[a].J - currAs[a+1].J;
                    }
                    currAs.back().I = currAs.back().J;
                }
            }
        }
    }
}
/// Fill in the I values of the rectangles in every cross-section after the
/// first, using the rectangles of the preceding cross-section.
///
/// Things which were calculated in the constructor:
/// -- A's
/// -- J for the top continuing and any new rectangle
/// -- I for any new rectangle
///
/// For each cross-section b >= 1, with prevAs = cs[b-1].S and currAs = cs[b].S:
///  - prevAs empty: an error is written to cerr and nothing is changed.
///  - prevAs has one rectangle: I is derived directly from the J values
///    already present in currAs.
///  - prevAs has two or more rectangles: if the top continuing rectangle kept
///    all its threads (deltaJ == 0) the older rectangles are copied forward
///    with simple_extend(); otherwise binaryI() is called over the whole
///    range of previous rectangles (binaryI is defined elsewhere; presumably
///    it fills in the remaining rectangles -- confirm) and I is recomputed as
///    the difference of consecutive J values.
///
/// @param graph Index used to look up edges and to extend rectangles.
void haplo_d::log_calculate_Is(xg::XG& graph) {
    vector<Edge> edges_out;
    vector<Edge> edges_in;

    for(int b = 1; b < cs.size(); b++) {
        // Edges leaving the last node of the previous cross-section and
        // edges entering the node of this one, respecting orientation.
        const auto& last_node = cs[b-1].get_last_node();
        edges_out = last_node.is_reverse ? graph.edges_on_start(last_node.node_id)
                                         : graph.edges_on_end(last_node.node_id);
        const auto& this_node = cs[b].get_node();
        edges_in = this_node.is_reverse ? graph.edges_on_end(this_node.node_id)
                                        : graph.edges_on_start(this_node.node_id);

        vector<rectangle>& prevAs = cs[b-1].S;
        vector<rectangle>& currAs = cs[b].S;

        // NB: prevAs must not be indexed before this emptiness check.
        // make sure that there is at least one rectangle here
        if(prevAs.size() == 0) {
            cerr << "[vg haplo error] no consistent haplotypes at node " << cs[b-1].get_node().node_id << endl;
        } else if(prevAs.size() == 1) {
            currAs.back().I = currAs.back().J;
            // currAs has size at most 2
            if(currAs.size() == 2) {
                currAs[0].I = currAs[0].J - currAs[1].J;
            }
        } else if(prevAs.size() >= 2) {
            // We're going to have to extend, so let's grab the next node
            XG::ThreadMapping next_node = cs[b].get_node();
            // Let's also grab the nodes which we'll skip over between this and the last node
            thread_t extension = cs[b-1].bridge;
            extension.push_back(next_node);

            // if J = 0 for a rectangle, then J must be 0 for all older rectangles
            if(currAs.back().J == 0) {
                currAs.pop_back();
            } else {
                int deltaJ = prevAs[0].J - currAs[prevAs[0].next].J;
                if(deltaJ == 0) {
                    // Nothing left the top rectangle: carry every older
                    // rectangle over, shifting its range by the same amount
                    // the top rectangle's range moved.
                    currAs[prevAs[0].next].I = prevAs[0].I;
                    int delta_start = prevAs[0].state.range_start - currAs[prevAs[0].next].state.range_start;
                    int delta_end = prevAs[0].state.range_end - currAs[prevAs[0].next].state.range_end;
                    for(int a = 1; a < prevAs.size(); a++) {
                        rectangle new_rect = prevAs[a];
                        new_rect.simple_extend(extension, graph, delta_start, delta_end);
                        new_rect.prev = a;
                        currAs.push_back(new_rect);
                        prevAs[a].next = currAs.size()-1;
                    }
                } else {
                    // binaryI(XG&, thread_t, b, atop, abott, dJtop, dJbott, Jtop, Jbott, indent level)
                    binaryI(graph, extension, b, 0, prevAs.size(), deltaJ, 0, currAs[prevAs[0].next].J, 0, 0, edges_in, edges_out);

                    // I is the drop in J between consecutive rectangles; the
                    // last rectangle's I is its own J.
                    for(int a = 0; a < currAs.size() - 1; a++) {
                        currAs[a].I = currAs[a].J - currAs[a+1].J;
                    }
                    currAs.back().I = currAs.back().J;
                }
            }
        }
    }
}
/// Move this rectangle's search state onto a single node and shift its range
/// by the given deltas.
///
/// @param next_node   Oriented node to move to.
/// @param graph       Index used to translate the node id into a rank.
/// @param delta_start Amount subtracted from state.range_start.
/// @param delta_end   Amount subtracted from state.range_end.
void rectangle::simple_extend(xg::XG::ThreadMapping next_node, xg::XG& graph, int delta_start = 0, int delta_end = 0) {
    // Side number is twice the node rank, plus one if reversed
    const int64_t side = graph.id_to_rank(next_node.node_id) * 2 + next_node.is_reverse;

    state.current_side = side;
    state.range_start -= delta_start;
    state.range_end -= delta_end;
}
/// Build a new haplo_d by continuing the top rectangle of `left` at
/// cross-section `left_cut` along the cross-sections of `right`, starting at
/// `right_join`.
///
/// @param left       Decomposition providing the starting rectangle
///                   (left.cs[left_cut].S[0]) and its bridge.
/// @param right      Decomposition whose cross-sections are followed.
/// @param left_cut   Index into left.cs of the cross-section to cut at.
/// @param right_join Index into right.cs at which to join.
/// @param graph      Index used to step rectangles and look up edges.
/// @return The recombined decomposition; an empty haplo_d if `right` has no
///         joining node at `right_join`.
haplo_d recombine_arms(haplo_d& left, haplo_d& right, int left_cut, int right_join, xg::XG& graph) {
    haplo_d to_return;
    if(!right.has_joining_node(right_join)) {
        return to_return;
    } else {
        vector<rectangle*> boundary = right.trace_strip(right_join, 0, -1);
        rectangle rect = left.cs[left_cut].S[0];
        thread_t extension = left.cs[left_cut].bridge;
        int lastJ = rect.J;
        vector<int> boundaryDeltas;
        vector<int> boundaryJs;

        // Walk along the boundary strip, stepping the left rectangle forward
        // and creating one cross-section shell per step until it runs empty.
        for(int i = 0; i < boundary.size(); i++) {
            to_return.cs.push_back(right.cs[right_join + i].cs_shell());
            extension.push_back(to_return.cs[i].get_node());
            int new_J = rect.get_next_J(extension,graph);
            boundaryDeltas.push_back(new_J - lastJ);
            if(new_J > 0) {
                boundaryJs.push_back(new_J);
            } else {
                break;
            }

            // Threads in the boundary rectangle that did not come from the
            // left arm form a separate "joiners" rectangle.
            if(boundary[i]->J - new_J > 0) {
                rectangle joiners;
                if(i > 0) {
                    joiners.prev = 0;
                    to_return.cs[i-1].S[0].next = 0;
                } else {
                    joiners.prev = -1;
                }
                joiners.J = boundary[i]->J;
                joiners.I = boundary[i]->J - new_J;
                to_return.cs[i].S.push_back(joiners);
            }

            // The threads continuing from the left arm
            if(new_J > 0) {
                rectangle continuing;
                continuing.J = new_J;
                if(i > 0) {
                    continuing.prev = to_return.cs[i-1].S.size() - 1;
                    to_return.cs[i-1].S.back().next = to_return.cs[i].S.size();
                }
                to_return.cs[i].S.push_back(continuing);
            }
            lastJ = new_J;
            extension = left.cs[left_cut].bridge;
        }

        // TODO: build first column (the original iterated over
        // left.cs[left_cut].S here with an empty loop body)

        vector<Edge> edges_out;
        vector<Edge> edges_in;
        for(int i = 1; i < boundaryJs.size(); i++) {
            // Edges leaving the previous cross-section's last node and
            // entering this cross-section's node, respecting orientation.
            const auto& last_node = to_return.cs[i-1].get_last_node();
            edges_out = last_node.is_reverse ? graph.edges_on_start(last_node.node_id)
                                             : graph.edges_on_end(last_node.node_id);
            const auto& this_node = to_return.cs[i].get_node();
            edges_in = this_node.is_reverse ? graph.edges_on_end(this_node.node_id)
                                            : graph.edges_on_start(this_node.node_id);

            to_return.binaryI(graph, extension, i, to_return.cs[i].S.back().prev, to_return.cs[i-1].S.size(), boundaryDeltas[i], 0, boundaryJs[i], 0, 0, edges_in, edges_out);

            // I is the drop in J between consecutive rectangles; the last
            // rectangle's I is its own J.
            for(int a = 0; a < to_return.cs[i].S.size() - 1; a++) {
                to_return.cs[i].S[a].I = to_return.cs[i].S[a].J - to_return.cs[i].S[a+1].J;
            }
            to_return.cs[i].S.back().I = to_return.cs[i].S.back().J;
        }

        // Without this the function would flow off the end of a non-void
        // function, which is undefined behaviour.
        return to_return;
    }
}
//TODO: translate this void haplo_d::initialize_skeleton(thread_t& t, pair<int,int> interval, cross_section& prevAs, xg::XG& graph) { rectangle rect; int new_height; int last_height = prevAs.S[0].J; bool add_rectangle; bool add_A; //TODO: fix this int width = 0; for(int i = interval.first; i <= interval.second; i++) { // Count the number of base pairs since the last entry or exit node width += graph.node_length(t[i-1].node_id); new_height = graph.node_height(t[i]); if(cs.back().S.size() != 0) { if(i == interval.first) { prevAs.S[0]; } else { rect = cs.back().S[0]; } rect.J = rect.get_next_J(t[i],graph); // step this strip forward // Did any threads leave? if(last_height > rect.J) { add_A = 1; } // Are there any threads here which didn't come from the previous node? if(rect.J < new_height) { add_rectangle = 1; add_A = 1; } // This is an entry or exit node, add a cross-section to the vector of // cross-sections (which corresponds to the "A" set in the theory doc) if(add_A) { cs.back().width = width; width = 0; cs.push_back(cross_section(new_height,i,t[i])); } else { // This isn't a node where anything leaves or joins, let's skip over it cs.back().bridge.push_back(t[i]); for (size_t a = 0; a < cs.back().S.size(); a++) { cs.back().S[a].extend(t[i],graph); } } // This is an entry node; we also need a new rectangle corresponding to the // new strip. 
We need to do this *before* we populate since cross_sections // arrange rectangles newest -> oldest // NB that add_rectangle implies add_A if(add_rectangle) { rectangle new_rect; new_rect.extend(t[i],graph); new_rect.J = new_height; cs.back().height = new_rect.J; cs.back().S.push_back(new_rect); cs.back().S.back().I = new_rect.J - rect.J; } if(add_A) { int b = cs.size()-1; if(rect.J > 0) { cs[b].S.push_back(rect); cs[b].S.back().prev = 0; cs[b-1].S[0].next = cs[b].S.size()-1; } } last_height = new_height; add_A = 0; add_rectangle = 0; } else { cs.back().width = width; width = 0; cs.push_back(cross_section(new_height,i,t[i])); if(new_height > 0) { rectangle new_rect; new_rect.extend(t[i],graph); new_rect.J = new_height; cs.back().height = new_rect.J; cs.back().S.push_back(new_rect); cs.back().S.back().I = new_rect.J - rect.J; } } } if(cs.size() == 1) { cs.back().width = width; } cs.back().width += graph.node_length(t.back().node_id) - 1; for(int i = 0; i < cs.size(); i++) { tot_width += cs[i].width; } }
/// Build a PathIndex by tracing `path` through the XG `index`, recording
/// where each node starts along the path and collecting the path's sequence.
///
/// Populates by_id (first occurrence of each node), by_start (node at each
/// start offset), node_occurrences, last_node_length and sequence.
///
/// @param path  Path whose mappings are walked in order.
/// @param index XG index used to fetch node sequences and lengths.
PathIndex::PathIndex(const Path& path, const xg::XG& index) {
    // Accepts exactly the characters allowed at the very start of the path
    auto is_valid_base = [](char c) {
        switch (c) {
            case 'A':
            case 'T':
            case 'C':
            case 'G':
            case 'N':
                return true;
            default:
                return false;
        }
    };

    // Accumulates the sequence of the whole path
    std::stringstream seq_stream;

    // Offset along the path of the node currently being visited
    size_t path_base = 0;

    // Rank of the most recent first-visited mapping. Ranks must always go up.
    int64_t last_rank = -1;

    for (size_t i = 0; i < path.mapping_size(); i++) {
        auto& mapping = path.mapping(i);
        const auto node_id = mapping.position().node_id();
        const auto is_reverse = mapping.position().is_reverse();

        if (!by_id.count(node_id)) {
            // First visit to this node along the path: remember where it
            // starts and which way round it is.
            by_id[node_id] = std::make_pair(path_base, is_reverse);

#ifdef debug
            #pragma omp critical (cerr)
            std::cerr << "Node " << node_id << " rank " << mapping.rank()
                << " starts at base " << path_base << " with "
                << index.node_sequence(node_id) << std::endl;
#endif

            // Ranks either increase monotonically along the path, or are
            // unset (all zero).
            assert(mapping.rank() > last_rank || (mapping.rank() == 0 && last_rank == 0));
            last_rank = mapping.rank();
        }

        // Record that this oriented node begins at this offset...
        by_start[path_base] = NodeSide(node_id, is_reverse);

        // ...and remember that occurrence under the node's ID.
        node_occurrences[node_id].push_back(by_start.find(path_base));

        // Fetch the node's sequence
        std::string node_sequence = index.node_sequence(node_id);

        // If the path leads with invalid characters (like "X"), throw them
        // out when computing path positions.
        // TODO: this is a hack to deal with the debruijn-brca1-k63 graph,
        // which leads with an X.
        while (path_base == 0 && !node_sequence.empty() && !is_valid_base(node_sequence[0])) {
            #pragma omp critical (cerr)
            std::cerr << "Warning: dropping invalid leading character "
                << node_sequence[0] << " from node " << node_id
                << std::endl;

            node_sequence.erase(node_sequence.begin());
        }

        // Append the node's sequence in the orientation the path visits it
        if (is_reverse) {
            seq_stream << reverse_complement(node_sequence);
        } else {
            seq_stream << node_sequence;
        }

        // Whether we found the right place for this node in the reference or
        // not, we still need to advance along the reference path. We assume
        // the whole node (except any leading bogus characters) is included in
        // the path (since it sort of has to be, syntactically, unless it's
        // the first or last node).
        path_base += node_sequence.size();

        // TODO: handle leading bogus characters in calls on the first node.
    }

    // Record the length of the last mapping's node, since there's no next
    // mapping to work it out from
    if (path.mapping_size() > 0) {
        last_node_length = index.node_length(path.mapping(path.mapping_size() - 1).position().node_id());
    } else {
        last_node_length = 0;
    }

    // Create the actual reference sequence we will use
    sequence = seq_stream.str();

#ifdef debug
    // Announce progress.
    #pragma omp critical (cerr)
    std::cerr << "Traced " << path_base << " bp path." << std::endl;

    if (sequence.size() < 100) {
        #pragma omp critical (cerr)
        std::cerr << "Sequence: " << sequence << std::endl;
    }
#endif
}