/**
 * Build a PathIndex by tracing the given path through the given XG index.
 *
 * For each mapping in the path, in order, this fills in:
 *  - by_id: for the first visit to each node, the path offset at which that
 *    visit starts and the orientation of the visit;
 *  - by_start: the NodeSide (node ID and orientation) found at each path
 *    offset where a mapping begins;
 *  - node_occurrences: for each node ID, iterators into by_start for every
 *    visit to that node;
 *  - sequence: the concatenated sequence of all visited nodes, reverse
 *    complemented for reverse-strand visits;
 *  - last_node_length: the number of bases the final mapping's node
 *    contributes to the path (0 for an empty path).
 *
 * Invalid characters (anything other than A, C, G, T or N) at the very start
 * of the path are dropped, with a warning on standard error, and do not count
 * towards path offsets.
 *
 * @param path  the path to trace; every mapping's node must exist in index.
 * @param index the XG index that supplies node sequences.
 */
PathIndex::PathIndex(const Path& path, const xg::XG& index) {
    // The path's sequence, accumulated one node at a time.
    std::string path_sequence;

    // Offset along the path at which the current mapping's node begins.
    size_t path_base = 0;

    // Rank of the most recent mapping that visited a node for the first
    // time. Ranks must always go up.
    int64_t last_rank = -1;

    // How many bases the most recently traced node contributed to the path.
    size_t last_length = 0;

    for (size_t i = 0; i < path.mapping_size(); i++) {
        auto& mapping = path.mapping(i);
        const auto node_id = mapping.position().node_id();
        const bool is_reverse = mapping.position().is_reverse();

        if (!by_id.count(node_id)) {
            // First visit to this node along the path: remember where it
            // starts and which way round it is.
            by_id[node_id] = std::make_pair(path_base, is_reverse);

#ifdef debug
            #pragma omp critical (cerr)
            std::cerr << "Node " << node_id << " rank " << mapping.rank()
                << " starts at base " << path_base << " with "
                << index.node_sequence(node_id) << std::endl;
#endif

            // Make sure ranks are monotonically increasing along the path,
            // or unset (all zero).
            assert(mapping.rank() > last_rank || (mapping.rank() == 0 && last_rank == 0));
            last_rank = mapping.rank();
        }

        // Say that this node appears here along the path in this
        // orientation.
        by_start[path_base] = NodeSide(node_id, is_reverse);

        // Remember that occurrence by node ID.
        node_occurrences[node_id].push_back(by_start.find(path_base));

        // Find the node's sequence.
        std::string node_sequence = index.node_sequence(node_id);

        if (path_base == 0) {
            // If the path leads with invalid characters (like "X"), throw
            // them out when computing path positions.
            // TODO: this is a hack to deal with the debruijn-brca1-k63 graph,
            // which leads with an X.
            // NOTE(review): if a node is trimmed down to nothing, path_base
            // stays 0 and the next mapping overwrites by_start[0] -- confirm
            // whether that can happen in practice.
            const size_t first_valid = node_sequence.find_first_of("ATCGN");
            const size_t bogus = (first_valid == std::string::npos) ? node_sequence.size() : first_valid;

            for (size_t j = 0; j < bogus; j++) {
                #pragma omp critical (cerr)
                std::cerr << "Warning: dropping invalid leading character "
                    << node_sequence[j] << " from node " << node_id
                    << std::endl;
            }

            // Drop the whole bogus prefix in one go rather than one
            // character at a time.
            node_sequence.erase(0, bogus);
        }

        if (is_reverse) {
            // Put the reverse sequence in the path.
            path_sequence += reverse_complement(node_sequence);
        } else {
            // Put the forward sequence in the path.
            path_sequence += node_sequence;
        }

        // Whether we found the right place for this node in the reference or
        // not, we still need to advance along the reference path. We assume
        // the whole node (except any leading bogus characters) is included in
        // the path (since it sort of has to be, syntactically, unless it's
        // the first or last node).
        last_length = node_sequence.size();
        path_base += last_length;

        // TODO: handle leading bogus characters in calls on the first node.
    }

    // Record the length of the last mapping's node, since there's no next
    // mapping to work it out from. Use the length the node actually
    // contributed to the path, so that dropped leading characters are
    // excluded here just as they are from every path offset; this keeps the
    // last node's start plus its length equal to the sequence length.
    // NOTE(review): assumes index.node_length(id) equals
    // index.node_sequence(id).size() for untrimmed nodes -- confirm against
    // xg::XG.
    last_node_length = last_length;

    // Create the actual reference sequence we will use.
    sequence = path_sequence;

#ifdef debug
    // Announce progress.
    #pragma omp critical (cerr)
    std::cerr << "Traced " << path_base << " bp path." << std::endl;

    if (sequence.size() < 100) {
        #pragma omp critical (cerr)
        std::cerr << "Sequence: " << sequence << std::endl;
    }
#endif
}