Esempio n. 1
0
PathIndex::PathIndex(const Path& path, const xg::XG& index) {
    // Trace the given path in the given XG graph, collecting sequence
    
    // We're going to build the sequence string
    std::stringstream seq_stream;
    
    // What base are we at in the path?
    size_t path_base = 0;
    
    // What was the last rank? Ranks must always go up.
    int64_t last_rank = -1;
    
    for (size_t i = 0; i < path.mapping_size(); i++) {
        auto& mapping = path.mapping(i);
    
        if (!by_id.count(mapping.position().node_id())) {
            // This is the first time we have visited this node in the path.
            
            // Add in a mapping.
            by_id[mapping.position().node_id()] = 
                std::make_pair(path_base, mapping.position().is_reverse());
#ifdef debug
            #pragma omp critical (cerr)
            std::cerr << "Node " << mapping.position().node_id() << " rank " << mapping.rank()
                << " starts at base " << path_base << " with "
                << index.node_sequence(mapping.position().node_id()) << std::endl;
#endif
            
            // Make sure ranks are monotonically increasing along the path, or
            // unset.
            assert(mapping.rank() > last_rank || (mapping.rank() == 0 && last_rank == 0));
            last_rank = mapping.rank();
        }
        
        // Say that this node appears here along the reference in this
        // orientation.
        by_start[path_base] = NodeSide(mapping.position().node_id(), mapping.position().is_reverse());
    
        // Remember that occurrence by node ID.
        node_occurrences[mapping.position().node_id()].push_back(by_start.find(path_base));
    
        // Find the node's sequence
        std::string node_sequence = index.node_sequence(mapping.position().node_id());
    
        while(path_base == 0 && node_sequence.size() > 0 &&
            (node_sequence[0] != 'A' && node_sequence[0] != 'T' && node_sequence[0] != 'C' &&
            node_sequence[0] != 'G' && node_sequence[0] != 'N')) {
            
            // If the path leads with invalid characters (like "X"), throw them
            // out when computing path positions.
            
            // TODO: this is a hack to deal with the debruijn-brca1-k63 graph,
            // which leads with an X.
            #pragma omp critical (cerr)
            std::cerr << "Warning: dropping invalid leading character "
                << node_sequence[0] << " from node " << mapping.position().node_id()
                << std::endl;
                
            node_sequence.erase(node_sequence.begin());
        }
        
        if (mapping.position().is_reverse()) {
            // Put the reverse sequence in the path
            seq_stream << reverse_complement(node_sequence);
        } else {
            // Put the forward sequence in the path
            seq_stream << node_sequence;
        }
        
        // Whether we found the right place for this node in the reference or
        // not, we still need to advance along the reference path. We assume the
        // whole node (except any leading bogus characters) is included in the
        // path (since it sort of has to be, syntactically, unless it's the
        // first or last node).
        path_base += node_sequence.size();
        
        // TODO: handle leading bogus characters in calls on the first node.
    }
    
    // Record the length of the last mapping's node, since there's no next mapping to work it out from
    last_node_length = path.mapping_size() > 0 ?
        index.node_length(path.mapping(path.mapping_size() - 1).position().node_id()) :
        0;
    
    // Create the actual reference sequence we will use
    sequence = seq_stream.str();
    
#ifdef debug
    // Announce progress.
    #pragma omp critical (cerr)
    std::cerr << "Traced " << path_base << " bp path." << std::endl;
    
    if (sequence.size() < 100) {
        #pragma omp critical (cerr)
        std::cerr << "Sequence: " << sequence << std::endl;
    }
#endif

}