Exemple #1
0
/// Convert a Path into an XG thread: one ThreadMapping (node id plus
/// orientation flag) per Mapping, emitted in path order.
thread_t path_to_thread_t(Path& path) {
  thread_t thread;
  const size_t mapping_count = path.mapping_size();
  for(size_t idx = 0; idx < mapping_count; ++idx) {
    // Only the node id and the strand of each visit are carried over.
    const auto position = path.mapping(idx).position();
    XG::ThreadMapping step = {position.node_id(), position.is_reverse()};
    thread.push_back(step);
  }
  return thread;
}
Exemple #2
0
/// Reverse, in place, every mapping of the alignment's path whose node id is
/// a member of `ids`. `node_length` is handed through to reverse_mapping().
void flip_nodes(Alignment& a, set<int64_t> ids, const std::function<size_t(int64_t)>& node_length) {
    Path* path = a.mutable_path();
    const size_t mapping_count = path->mapping_size();
    for(size_t idx = 0; idx < mapping_count; ++idx) {
        // Work on the mapping stored in the path so the flip is visible to the caller
        Mapping* current = path->mutable_mapping(idx);
        if(ids.count(current->position().node_id()) == 0) {
            // Not one of the requested nodes: leave this mapping untouched
            continue;
        }
        *current = reverse_mapping(*current, node_length);
    }
}
Exemple #3
0
/// Compute the half-open interval [start, past_end) of the Mapping's node that
/// the Mapping covers, expressed in forward-strand coordinates of that node.
static pair<size_t, size_t> mapping_to_range(const xg::XG* xg_index, const Mapping& mapping) {
    // Number of node bases consumed by the mapping
    const auto covered = mapping_from_length(mapping);
    const auto& position = mapping.position();

    if (!position.is_reverse()) {
        // Forward strand: the offset already is the forward-strand start
        const size_t start = position.offset();
        const size_t past_end = start + covered;
        return pair<size_t, size_t>(start, past_end);
    }

    // Reverse strand: offsets count from the far end of the node, so the
    // node length is needed to mirror them.
    // TODO: getting it can be slow
    const auto node_length = xg_index->node_length(position.node_id());
    const size_t start = node_length - position.offset() - covered;
    const size_t past_end = node_length - position.offset();
    return pair<size_t, size_t>(start, past_end);
}
Exemple #4
0
/// Encode an alignment as an integer vector over the |nodes| + |edges| entity
/// space. Entities the alignment does not touch stay 0; touched nodes and
/// edges get 1 or 2 depending on whether the index reports paths for them.
vector<int> Vectorizer::alignment_to_a_hot(Alignment a){
    const int64_t entity_size = my_xg->node_count + my_xg->edge_count;
    vector<int> encoding(entity_size, 0);
    const Path& path = a.path();
    for (int idx = 0; idx < path.mapping_size(); ++idx){
        const Mapping& current = path.mapping(idx);
        if (!current.has_position()){
            // Nothing to rank without a position
            continue;
        }
        const int64_t node_id = current.position().node_id();
        // Entity ranks are used with a "- 1" when indexing below
        // (presumably they are 1-based; verify against the XG index).
        const int64_t node_key = my_xg->node_rank_as_entity(node_id);

        // Mark the edge joining the previous mapping's node to this one.
        // Only the forward-to-forward orientation is looked up.
        if (idx > 0){
            const int64_t prev_node_id = path.mapping(idx - 1).position().node_id();
            if (my_xg->has_edge(prev_node_id, false, node_id, false)){
                const int64_t edge_key = my_xg->edge_rank_as_entity(prev_node_id, false, node_id, false);
                const vector<size_t> edge_paths = my_xg->paths_of_entity(edge_key);
                // NOTE(review): an edge on a path is coded 1, while a node on
                // a path is coded 2 below - confirm the asymmetry is intended.
                encoding[edge_key - 1] = (edge_paths.size() > 0) ? 1 : 2;
            }
        }

        // Mark the node itself: 2 when it lies on some path, 1 otherwise
        const vector<size_t> node_paths = my_xg->paths_of_node(node_id);
        encoding[node_key - 1] = (node_paths.size() > 0) ? 2 : 1;
    }

    return encoding;
}
Exemple #5
0
/// Encode an alignment as a vector of doubles over the |nodes| + |edges|
/// entity space. Each visited node holds the fraction of its mapping's
/// reference length that is covered by exact-match edits; each traversed
/// forward-to-forward edge holds 1.0; everything else stays 0.0.
vector<double> Vectorizer::alignment_to_identity_hot(Alignment a){
    const int64_t entity_size = my_xg->node_count + my_xg->edge_count;
    vector<double> identities(entity_size, 0.0);

    const Path& path = a.path();
    for (int idx = 0; idx < path.mapping_size(); ++idx){
        const Mapping& current = path.mapping(idx);
        if (!current.has_position()){
            continue;
        }
        const int64_t node_id = current.position().node_id();
        const int64_t node_key = my_xg->node_rank_as_entity(node_id);

        // Walk the edits: every edit adds its from_length to the total, but
        // only equal-length edits without a replacement sequence count as matches.
        double matched = 0.0;
        double covered = 0.0;
        for (int edit_idx = 0; edit_idx < current.edit_size(); ++edit_idx){
            const Edit& edit = current.edit(edit_idx);
            covered += edit.from_length();
            const bool same_length = edit.from_length() == edit.to_length();
            if (same_length && edit.sequence() == ""){
                matched += static_cast<double>(edit.to_length());
            }
            // TODO if we map but don't match exactly, add half the average length to match_length
            // (substitutions, insertions and deletions currently contribute nothing)
        }

        // A mapping with no reference length at all scores 0 instead of dividing by zero
        double fraction;
        if (matched == 0.0 && covered == 0.0){
            fraction = 0.0;
        }
        else{
            fraction = matched / covered;
        }
        identities[node_key - 1] = fraction;

        // Flag the edge from the previous mapping's node, when the index has it
        if (idx > 0){
            const int64_t prev_node_id = path.mapping(idx - 1).position().node_id();
            if (my_xg->has_edge(prev_node_id, false, node_id, false)){
                const int64_t edge_key = my_xg->edge_rank_as_entity(prev_node_id, false, node_id, false);
                identities[edge_key - 1] = 1.0;
            }
        }
    }
    return identities;
}
Exemple #6
0
    /**
     * Looks for alignments that change direction over their length.
     * This may happen because of:
     * 1. Mapping artifacts
     * 2. Cycles
     * 3. Highly repetitive regions
     * 4. Inversions (if you're lucky enough)
     *
     * An alignment "reverses" when two consecutive mappings disagree on
     * is_reverse(). Alignments with fewer than two mappings never reverse.
     *
     * Default behavior: if the Alignment reverses, return an empty Alignment.
     * Inverse behavior: if the Alignment reverses, return the Alignment.
     *
     * @param aln  the alignment to inspect; it is not modified.
     * @return a copy of aln or a default-constructed Alignment, as described above.
     */
    Alignment Filter::reversing_filter(Alignment& aln){
        // Borrow the path rather than deep-copying it; only the strand flags are read.
        const Path& path = aln.path();

        for (int i = 1; i < path.mapping_size(); i++){
            const bool prev_is_reverse = path.mapping(i - 1).position().is_reverse();
            const bool curr_is_reverse = path.mapping(i).position().is_reverse();
            if (prev_is_reverse != curr_is_reverse){
                // First orientation flip found: no need to look any further
                return inverse ? aln : Alignment();
            }
        }
        return inverse ? Alignment() : aln;
    }
Exemple #7
0
    /**
     *
     * Looks for alignments that transition from one path to another
     * over their length. This may occur for one of several reasons:
     * 1. The read covers a translocation
     * 2. The read looks a lot like two different (but highly-similar paths)
     * 3. The read is shattered (e.g. as in chromothripsis)
     *
     * An alignment is "path divergent" when, for some pair of consecutive
     * mappings, no path that visits the previous node also contains the
     * current node. Alignments with fewer than two mappings never diverge.
     *
     * Default behavior: if the Alignment is path divergent, return an empty Alignment, else return aln
     * Inverse behavior: if the Alignment is path divergent, return aln, else return an empty Alignment
     *
     * @param aln  the alignment to inspect; it is not modified.
     * @return a copy of aln or a default-constructed Alignment, as described above.
     */
    Alignment Filter::path_divergence_filter(Alignment& aln){
        // Borrow the path rather than deep-copying it; only node ids are read.
        const Path& path = aln.path();
        for (int i = 1; i < path.mapping_size(); i++){
            const id_t current_node = path.mapping(i).position().node_id();
            const id_t prev_node = path.mapping(i - 1).position().node_id();

            // Does any path through the previous node also contain the current node?
            bool paths_match = false;
            const vector<size_t> paths_of_prev = my_xg_index->paths_of_node(prev_node);
            for (size_t path_rank : paths_of_prev){
                string p_name = my_xg_index->path_name(path_rank);
                if (my_xg_index->path_contains_node(p_name, current_node)){
                    paths_match = true;
                    // One shared path is enough; skip the remaining lookups
                    break;
                }
            }
            if (!paths_match){
                return inverse ? aln : Alignment();
            }

        }
        return inverse ? Alignment() : aln;
    }
Exemple #8
0
/// Encode an alignment as a bit vector over the |nodes| + |edges| entity
/// space: the bit of every node visited by a mapping is set, and so is the bit
/// of every forward-to-forward edge between the nodes of consecutive mappings.
/// Mappings without a position are skipped, as in the other alignment_to_*
/// encoders of this class.
bit_vector Vectorizer::alignment_to_onehot(Alignment a){
    // Make a vector as large as the | |nodes| + |edges| | space
    const int64_t entity_size = my_xg->node_count + my_xg->edge_count;
    bit_vector ret(entity_size, 0);
    const Path& path = a.path();
    for (int i = 0; i < path.mapping_size(); i++){
        const Mapping& mapping = path.mapping(i);
        if (!mapping.has_position()){
            // Without a position the node id would read as the default value
            // and be ranked as if it were a real node.
            continue;
        }
        const int64_t node_id = mapping.position().node_id();
        // Entity ranks are used with a "- 1" when indexing below
        // (presumably they are 1-based; verify against the XG index).
        const int64_t key = my_xg->node_rank_as_entity(node_id);

        // Find edge by current / previous node ID
        // we can check the orientation, though it shouldn't **really** matter
        // whether we catch them in the forward or reverse direction.
        if (i > 0){
            const int64_t prev_node_id = path.mapping(i - 1).position().node_id();
            if (my_xg->has_edge(prev_node_id, false, node_id, false)){
                const int64_t edge_key = my_xg->edge_rank_as_entity(prev_node_id, false, node_id, false);
                ret[edge_key - 1] = 1;
            }
        }

        // Mark the node itself
        ret[key - 1] = 1;
    }

    return ret;
}
Exemple #9
0
/// Fold a single Edit of a Mapping into the pileup of one node and advance the
/// two cursors accordingly.
///
/// The edit is classified by comparing from_length and to_length:
///   - equal      -> match/substitution: one pileup entry per base
///   - from < to  -> insertion: one pileup entry holding the whole insert
///   - otherwise  -> deletion: one pileup entry holding the deleted bases
///
/// @param pileup       node pileup that receives the new entries
/// @param node_offset  in/out cursor on the node; advanced by matches and deletions
/// @param read_offset  in/out cursor on the read; advanced by matches and insertions
/// @param node         node whose sequence supplies the reference bases
/// @param alignment    source of the (optional) quality string
/// @param mapping      mapping that owns the edit; only its strand flag is read
/// @param edit         the edit to record
void Pileups::compute_from_edit(NodePileup& pileup, int64_t& node_offset,
                                int64_t& read_offset,
                                const Node& node, const Alignment& alignment,
                                const Mapping& mapping, const Edit& edit) {
    string seq = edit.sequence();
    bool is_reverse = mapping.position().is_reverse();
    
    // ***** MATCH *****
    if (edit.from_length() == edit.to_length()) {
        assert (edit.from_length() > 0);
        // make_match rewrites seq in place; afterwards it must hold exactly
        // one character per reference base (checked by the assert below).
        make_match(seq, edit.from_length(), is_reverse);
        assert(seq.length() == edit.from_length());            
        // NOTE(review): delta is always +1 here, although the comment further
        // down talks about moving left/right depending on strand - confirm.
        int64_t delta = 1;
        for (int64_t i = 0; i < edit.from_length(); ++i) {
            BasePileup* base_pileup = get_create_base_pileup(pileup, node_offset);
            // reference_base if empty
            if (base_pileup->num_bases() == 0) {
                base_pileup->set_ref_base(node.sequence()[node_offset]);
            } else {
                assert(base_pileup->ref_base() == node.sequence()[node_offset]);
            }
            // add base to bases field (converting to ,. if match)
            // The conversion is only attempted when the edit carries an
            // explicit sequence; otherwise the character produced by
            // make_match is stored as-is.
            char base = seq[i];
            if (!edit.sequence().empty() &&
                base_equal(seq[i], node.sequence()[node_offset], is_reverse)) {
                base = is_reverse ? ',' : '.';
            }
            *base_pileup->mutable_bases() += base;
            // add quality if there
            if (!alignment.quality().empty()) {
                *base_pileup->mutable_qualities() += alignment.quality()[read_offset];
            }
            // pileup size increases by 1
            base_pileup->set_num_bases(base_pileup->num_bases() + 1);
            // move right along read, and left/right depending on strand on reference
            node_offset += delta;
            ++read_offset;
        }
    }
    // ***** INSERT *****
    else if (edit.from_length() < edit.to_length()) {
        make_insert(seq, is_reverse);
        assert(edit.from_length() == 0);
        // we define insert (like sam) as insertion between current and next
        // position (on forward node coordinates). this means an insertion before
        // offset 0 is invalid! 
        int64_t insert_offset =  is_reverse ? node_offset : node_offset - 1;
        if (insert_offset >= 0) {        
            BasePileup* base_pileup = get_create_base_pileup(pileup, insert_offset);
            // reference_base if empty
            if (base_pileup->num_bases() == 0) {
                base_pileup->set_ref_base(node.sequence()[insert_offset]);
            } else {
                assert(base_pileup->ref_base() == node.sequence()[insert_offset]);
            }
            // add insertion string to bases field
            // todo: should we reverse complement this if mapping is reversed ??? 
            base_pileup->mutable_bases()->append(seq);
            // Only the quality at the current read offset is stored, i.e. one
            // quality value for the whole insertion.
            if (!alignment.quality().empty()) {
                *base_pileup->mutable_qualities() += alignment.quality()[read_offset];
            }
            // pileup size increases by 1
            base_pileup->set_num_bases(base_pileup->num_bases() + 1);
        }
        else {
            // Insertions that would land before the first base are dropped
            // silently; the warning below is disabled.
            // todo: need to either forget about these, or extend pileup format.
            // easy solution: change insert to come before position, and just add
            // optional pileup at n+1st base of node.  would like to figure out
            // how samtools does it first...
            /*
            stringstream ss;
            ss << "Warning: pileup does not support insertions before 0th base in node."
               << " Offending edit: " << pb2json(edit) << endl;
#pragma omp critical(cerr)
            cerr << ss.str();
            */
        }
        // move right along read (and stay put on reference)
        read_offset += edit.to_length();
    }
    // ***** DELETE *****
    else {
        assert(edit.to_length() == 0);
        assert(edit.sequence().empty());
        // The deleted bases are read from the node itself. On the reverse
        // strand the window ends at node_offset instead of starting there.
        int64_t del_start = !is_reverse ? node_offset :
            node_offset - edit.from_length() + 1;
        seq = node.sequence().substr(del_start, edit.from_length());
        make_delete(seq, is_reverse);
        BasePileup* base_pileup = get_create_base_pileup(pileup, node_offset);
        // reference_base if empty
        if (base_pileup->num_bases() == 0) {
            base_pileup->set_ref_base(node.sequence()[node_offset]);
        } else {
            assert(base_pileup->ref_base() == node.sequence()[node_offset]);
        }
        // add deletion string to bases field
        // todo: should we reverse complement this if mapping is reversed ??? 
        base_pileup->mutable_bases()->append(seq);
        if (!alignment.quality().empty()) {
            *base_pileup->mutable_qualities() += alignment.quality()[read_offset];
        }
        // pileup size increases by 1
        base_pileup->set_num_bases(base_pileup->num_bases() + 1);
        // NOTE(review): the cursor always moves forward by from_length, even
        // though del_start above is computed differently for reverse
        // mappings - confirm this is the intended coordinate convention.
        int64_t delta = edit.from_length();
        // stay put on read, move left/right depending on strand on reference
        node_offset += delta;
    }
}
Exemple #10
0
    /**
     * Write a VCF header followed by one variant record per superbubble in
     * my_sbs to standard output.
     *
     * For every superbubble a subgraph is built from its nodes, the paths
     * through that subgraph are enumerated and de-duplicated, and the
     * sequence of the interior nodes of each path becomes either the
     * reference allele (all interior nodes carry path mappings) or an
     * alternate allele (some interior node carries none).
     *
     * NOTE(review): `outfile` is never read in this function - all output
     * goes to cout. Confirm whether writing to the file was intended.
     *
     * @param outfile  currently unused.
     */
    void Deconstructor::sb2vcf(string outfile){
        // Emit the VCF header first
        Header h;
        h.set_date();
        h.set_source("VG");
        h.set_reference("");
        h.set_version("VCF4.2");

        cout << h << endl;

        // for each superbubble:
        // Fill out a vcflib Variant
        // Check if it is masked by an input vcf
        // if not, print it to stdout



        // Masking is not implemented yet: the lookup below is commented out,
        // so node_to_var and mask stay unused.
        map<id_t, vcflib::Variant> node_to_var;
        vcflib::VariantCallFile mask;
        if (!mask_file.empty()){
            //node_to_var = my_vg->get_node_to_variant(mask);
        }
        for (auto s : my_sbs){
            vcflib::Variant var;

            // Make subgraphs out of the superbubble:
            // Operating on a pair<id_t, id_t>, vector<id_t>
            // then enumerate k_paths through the SuperBubbles
            set<Node*> nodes;
            set<Edge*> edges;

            // Collect every node of the bubble, plus the edges reported by
            // edges_from() for every node except the last one in the list.
            for (int i = 0; i < s.second.size(); i++){
                id_t n_id = s.second[i];
                //cerr << n_id << endl;
                Node* n_node = my_vg->get_node(n_id);
                vector<Edge*> e_end = my_vg->edges_from(n_node);
                nodes.insert(n_node);
                if (i < s.second.size() - 1){
                    edges.insert(e_end.begin(), e_end.end());
                }
            }

            vg::VG t_graph = vg::VG(nodes, edges);

            vector<Path> paths;

            // Callbacks for the k-path enumeration: the two traversal hooks do
            // nothing, the third one stores every reported path.
            std::function<void(NodeTraversal)> no_op = [](NodeTraversal n){};
            std::function<void(size_t, Path&)> extract_path = [&paths](size_t x_size, Path& path){
                paths.push_back(path);
            };

            // NOTE(review): the meaning of the literals 10000 / false / 100 is
            // defined by VG::for_each_kpath and is not visible here.
            t_graph.for_each_kpath(10000, false, 100, no_op, no_op, extract_path);

            // Drop duplicate paths by keying them on their string form; the
            // result comes back ordered by that string.
            std::function<std::vector<Path>(vector<Path>)> uniquify = [](vector<Path> v){
                map<string, Path> unqs;
                vector<Path> ret;
                for (auto x: v){
                    unqs[path_to_string(x)] = x;
                }

                for (auto y : unqs){
                    ret.push_back(y.second);
                }
                return ret;
            };

            paths = uniquify(paths);

            // True when every node of the path has at least one path mapping.
            // NOTE(review): this lambda is never called below.
            std::function<bool(Path)> all_ref = [&](Path p){
                for (int i = 0; i < p.mapping_size(); i++){
                    Mapping m = p.mapping(i);
                    Position pos = m.position();
                    vg::id_t pos_id = pos.node_id();
                    map<string, set<Mapping*> > path_to_mappings =  my_vg->paths.get_node_mapping(pos_id);

                    if (path_to_mappings.size() <= 0){
                        return false;
                    }
                }
                return true;
            };

            /*
             * This means we now have vectors for the superbubble
             * that have the paths through the nodes within it (including end nodes)
             * however, these paths are repeated several times.
             * We should find a way to prevent them being inserted once for each node.
             *
             * Next on the agenda: use the get_path_dist thing from vg call / vg stats
             * to get the distance to the head node.
             * Might need an XG index for this.
             *
             * Also need a way to deal with GAMs for this i.e. a way to 
             * count the number of times we see something come up in the gam
             */
            // NOTE(review): node id 1 is hard-coded as the start of the
            // reference for the position computation - confirm for graphs
            // whose first node has a different id.
            int first_len = (my_vg->get_node(1))->sequence().size();
            map<string, set<Mapping*> > p_to_mappings =  my_vg->paths.get_node_mapping(s.first.first);
            // The loop overwrites sequenceName on each pass, so the last path
            // name in map order wins.
            for (auto p_name : p_to_mappings){
                var.sequenceName = p_name.first;
            }
            var.position = my_xg->approx_path_distance(var.sequenceName, 1, s.first.first) + (s.first.first == 1 ? 0 : first_len);

            //var.sequenceName = my_vg->paths.get_node_mapping(pos_id);
            //
            // Turn every distinct path into an allele
            for (auto x : paths){
                //cerr << path_to_string(x) << endl;
                stringstream ref_seq;
                stringstream alt_seq;
                bool is_ref = true;

                // Only interior mappings are used: the first and last mapping
                // of each path are skipped.
                for (int m_i = 1; m_i < x.mapping_size() -1 ; m_i++){
                    Mapping m = x.mapping(m_i);
                    id_t pos_id = m.position().node_id();
                    Node* n = my_vg->get_node(pos_id);
                    string n_seq = n->sequence();
                    map<string, set<Mapping*> > path_to_mappings =  my_vg->paths.get_node_mapping(pos_id);
                    // A node without any path mapping makes the whole path an
                    // alternate allele; the flag is never reset afterwards.
                    if (path_to_mappings.size() == 0){
                        is_ref = false;
                    }

                    if (is_ref){
                        ref_seq << n_seq;
                    }
                    alt_seq << n_seq;

                    //cerr << " REF: " << ref_seq.str() << " ALT: " << alt_seq.str() << endl;

                }

                if (is_ref){
                    // Only the first reference path (while var.ref is still
                    // empty) sets the reference allele.
                    if(var.ref.empty()){
                        string ref_str = ref_seq.str();
                        var.ref = ref_str; //(ref_str.size() > 0) ? ref_str : (my_vg->get_node(s.first.first))->sequence();
                        var.alleles.insert(var.alleles.begin(), var.ref);
                    }
                }
                else{
                    string alt_string = alt_seq.str();
                    var.alt.push_back(alt_string);
                    var.alleles.push_back(alt_string);
                }

            }
            // Skip records that ended up with neither a ref nor an alt allele
            if (! (var.ref.empty() && var.alt.empty()) ){
                cout << var << endl;
            }

        }

    }
Exemple #11
0
pair<size_t, size_t> Simplifier::simplify_once(size_t iteration) {

    // Set up the deleted node and edge counts
    pair<size_t, size_t> to_return {0, 0};
    auto& deleted_nodes = to_return.first;
    auto& deleted_edges = to_return.second;

    if(!graph.is_valid(true, true, true, true)) {
        // Make sure the graph is valid and not missing nodes or edges
        cerr << "error:[vg::Simplifier] Invalid graph on iteration " << iteration << endl;
        exit(1);
    }

    // Make a list of leaf sites
    list<const Snarl*> leaves;
    
    if (show_progress) {
        cerr << "Iteration " << iteration << ": Scanning " << graph.node_count() << " nodes and "
            << graph.edge_count() << " edges for sites..." << endl;
    }
    
    for (const Snarl* top_level_site : site_manager.top_level_snarls()) {
        list<const Snarl*> queue {top_level_site};
        
        while (queue.size()) {
            const Snarl* site = queue.front();
            queue.pop_front();
            
            if (site_manager.is_leaf(site)) {
                // It's a leaf. Filter it out if it is trivial
                
                if (site->type() == ULTRABUBBLE) {
                    auto contents = site_manager.shallow_contents(site, graph, false);
                    if (contents.first.empty()) {
                        // Nothing but the boundary nodes in this snarl
                        continue;
                    }
                }
            
                // Not trivial. Keep it.
                leaves.push_back(site);
            }
            else {
                for (const Snarl* child_site : site_manager.children_of(site)) {
                    queue.push_back(child_site);
                }
            }
        }
    }
    
    if (show_progress) {
        cerr << "Found " << leaves.size() << " leaves" << endl;
    }
    
    // Index all the graph paths
    map<string, unique_ptr<PathIndex>> path_indexes;
    graph.paths.for_each_name([&](const string& name) {
        // For every path name, go index it and put it in this collection
        path_indexes.insert(make_pair(name, move(unique_ptr<PathIndex>(new PathIndex(graph, name)))));
    });
    
    // Now we have a list of all the leaf sites.
    create_progress("simplifying leaves", leaves.size());
    
    // We can't use the SnarlManager after we modify the graph, so we load the
    // contents of all the leaves we're going to modify first.
    map<const Snarl*, pair<unordered_set<Node*>, unordered_set<Edge*>>> leaf_contents;
    
    // How big is each leaf in bp
    map<const Snarl*, size_t> leaf_sizes;
    
    // We also need to pre-calculate the traversals for the snarls that are the
    // right size, since the traversal finder uses the snarl manager amd might
    // not work if we modify the graph.
    map<const Snarl*, vector<SnarlTraversal>> leaf_traversals;
    
    for (const Snarl* leaf : leaves) {
        // Look at all the leaves
        
        // Get the contents of the bubble, excluding the boundary nodes
        leaf_contents[leaf] = site_manager.deep_contents(leaf, graph, false);
        
        // For each leaf, calculate its total size.
        unordered_set<Node*>& nodes = leaf_contents[leaf].first;
        size_t& total_size = leaf_sizes[leaf];
        for (Node* node : nodes) {
            // For each node include it in the size figure
            total_size += node->sequence().size();
        }
        
        if (total_size == 0) {
            // This site is just the start and end nodes, so it doesn't make
            // sense to try and remove it.
            continue;
        }
        
        if (total_size >= min_size) {
            // This site is too big to remove
            continue;
        }
        
        // Identify the replacement traversal for the bubble if it's the right size.
        // We can't necessarily do this after we've modified the graph.
        vector<SnarlTraversal>& traversals = leaf_traversals[leaf];
        traversals = traversal_finder.find_traversals(*leaf);
    }
    
    for (const Snarl* leaf : leaves) {
        // Look at all the leaves
        
        // Get the contents of the bubble, excluding the boundary nodes
        unordered_set<Node*>& nodes = leaf_contents[leaf].first;
        unordered_set<Edge*>& edges = leaf_contents[leaf].second;
        
        // For each leaf, grab its total size.
        size_t& total_size = leaf_sizes[leaf];
        
        if (total_size == 0) {
            // This site is just the start and end nodes, so it doesn't make
            // sense to try and remove it.
            continue;
        }
        
        if (total_size >= min_size) {
            // This site is too big to remove
            continue;
        }
        
#ifdef debug
        cerr << "Found " << total_size << " bp leaf" << endl;
        for (auto* node : nodes) {
            cerr << "\t" << node->id() << ": " << node->sequence() << endl;
        }
#endif
        
        // Otherwise we want to simplify this site away
        
        // Grab the replacement traversal for the bubble
        vector<SnarlTraversal>& traversals = leaf_traversals[leaf];
        
        if (traversals.empty()) {
            // We couldn't find any paths through the site.
            continue;
        }
        
        // Get the traversal out of the vector
        SnarlTraversal& traversal = traversals.front();
        
        // Determine the length of the new traversal
        size_t new_site_length = 0;
        for (size_t i = 1; i < traversal.visit_size() - 1; i++) {
            // For every non-anchoring node
            const Visit& visit = traversal.visit(i);
            // Total up the lengths of all the nodes that are newly visited.
            assert(visit.node_id());
            new_site_length += graph.get_node(visit.node_id())->sequence().size();
        }

#ifdef debug
        cerr << "Chosen traversal is " << new_site_length << " bp" << endl;
#endif
        
        // Now we have to rewrite paths that visit nodes/edges not on this
        // traversal, or in a different order, or whatever. To be safe we'll
        // just rewrite all paths.
        
        // Find all the paths that traverse this region.
        
        // We start at the start node. Copy out all the mapping pointers on that
        // node, so we can go through them while tampering with them.
        map<string, set<Mapping*> > mappings_by_path = graph.paths.get_node_mapping(graph.get_node(leaf->start().node_id()));
        
        // It's possible a path can enter the site through the end node and
        // never hit the start. So we're going to trim those back before we delete nodes and edges.
        map<string, set<Mapping*> > end_mappings_by_path = graph.paths.get_node_mapping(graph.get_node(leaf->end().node_id()));
        
        if (!drop_hairpin_paths) {
            // We shouldn't drop paths if they hairpin and can't be represented
            // in a simplified bubble. So we instead have to not simplify
            // bubbles that would have that problem.
            bool found_hairpin = false;
            
            for (auto& kv : mappings_by_path) {
                // For each path that hits the start node
                
                if (found_hairpin) {
                    // We only care if there are 1 or more hairpins, not how many
                    break;    
                }
                
                // Unpack the name
                auto& path_name = kv.first;
                
                for (Mapping* start_mapping : kv.second) {
                    // For each visit to the start node
                
                    if (found_hairpin) {
                        // We only care if there are 1 or more hairpins, not how many
                        break;    
                    }
                
                    // Determine what orientation we're going to scan in
                    bool backward = start_mapping->position().is_reverse();
                    
                    // Start at the start node
                    Mapping* here = start_mapping;
                    
                    while (here) {
                        // Until we hit the start/end of the path or the mapping we want
                        if (here->position().node_id() == leaf->end().node_id() &&
                            here->position().is_reverse() == (leaf->end().backward() != backward)) {
                            // We made it out.
                            // Stop scanning!
                            break;
                        }
                        
                        if (here->position().node_id() == leaf->start().node_id() &&
                            here->position().is_reverse() != (leaf->start().backward() != backward)) {
                            // We have encountered the start node with an incorrect orientation.
                            cerr << "warning:[vg simplify] Path " << path_name
                                << " doubles back through start of site "
                                << to_node_traversal(leaf->start(), graph) << " - "
                                << to_node_traversal(leaf->end(), graph) << "; skipping site!" << endl;
                                
                            found_hairpin = true;
                            break;
                        }
                        
                        // Scan left along ther path if we found the site start backwards, and right if we found it forwards.
                        here = backward ? graph.paths.traverse_left(here) : graph.paths.traverse_right(here);
                    }
                }
            }
            
            for (auto& kv : end_mappings_by_path) {
                // For each path that hits the end node
                
                if (found_hairpin) {
                    // We only care if there are 1 or more hairpins, not how many
                    break;
                }
                
                // Unpack the name
                auto& path_name = kv.first;
                
                for (Mapping* end_mapping : kv.second) {
                    
                    if (found_hairpin) {
                        // We only care if there are 1 or more hairpins, not how many
                        break;
                    }
                    
                    // Determine what orientation we're going to scan in
                    bool backward = end_mapping->position().is_reverse();
                    
                    // Start at the end
                    Mapping* here = end_mapping;
                    
                    while (here) {
                        
                        if (here->position().node_id() == leaf->start().node_id() &&
                            here->position().is_reverse() == (leaf->start().backward() != backward)) {
                            // We made it out.
                            // Stop scanning!
                            break;
                        }
                        
                        if (here->position().node_id() == leaf->end().node_id() &&
                            here->position().is_reverse() != (leaf->end().backward() != backward)) {
                            // We have encountered the end node with an incorrect orientation.
                            cerr << "warning:[vg simplify] Path " << path_name
                                << " doubles back through end of site "
                                << to_node_traversal(leaf->start(), graph) << " - "
                                << to_node_traversal(leaf->end(), graph) << "; dropping site!" << endl;
                            
                            found_hairpin = true;
                            break;
                        }
                        
                        // Scan right along the path if we found the site end backwards, and left if we found it forwards.
                        here = backward ? graph.paths.traverse_right(here) : graph.paths.traverse_left(here);
                        
                    }
                    
                }
                    
            }
            
            if (found_hairpin) {
                // We found a hairpin, so we want to skip the site.
                cerr << "warning:[vg simplify] Site " << to_node_traversal(leaf->start(), graph) << " - " << to_node_traversal(leaf->end(), graph) << " skipped due to hairpin path." << endl;
                continue;
            }
            
        }
        
        // We'll keep a set of the end mappings we managed to find, starting from the start
        set<Mapping*> found_end_mappings;
        
        for (auto& kv : mappings_by_path) {
            // For each path that hits the start node
            
            // Unpack the name
            auto& path_name = kv.first;
            
            // If a path can't be represented after a bubble is popped
            // (because the path reversed and came out the same side as it
            // went in), we just clobber the path entirely. TODO: handle
            // out-the-same-side traversals as valid genotypes somehow..
            bool kill_path = false;
            
            for (Mapping* start_mapping : kv.second) {
                // For each visit to the start node
                
                // Determine what orientation we're going to scan in
                bool backward = start_mapping->position().is_reverse();
                
                // We're going to fill this list with the mappings we need to
                // remove and replace in this path for this traversal. Initially
                // runs from start of site to end of site, but later gets
                // flipped into path-local orientation.
                list<Mapping*> existing_mappings;
                
                // Tracing along forward/backward from each as appropriate, see
                // if the end of the site is found in the expected orientation
                // (or if the path ends first).
                bool found_end = false;
                Mapping* here = start_mapping;
                
                // We want to remember the end mapping when we find it
                Mapping* end_mapping = nullptr;
                
#ifdef debug
                cerr << "Scanning " << path_name << " from " << pb2json(*here)
                    << " for " << to_node_traversal(leaf->end(), graph) << " orientation " << backward << endl;
#endif
                
                while (here) {
                    // Until we hit the start/end of the path or the mapping we want
                    
#ifdef debug
                    cerr << "\tat " << pb2json(*here) << endl;
#endif
                    
                    if (here->position().node_id() == leaf->end().node_id() &&
                        here->position().is_reverse() == (leaf->end().backward() != backward)) {
                        // We have encountered the end of the site in the
                        // orientation we expect, given the orientation we saw
                        // for the start.
                        
                        found_end = true;
                        end_mapping = here;
                        
                        // Know we got to this mapping at the end from the
                        // start, so we don't need to clobber everything
                        // before it.
                        found_end_mappings.insert(here);
                        
                        // Stop scanning!
                        break;
                    }
                    
                    if (here->position().node_id() == leaf->start().node_id() &&
                        here->position().is_reverse() != (leaf->start().backward() != backward)) {
                        // We have encountered the start node with an incorrect orientation.
                        cerr << "warning:[vg simplify] Path " << path_name
                            << " doubles back through start of site "
                            << to_node_traversal(leaf->start(), graph) << " - "
                            << to_node_traversal(leaf->end(), graph) << "; dropping!" << endl;
                            
                        assert(drop_hairpin_paths);
                        kill_path = true;
                        break;
                    }
                    
                    if (!nodes.count(graph.get_node(here->position().node_id()))) {
                        // We really should stay inside the site!
                        cerr << "error:[vg simplify] Path " << path_name
                            << " somehow escapes site " << to_node_traversal(leaf->start(), graph)
                            << " - " << to_node_traversal(leaf->end(), graph) << endl;
                            
                        exit(1);
                    }
                    
                    if (here != start_mapping) {
                        // Remember the mappings that aren't to the start or
                        // end of the site, so we can remove them later.
                        existing_mappings.push_back(here);
                    }
                    
                    // Scan left along ther path if we found the site start backwards, and right if we found it forwards.
                    Mapping* next = backward ? graph.paths.traverse_left(here) : graph.paths.traverse_right(here);
                    
                    if (next == nullptr) {
                        // We hit the end of the path without finding the end of the site.
                        // We've found all the existing mappings, so we can stop.
                        break;
                    }
                    
                    // Make into NodeTraversals
                    NodeTraversal here_traversal(graph.get_node(here->position().node_id()), here->position().is_reverse());
                    NodeTraversal next_traversal(graph.get_node(next->position().node_id()), next->position().is_reverse());
                    
                    if (backward) {
                        // We're scanning the other way
                        std::swap(here_traversal, next_traversal);
                    }
                    
                    // Make sure we have an edge so we can traverse this node and then the node we're going to.
                    if(graph.get_edge(here_traversal, next_traversal) == nullptr) {
                        cerr << "error:[vg::Simplifier] No edge " << here_traversal << " to " << next_traversal << endl;
                        exit(1);
                    }
                    
                    here = next;
                }
                
                if (kill_path) {
                    // This path can't exist after we pop this bubble.
                    break;
                }
                
                
                if (!found_end) {
                    // This path only partly traverses the site, and is
                    // anchored at the start. Remove the part inside the site.
                    
                    // TODO: let it stay if it matches the one true traversal.
                                 
                    for(auto* mapping : existing_mappings) {
                        // Trim the path out of the site
                        graph.paths.remove_mapping(mapping);
                    }
                    
                    // TODO: update feature positions if we trim off the start of a path

                    // Maybe the next time the path visits the site it will go
                    // all the way through.
                    continue;
                }
                
                // If we found the end, remove all the mappings encountered, in
                // order so that the last one removed is the last one along the
                // path.
                if (backward) {
                    // Make sure the last mapping in the list is the last
                    // mapping to occur along the path.
                    existing_mappings.reverse();
                }
                
                // Where does the variable region of the site start for this
                // traversal of the path? If there are no existing mappings,
                // it's the start mapping's position if we traverse the site
                // backwards and the end mapping's position if we traverse
                // the site forwards. If there are existing mappings, it's
                // the first existing mapping's position in the path. TODO:
                // This is super ugly. Can we view the site in path
                // coordinates or something?
                PathIndex& path_index = *path_indexes.at(path_name).get();
                Mapping* mapping_after_first = existing_mappings.empty() ?
                    (backward ? start_mapping : end_mapping) : existing_mappings.front();
                assert(path_index.mapping_positions.count(mapping_after_first));
                size_t variable_start = path_index.mapping_positions.at(mapping_after_first); 
                
                
                
                // Determine the total length of the old traversal of the site
                size_t old_site_length = 0;
                for (auto* mapping : existing_mappings) {
                    // Add in the lengths of all the mappings that will get
                    // removed.
                    old_site_length += mapping_from_length(*mapping);
                }
#ifdef debug
                cerr << "Replacing " << old_site_length << " bp at " << variable_start
                    << " with " << new_site_length << " bp" << endl;
#endif

                // Actually update any BED features
                features.on_path_edit(path_name, variable_start, old_site_length, new_site_length);
                
                // Where will we insert the new site traversal into the path?
                list<Mapping>::iterator insert_position;
                
                if (!existing_mappings.empty()) {
                    // If there are existing internal mappings, we'll insert right where they were
                    
                    for (auto* mapping : existing_mappings) {
                        // Remove each mapping from left to right along the
                        // path, saving the position after the mapping we just
                        // removed. At the end we'll have the position of the
                        // mapping to the end of the site.
                        
#ifdef debug
                        cerr << path_name << ": Drop mapping " << pb2json(*mapping) << endl;
#endif
                        
                        insert_position = graph.paths.remove_mapping(mapping);
                    }
                } else {
                    // Otherwise we'll insert right before the mapping to
                    // the start or end of the site (whichever occurs last
                    // along the path)
                    insert_position = graph.paths.find_mapping(backward ? start_mapping : here);
                }
                
                // Make sure we're going to insert starting from the correct end of the site.
                if (backward) {
                    assert(insert_position->position().node_id() == leaf->start().node_id());
                } else {
                    assert(insert_position->position().node_id() == leaf->end().node_id());
                }
                
                // Loop through the internal visits in the canonical
                // traversal backwards along the path we are splicing. If
                // it's a forward path this is just right to left, but if
                // it's a reverse path it has to be left to right.
                for (size_t i = 0; i < traversal.visit_size(); i++) {
                    // Find the visit we need next, as a function of which
                    // way we need to insert this run of visits. Normally we
                    // go through the visits right to left, but when we have
                    // a backward path we go left to right.
                    const Visit& visit = backward ? traversal.visit(i)
                                                  : traversal.visit(traversal.visit_size() - i - 1);
                    
                    // Make a Mapping to represent it
                    Mapping new_mapping;
                    new_mapping.mutable_position()->set_node_id(visit.node_id());
                    // We hit this node backward if it's backward along the
                    // traversal, xor if we are traversing the traversal
                    // backward
                    new_mapping.mutable_position()->set_is_reverse(visit.backward() != backward);
                    
                    // Add an edit
                    Edit* edit = new_mapping.add_edit();
                    size_t node_seq_length = graph.get_node(visit.node_id())->sequence().size();
                    edit->set_from_length(node_seq_length);
                    edit->set_to_length(node_seq_length);
                    
#ifdef debug
                    cerr << path_name << ": Add mapping " << pb2json(new_mapping) << endl;
#endif
                    
                    // Insert the mapping in the path, moving right to left
                    insert_position = graph.paths.insert_mapping(insert_position, path_name, new_mapping);
                    
                }
                
                // Now we've corrected this site on this path. Update its index.
                // TODO: right now this means retracing the entire path.
                path_indexes[path_name].get()->update_mapping_positions(graph, path_name);
            }
            
            if (kill_path) {
                // Destroy the path completely, because it needs to reverse
                // inside a site that we have popped.
                graph.paths.remove_path(path_name);
            }
            
        }
        
        for (auto& kv : end_mappings_by_path) {
            // Now we handle the end mappings not reachable from the start. For each path that touches the end...
        
            // Unpack the name
            auto& path_name = kv.first;
            
            // We might have to kill the path, if it reverses inside a
            // bubble we're popping
            bool kill_path = false;
            
            for (Mapping* end_mapping : kv.second) {
                if (found_end_mappings.count(end_mapping)) {
                    // Skip the traversals of the site that we handled.
                    continue;
                }
                
                // Now we're left with paths that leave the site but don't
                // enter. We're going to clobber everything before the path
                // leaves the site.
                
                // Determine what orientation we're going to scan in
                bool backward = end_mapping->position().is_reverse();
                
                // Start at the end
                Mapping* here = end_mapping;
                
                // Keep a list of mappings we need to remove
                list<Mapping*> to_remove;
                
                while (here) {
                    
                    if (here->position().node_id() == leaf->end().node_id() &&
                        here->position().is_reverse() != (leaf->end().backward() != backward)) {
                        // We have encountered the end node with an incorrect orientation.
                        cerr << "warning:[vg simplify] Path " << path_name
                            << " doubles back through end of site "
                            << to_node_traversal(leaf->start(), graph) << " - "
                            << to_node_traversal(leaf->end(), graph) << "; dropping!" << endl;
                            
                        assert(drop_hairpin_paths);
                        kill_path = true;
                        break;
                    }
                    
                    // Say we should remove the mapping.
                    to_remove.push_back(here);
                    
                    // Scan right along the path if we found the site end backwards, and left if we found it forwards.
                    here = backward ? graph.paths.traverse_right(here) : graph.paths.traverse_left(here);
                    
                    // Eventually we should hit the end of the path, or the
                    // end of the site, since we don't hit the start.
                    
                }
                
                if (kill_path) {
                    // Just go kill the whole path
                    break;
                }
                
                for (auto* mapping: to_remove) {
                    // Get rid of all the mappings once we're done tracing them out.
                    graph.paths.remove_mapping(mapping);
                }
                
            }
            
            if (kill_path) {
                // Destroy the path completely, because it needs to reverse
                // inside a site that we have popped.
                graph.paths.remove_path(path_name);
            }
        }
        
        // Now delete all edges that aren't connecting adjacent nodes on the
        // blessed traversal (before we delete their nodes).
        set<Edge*> blessed_edges;
        for (int i = 0; i < traversal.visit_size() - 1; ++i) {
            // For each node and the next node (which won't be the end)
            
            const Visit visit = traversal.visit(i);
            const Visit next = traversal.visit(i);
            
            // Find the edge between them
            NodeTraversal here(graph.get_node(visit.node_id()), visit.backward());
            NodeTraversal next_traversal(graph.get_node(next.node_id()), next.backward());
            Edge* edge = graph.get_edge(here, next_traversal);
            assert(edge != nullptr);
            
            // Remember we need it
            blessed_edges.insert(edge);
        }
        
        // Also get the edges from the boundary nodes into the traversal
        if (traversal.visit_size() > 0) {
            NodeTraversal first_visit = to_node_traversal(traversal.visit(0), graph);
            NodeTraversal last_visit = to_node_traversal(traversal.visit(traversal.visit_size() - 1),
                                                         graph);
            blessed_edges.insert(graph.get_edge(to_node_traversal(leaf->start(), graph), first_visit));
            blessed_edges.insert(graph.get_edge(last_visit, to_node_traversal(leaf->end(), graph)));
        }
        else {
            // This is a deletion traversal, so get the edge from the start to end of the site
            blessed_edges.insert(graph.get_edge(to_node_traversal(leaf->start(), graph),
                                                to_node_traversal(leaf->end(), graph)));
        }
        
        for (auto* edge : edges) {
            if (!blessed_edges.count(edge)) {
                // Get rid of all the edges not needed for the one true traversal
#ifdef debug
                cerr << to_node_traversal(leaf->start(), graph) << " - "
                     << to_node_traversal(leaf->end(), graph) << ": Delete edge: "
                     << pb2json(*edge) << endl;
#endif
                graph.destroy_edge(edge);
                deleted_edges++;
            }
        }
       
           
        // Now delete all the nodes that aren't on the blessed traversal.
        
        // What nodes are on it?
        set<Node*> blessed_nodes;
        for (int i = 0; i < traversal.visit_size(); i++) {
            const Visit& visit = traversal.visit(i);
            blessed_nodes.insert(graph.get_node(visit.node_id()));
        }
        
        for (auto* node : nodes) {
            // For every node in the site
            if (!blessed_nodes.count(node)) {
                // If we don't need it for the chosen path, destroy it
#ifdef debug
                cerr << to_node_traversal(leaf->start(), graph) << " - "
                     << to_node_traversal(leaf->end(), graph) << ": Delete node: "
                     << pb2json(*node) << endl;
#endif
                // There may be paths still touching this node, if they
                // managed to get into the site without touching the start
                // node. We'll delete those paths.
                set<string> paths_to_kill;
                for (auto& kv : graph.paths.get_node_mapping(node)) {
                
                    if (mappings_by_path.count(kv.first)) {
                        // We've already actually updated this path; the
                        // node_mapping data is just out of date.
                        continue;
                    }
                
                    paths_to_kill.insert(kv.first);
                }
                for (auto& path : paths_to_kill) {
                    graph.paths.remove_path(path);
                    cerr << "warning:[vg simplify] Path " << path << " removed" << endl;
                }

                graph.destroy_node(node);
                
                deleted_nodes++;
            }
        }
        
        // OK we finished a leaf
        increment_progress();
    }
    
    destroy_progress();
    
    // Reset the ranks in the graph, since we rewrote paths
    graph.paths.clear_mapping_ranks();
    
    // Return the statistics.
    return to_return;

}
Exemple #12
0
    /// Filter an alignment by how many times each of its edits has been seen.
    ///
    /// Every edit that is considered gets tallied in pos_to_edit_to_depth,
    /// keyed first by a position string ("<node id>_<offset>") and then by an
    /// edit string ("<from_length>_<to_length>_<sequence>"). Edits whose
    /// from_length equals their to_length and which carry no sequence are
    /// only considered when filter_matches is set.
    ///
    /// @param aln the alignment to examine; it is not modified.
    /// @return For the first considered edit whose updated tally is below
    ///         min_depth: if remove_failing_edits is unset, `aln` when
    ///         `inverse` is set and an empty Alignment otherwise; if
    ///         remove_failing_edits is set, a copy of `aln` in which that one
    ///         edit has its sequence cleared and its to_length set equal to
    ///         its from_length. If no edit falls below min_depth, an empty
    ///         Alignment when `inverse` is set and a copy of `aln` otherwise.
    Alignment Filter::depth_filter(Alignment& aln){
        // TODO: use_avg and window_length are not consulted here yet; the
        // averaging / windowed modes have no implementation.
        //TODO handle reversing mappings
        const Path& path = aln.path();

        for (int i = 0; i < path.mapping_size(); i++){
            const Mapping& mapping = path.mapping(i);
            const int64_t start_node = mapping.position().node_id();
            // NOTE(review): the offset component of the position key is
            // always 0; the mapping's own start offset is not folded in.
            // Confirm whether per-offset keys were intended.
            const int64_t curr_offset_in_graph = 0;

            stringstream pst;
            pst << start_node << "_" << curr_offset_in_graph;
            const string p_hash = pst.str();

            for (int j = 0; j < mapping.edit_size(); j++){
                const Edit& ee = mapping.edit(j);
                if (ee.from_length() == ee.to_length() && ee.sequence().empty()){
                    if (!filter_matches){
                        continue;
                    }
                }
                stringstream est;
                est << ee.from_length() << "_" << ee.to_length() << "_" << ee.sequence();
                const string e_hash = est.str();

                // Bump the tally and compare it against the threshold inside
                // the same critical section, so the lookup cannot run while
                // another thread is inserting into the shared map.
                bool below_min_depth = false;
#pragma omp critical(write)
                {
                    auto& depth = pos_to_edit_to_depth[p_hash][e_hash];
                    depth += 1;
                    below_min_depth = depth < min_depth;
                }

                /**
                 * If an edit fails the filter, either return a new empty alignment
                 * OR
                 * return a new alignment identical to the old one EXCEPT where
                 * the offending edit has been replaced by a match to the reference.
                 */
                if (below_min_depth){
                    if (!remove_failing_edits){
                        return inverse ? aln : Alignment();
                    }

                    Alignment edited_aln = Alignment(aln);
                    Edit* failing_edit = edited_aln.mutable_path()->mutable_mapping(i)->mutable_edit(j);
                    failing_edit->set_sequence("");
                    failing_edit->set_from_length(ee.from_length());
                    failing_edit->set_to_length(ee.from_length());
                    return edited_aln;
                }
            }
        }

        // Every mapping has been examined (or the path was empty) and no edit
        // fell below the depth threshold. Returning here, after the loop,
        // guarantees a value on every control path.
        return inverse ? Alignment() : aln;
    }