Example No. 1
void Pileups::compute_from_alignment(VG& graph, Alignment& alignment) {
    // if the alignment starts on the reverse strand, flip it to forward orientation
    if (alignment.has_path() && alignment.path().mapping(0).position().is_reverse()) {
        alignment = reverse_alignment(alignment,
                                      (function<int64_t(int64_t)>) ([&graph](int64_t id) {
                                          return graph.get_node(id)->sequence().size();
                                          }));
    }
    const Path& path = alignment.path();
    int64_t read_offset = 0;
    for (int i = 0; i < path.mapping_size(); ++i) {
        const Mapping& mapping = path.mapping(i);
        if (graph.has_node(mapping.position().node_id())) {
            const Node* node = graph.get_node(mapping.position().node_id());
            NodePileup* pileup = get_create(node->id());
            int64_t node_offset = mapping.position().offset();
            for (int j = 0; j < mapping.edit_size(); ++j) {
                const Edit& edit = mapping.edit(j);
                // process all pileups in the edit,
                // updating the node and read offsets as we go
                compute_from_edit(*pileup, node_offset, read_offset, *node,
                                  alignment, mapping, edit);
            }
        }
    }
    assert(alignment.sequence().empty() ||
           alignment.path().mapping_size() == 0 ||
           read_offset == alignment.sequence().length());
}
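A minimal usage sketch (not part of the original source): assuming a Pileups object built for the same graph, compute_from_alignment can be driven over a GAM stream with stream::for_each, as other examples in this collection do; accumulate_pileups is a hypothetical wrapper name.

// Sketch only: accumulate pileups from every alignment in a GAM stream.
// Assumes `pileups` was constructed for `graph`.
void accumulate_pileups(VG& graph, Pileups& pileups, istream& gam_in) {
    stream::for_each<Alignment>(gam_in, [&](Alignment& aln) {
        // each alignment contributes base and edit observations per node
        pileups.compute_from_alignment(graph, aln);
    });
}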
Example No. 2
int main_concat(int argc, char** argv) {

    if (argc == 2) {
        help_concat(argv);
        return 1;
    }

    int c;
    optind = 2; // force optind past command positional argument
    while (true) {
        static struct option long_options[] =
        {
            {"help", no_argument, 0, 'h'},
            {0, 0, 0, 0}
        };

        int option_index = 0;
        c = getopt_long (argc, argv, "h",
                long_options, &option_index);

        // Detect the end of the options.
        if (c == -1)
            break;

        switch (c)
        {
            case 'h':
            case '?':
                help_concat(argv);
                exit(1);
                break;

            default:
                abort ();
        }
    }

    list<VG*> graphs;

    while (optind < argc) {
        VG* graph = nullptr;
        get_input_file(optind, argc, argv, [&](istream& in) {
            graph = new VG(in);
        });
        graphs.push_back(graph);
    }

    VG merged;
    for (list<VG*>::iterator g = graphs.begin(); g != graphs.end(); ++g) {
        merged.append(**g);
    }

    // output
    merged.serialize_to_ostream(std::cout);

    return 0;
}
Example No. 3
string Sampler::alignment_seq(const Alignment& aln) {
    // get the graph corresponding to the alignment path
    Graph sub;
    for (int i = 0; i < aln.path().mapping_size(); ++i) {
        auto& m = aln.path().mapping(i);
        if (m.has_position() && m.position().node_id()) {
            auto id = m.position().node_id();
            xgidx->neighborhood(id, 2, sub);
        }
    }
    VG g; g.extend(sub);
    return g.path_string(aln.path());
}
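A small usage sketch (illustrative, not from the original source): given a Sampler that already holds its xg index (the xgidx member used above), alignment_seq can be compared with the read sequence to sanity-check an alignment's path; path_matches_read is a hypothetical helper name.

// Sketch only: does the alignment's path spell the same sequence the read carries?
bool path_matches_read(Sampler& sampler, const Alignment& aln) {
    return sampler.alignment_seq(aln) == aln.sequence();
}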
Example No. 4
void VGset::transform(std::function<void(VG*)> lambda) {
    for (auto& name : filenames) {
        // load
        VG* g = NULL;
        if (name == "-") {
            g = new VG(std::cin, show_progress);
        } else {
            ifstream in(name.c_str());
            g = new VG(in, show_progress);
            in.close();
        }
        g->name = name;
        // apply
        lambda(g);
        // write to the same file
        ofstream out(name.c_str());
        g->serialize_to_ostream(out);
        out.close();
        delete g;
    }
}
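A usage sketch (illustrative only): transform rewrites each listed graph file in place, so a caller needs a VGset populated with filenames and a lambda that edits the loaded graph. Direct access to the filenames member and the choice of compact_ids as the edit are assumptions, not taken from the original code.

// Sketch only: compact node IDs in every listed graph file, rewriting in place.
VGset graph_set;
graph_set.filenames = {"a.vg", "b.vg"}; // assumes filenames is accessible as the loop above implies
graph_set.transform([](VG* g) {
    g->compact_ids(); // any in-place mutation of the loaded graph works here
});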
Example No. 5
	VG handle_to_vg(const HandleGraph* xg) {
		// If xg is a null pointer, throw a runtime error
		if (xg == nullptr) {
			throw runtime_error("There is no xg to convert"); 
		} 
		// Initialize the VG graph
		VG vg;
		// Iterate through each handle in xg and create the same handle in vg
		xg->for_each_handle([&](const handle_t& here) {
			// Get the id of the xg handle
			id_t xg_id = xg->get_id(here);
			// Get the sequence of the xg handle
			string xg_seq = xg->get_sequence(here);
			// Create a handle in vg using the xg id and sequence
			vg.create_handle(xg_seq,xg_id);
		});
		// Iterate through each handle in xg 
		xg->for_each_handle([&](const handle_t& handle) {
			id_t id = xg->get_id(handle);
			bool rev = xg->get_is_reverse(handle);
			// Get the corresponding vg handle using the xg handle's id and orientation
			handle_t current = vg.get_handle(id,rev);
			// Follow the right edges of the xg handle
			xg->follow_edges(handle, false, [&](const handle_t& r) {
				id_t id_r = xg->get_id(r);
				bool rev_r = xg->get_is_reverse(r);
				// Get the corresponding vg handle for the right neighbor
				handle_t next = vg.get_handle(id_r, rev_r);
				// Create an edge in vg using the handles 
				vg.create_edge(current,next);
			});
			// Follow the left edges of the xg handle
			xg->follow_edges(handle, true, [&](const handle_t& l) {
				id_t id_l = xg->get_id(l);
				bool rev_l = xg->get_is_reverse(l);
				// Get the corresponding vg handle for the left neighbor
				handle_t prev = vg.get_handle(id_l, rev_l);
				// Create the edge in vg from the left neighbor into the current handle
				vg.create_edge(prev, current);
			});
		});
		return vg;
	}
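A usage sketch (illustrative, not from the original source): load an xg index and convert it to a mutable VG graph, assuming xg::XG can be passed where this function expects a HandleGraph pointer.

// Sketch only: convert an xg index into a VG graph and stream it to stdout.
ifstream in("index.xg");
xg::XG xindex;
xindex.load(in);
VG converted = handle_to_vg(&xindex);
converted.serialize_to_ostream(std::cout);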
Example No. 6
// add all node traversals that are valid walks from this one onto a stack
void stack_up_valid_walks(VG& graph, NodeTraversal walk_head, vector<NodeTraversal>& stack) {
    
    id_t head_id = walk_head.node->id();
    
    if (walk_head.backward) {
        // we are leaving from the start of the node
        
        // get all edges involving this node so we can filter them down to valid walks
        for (Edge* edge : graph.edges_of(walk_head.node)) {
            if (edge->from() == head_id && edge->from_start()) {
                // the edge is part of a valid walk
                Node* next_node = graph.get_node(edge->to());
                bool next_backward = edge->to_end();
                // add the next traversal in the walk to the stack
                stack.push_back(NodeTraversal(next_node, next_backward));
            }
            else if (edge->to() == head_id && !edge->to_end()) {
                // the edge is part of a valid walk in the opposite orientation
                Node* next_node = graph.get_node(edge->from());
                bool next_backward = !edge->from_start();
                // add the next traversal in the walk to the stack
                stack.push_back(NodeTraversal(next_node, next_backward));
            }
        }
    }
    else {
        // we are leaving from the end of the node
        
        // get all edges involving this node so we can filter them down to valid walks
        for (Edge* edge : graph.edges_of(walk_head.node)) {
            if (edge->from() == head_id && !edge->from_start()) {
                // the edge is part of a valid walk
                Node* next_node = graph.get_node(edge->to());
                bool next_backward = edge->to_end();
                // add the next traversal in the walk to the stack
                stack.push_back(NodeTraversal(next_node, next_backward));
            }
            else if (edge->to() == head_id && edge->to_end()) {
                // the edge is part of a valid walk in the opposite orientation
                Node* next_node = graph.get_node(edge->from());
                bool next_backward = !edge->from_start();
                // add the next traversal in the walk to the stack
                stack.push_back(NodeTraversal(next_node, next_backward));
            }
        }
    }
}
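A sketch of how this helper might drive a bounded depth-first enumeration of walks (illustrative only; the step bound and the printed output format are assumptions):

// Sketch only: expand traversals depth-first from `start`, printing each
// visited node with its orientation, for at most max_steps expansions.
void enumerate_walks(VG& graph, NodeTraversal start, int max_steps) {
    vector<NodeTraversal> stack = { start };
    int steps = 0;
    while (!stack.empty() && steps < max_steps) {
        NodeTraversal here = stack.back();
        stack.pop_back();
        cout << here.node->id() << (here.backward ? "-" : "+") << endl;
        // push every traversal reachable from `here` in its current orientation
        stack_up_valid_walks(graph, here, stack);
        ++steps;
    }
}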
Example No. 7
int main_validate(int argc, char** argv) {

    if (argc <= 2) {
        help_validate(argv);
        return 1;
    }

    bool check_nodes = false;
    bool check_edges = false;
    bool check_orphans = false;
    bool check_paths = false;
    string xg_path;
    string gam_path;

    int c;
    optind = 2; // force optind past command positional argument
    while (true) {
        static struct option long_options[] =
        {
            {"help", no_argument, 0, 'h'},
            {"nodes", no_argument, 0, 'n'},
            {"edges", no_argument, 0, 'e'},
            {"paths", no_argument, 0, 'o'},
            {"orphans", no_argument, 0, 'p'},
            {"gam", required_argument, 0, 'a'},
            {"xg", required_argument, 0, 'x'},
            {0, 0, 0, 0}
        };

        int option_index = 0;
        c = getopt_long (argc, argv, "hneopa:x:",
                long_options, &option_index);

        // Detect the end of the options.
        if (c == -1)
            break;

        switch (c)
        {

            case 'n':
                check_nodes = true;
                break;

            case 'e':
                check_edges = true;
                break;

            case 'o':
                check_orphans = true;
                break;

            case 'p':
                check_paths = true;
                break;

            case 'a':
                gam_path = optarg;
                break;

            case 'x':
                xg_path= optarg;
                break;

            case 'h':
            case '?':
                help_validate(argv);
                exit(1);
                break;

            default:
                abort ();
        }
    }

    if (!gam_path.empty() || !xg_path.empty()) {
        // GAM validation is entirely its own thing
        if (xg_path.empty()) {
            cerr << "error:[vg validate] xg index (-x) required with (-a)" << endl;
            return 1;
        } else if (gam_path.empty()) {
            cerr << "error:[vg validate] gam alignment (-a) required with (-x)" << endl;
            return 1;
        } else if (check_nodes || check_edges || check_orphans || check_paths) {
            cerr << "error:[vg validate] -n, -e -o, -p cannot be used with -a and -x" << endl;
            return 1;
        }
        ifstream in(xg_path.c_str());
        unique_ptr<xg::XG> xindex = stream::VPKG::load_one<xg::XG>(in);
        in.close();
        get_input_file(gam_path, [&](istream& in) {
                stream::for_each<Alignment>(in, [&](Alignment& aln) {
                        if (!alignment_is_valid(aln, xindex.get())) {
                            exit(1);
                        }
                    });
            });
        return 0;
    } else {

        VG* graph = nullptr;
        get_input_file(optind, argc, argv, [&](istream& in) {
                graph = new VG(in);
            });

        // if we chose a specific subset, do just them
        if (check_nodes || check_edges || check_orphans || check_paths) {
            if (graph->is_valid(check_nodes, check_edges, check_orphans, check_paths)) {
                return 0;
            } else {
                return 1;
            }
            // otherwise do everything
        } else if (graph->is_valid()) {
            return 0;
        } else {
            return 1;
        }
    }
}
Example No. 8
int main_find(int argc, char** argv) {

    if (argc == 2) {
        help_find(argv);
        return 1;
    }

    string db_name;
    string sequence;
    int kmer_size=0;
    int kmer_stride = 1;
    vector<string> kmers;
    vector<vg::id_t> node_ids;
    string node_list_file;
    int context_size=0;
    bool use_length = false;
    bool count_kmers = false;
    bool kmer_table = false;
    vector<string> targets;
    string path_name;
    bool position_in = false;
    bool rank_in = false;
    string range;
    string gcsa_in;
    string xg_name;
    bool get_mems = false;
    int mem_reseed_length = 0;
    bool use_fast_reseed = true;
    bool get_alignments = false;
    bool get_mappings = false;
    string node_id_range;
    string aln_on_id_range;
    vg::id_t start_id = 0;
    vg::id_t end_id = 0;
    bool pairwise_distance = false;
    string haplotype_alignments;
    string gam_file;
    int max_mem_length = 0;
    int min_mem_length = 1;
    string to_graph_file;
    bool extract_threads = false;
    vector<string> extract_patterns;
    vg::id_t approx_id = 0;

    int c;
    optind = 2; // force optind past command positional argument
    while (true) {
        static struct option long_options[] =
            {
                //{"verbose", no_argument,       &verbose_flag, 1},
                {"db-name", required_argument, 0, 'd'},
                {"xg-name", required_argument, 0, 'x'},
                {"gcsa", required_argument, 0, 'g'},
                {"node", required_argument, 0, 'n'},
                {"node-list", required_argument, 0, 'N'},
                {"edges-end", required_argument, 0, 'e'},
                {"edges-start", required_argument, 0, 's'},
                {"kmer", required_argument, 0, 'k'},
                {"table", no_argument, 0, 'T'},
                {"sequence", required_argument, 0, 'S'},
                {"mems", required_argument, 0, 'M'},
                {"reseed-length", required_argument, 0, 'B'},
                {"fast-reseed", no_argument, 0, 'f'},
                {"kmer-stride", required_argument, 0, 'j'},
                {"kmer-size", required_argument, 0, 'z'},
                {"context", required_argument, 0, 'c'},
                {"use-length", no_argument, 0, 'L'},
                {"kmer-count", no_argument, 0, 'C'},
                {"path", required_argument, 0, 'p'},
                {"position-in", required_argument, 0, 'P'},
                {"rank-in", required_argument, 0, 'R'},
                {"node-range", required_argument, 0, 'r'},
                {"alignments", no_argument, 0, 'a'},
                {"mappings", no_argument, 0, 'm'},
                {"alns-in", required_argument, 0, 'i'},
                {"alns-on", required_argument, 0, 'o'},
                {"distance", no_argument, 0, 'D'},
                {"haplotypes", required_argument, 0, 'H'},
                {"gam", required_argument, 0, 'G'},
                {"to-graph", required_argument, 0, 'A'},
                {"max-mem", required_argument, 0, 'Y'},
                {"min-mem", required_argument, 0, 'Z'},
                {"extract-threads", no_argument, 0, 't'},
                {"threads-named", required_argument, 0, 'q'},
                {"approx-pos", required_argument, 0, 'X'},
                {0, 0, 0, 0}
            };

        int option_index = 0;
        c = getopt_long (argc, argv, "d:x:n:e:s:o:k:hc:LS:z:j:CTp:P:r:amg:M:R:B:fi:DH:G:N:A:Y:Z:tq:X:",
                         long_options, &option_index);

        // Detect the end of the options.
        if (c == -1)
            break;

        switch (c)
        {
        case 'd':
            db_name = optarg;
            break;

        case 'x':
            xg_name = optarg;
            break;

        case 'g':
            gcsa_in = optarg;
            break;

        case 'k':
            kmers.push_back(optarg);
            break;

        case 'S':
            sequence = optarg;
            break;

        case 'M':
            sequence = optarg;
            get_mems = true;
            break;
            
        case 'B':
            mem_reseed_length = atoi(optarg);
            break;
            
        case 'f':
            use_fast_reseed = true;
            break;

        case 'Y':
            max_mem_length = atoi(optarg);
            break;
            
        case 'Z':
            min_mem_length = atoi(optarg);
            break;
            
        case 'j':
            kmer_stride = atoi(optarg);
            break;

        case 'z':
            kmer_size = atoi(optarg);
            break;

        case 'C':
            count_kmers = true;
            break;

        case 'p':
            targets.push_back(optarg);
            break;

        case 'P':
            path_name = optarg;
            position_in = true;
            break;

        case 'R':
            path_name = optarg;
            rank_in = true;
            break;

        case 'c':
            context_size = atoi(optarg);
            break;

        case 'L':
            use_length = true;
            break;

        case 'n':
            node_ids.push_back(atoi(optarg));
            break;

        case 'N':
            node_list_file = optarg;
            break;

        case 'e':
            end_id = atoi(optarg);
            break;

        case 's':
            start_id = atoi(optarg);
            break;

        case 'T':
            kmer_table = true;
            break;

        case 'r':
            range = optarg;
            break;

        case 'a':
            get_alignments = true;
            break;

        case 'i':
            node_id_range = optarg;
            break;

        case 'm':
            get_mappings = true;
            break;

        case 'o':
            aln_on_id_range = optarg;
            break;

        case 'D':
            pairwise_distance = true;
            break;

        case 'H':
            haplotype_alignments = optarg;
            break;

        case 't':
            extract_threads = true;
            break;

        case 'q':
            extract_threads = true;
            extract_patterns.push_back(optarg);
            break;

        case 'X':
            approx_id = atoi(optarg);
            break;

        case 'G':
            gam_file = optarg;
            break;

        case 'A':
            to_graph_file = optarg;
            break;

        case 'h':
        case '?':
            help_find(argv);
            exit(1);
            break;

        default:
            abort ();
        }
    }
    if (optind < argc) {
        cerr << "[vg find] find does not accept positional arguments" << endl;
        return 1;
    }

    if (db_name.empty() && gcsa_in.empty() && xg_name.empty()) {
        cerr << "[vg find] find requires -d, -g, or -x to know where to find its database" << endl;
        return 1;
    }

    if (context_size > 0 && use_length == true && xg_name.empty()) {
        cerr << "[vg find] error, -L not supported without -x" << endl;
        exit(1);
    }
    
    if (xg_name.empty() && mem_reseed_length) {
        cerr << "error:[vg find] SMEM reseeding requires an XG index. Provide XG index with -x." << endl;
        exit(1);
    }
    
    // process input node list
    if (!node_list_file.empty()) {
        ifstream nli;
        nli.open(node_list_file);
        if (!nli.good()){
            cerr << "[vg find] error, unable to open the node list input file." << endl;
            exit(1);
        }
        string line;
        while (getline(nli, line)){
            for (auto& idstr : split_delims(line, " \t")) {
                node_ids.push_back(atol(idstr.c_str()));
            }
        }
        nli.close();
    }

    // open index
    Index* vindex = nullptr;
    if (db_name.empty()) {
        assert(!gcsa_in.empty() || !xg_name.empty());
    } else {
        vindex = new Index;
        vindex->open_read_only(db_name);
    }

    xg::XG xindex;
    if (!xg_name.empty()) {
        ifstream in(xg_name.c_str());
        xindex.load(in);
    }

    if (get_alignments) {
        assert(!db_name.empty());
        vector<Alignment> output_buf;
        auto lambda = [&output_buf](const Alignment& aln) {
            output_buf.push_back(aln);
            stream::write_buffered(cout, output_buf, 100);
        };
        vindex->for_each_alignment(lambda);
        stream::write_buffered(cout, output_buf, 0);
    }

    if (!node_id_range.empty()) {
        assert(!db_name.empty());
        vector<string> parts = split_delims(node_id_range, ":");
        if (parts.size() == 1) {
            convert(parts.front(), start_id);
            end_id = start_id;
        } else {
            convert(parts.front(), start_id);
            convert(parts.back(), end_id);
        }
        vector<Alignment> output_buf;
        auto lambda = [&output_buf](const Alignment& aln) {
            output_buf.push_back(aln);
            stream::write_buffered(cout, output_buf, 100);
        };
        vindex->for_alignment_in_range(start_id, end_id, lambda);
        stream::write_buffered(cout, output_buf, 0);
    }

    if (!aln_on_id_range.empty()) {
        assert(!db_name.empty());
        vector<string> parts = split_delims(aln_on_id_range, ":");
        if (parts.size() == 1) {
            convert(parts.front(), start_id);
            end_id = start_id;
        } else {
            convert(parts.front(), start_id);
            convert(parts.back(), end_id);
        }
        vector<vg::id_t> ids;
        for (auto i = start_id; i <= end_id; ++i) {
            ids.push_back(i);
        }
        vector<Alignment> output_buf;
        auto lambda = [&output_buf](const Alignment& aln) {
            output_buf.push_back(aln);
            stream::write_buffered(cout, output_buf, 100);
        };
        vindex->for_alignment_to_nodes(ids, lambda);
        stream::write_buffered(cout, output_buf, 0);
    }

    if (!to_graph_file.empty()) {
        assert(vindex != nullptr);
        ifstream tgi(to_graph_file);
        VG graph(tgi);
        vector<vg::id_t> ids;
        graph.for_each_node([&](Node* n) { ids.push_back(n->id()); });
        vector<Alignment> output_buf;
        auto lambda = [&output_buf](const Alignment& aln) {
            output_buf.push_back(aln);
            stream::write_buffered(cout, output_buf, 100);
        };
        vindex->for_alignment_to_nodes(ids, lambda);
        stream::write_buffered(cout, output_buf, 0);
    }

    if (!xg_name.empty()) {
        if (!node_ids.empty() && path_name.empty() && !pairwise_distance) {
            // get the context of the node
            vector<Graph> graphs;
            set<vg::id_t> ids;
            for (auto node_id : node_ids) ids.insert(node_id);
            for (auto node_id : node_ids) {
                Graph g;
                xindex.neighborhood(node_id, context_size, g, !use_length);
                if (context_size == 0) {
                    for (auto& edge : xindex.edges_of(node_id)) {
                        // if both ends of the edge are in our targets, keep them
                        if (ids.count(edge.to()) && ids.count(edge.from())) {
                            *g.add_edge() = edge;
                        }
                    }
                }
                graphs.push_back(g);
            }
            VG result_graph;
            for (auto& graph : graphs) {
                // Allow duplicate nodes and edges (from e.g. multiple -n options); silently collapse them.
                result_graph.extend(graph);
            }
            result_graph.remove_orphan_edges();
            
            // Order the mappings by rank. TODO: how do we handle breaks between
            // different sections of a path with a single name?
            result_graph.paths.sort_by_mapping_rank();
            
            // return it
            result_graph.serialize_to_ostream(cout);
        } else if (end_id != 0) {
            for (auto& e : xindex.edges_on_end(end_id)) {
                cout << (e.from_start() ? -1 : 1) * e.from() << "\t" <<  (e.to_end() ? -1 : 1) * e.to() << endl;
            }
        } else if (start_id != 0) {
            for (auto& e : xindex.edges_on_start(start_id)) {
                cout << (e.from_start() ? -1 : 1) * e.from() << "\t" <<  (e.to_end() ? -1 : 1) * e.to() << endl;
            }
        }
        if (!node_ids.empty() && !path_name.empty() && !pairwise_distance && (position_in || rank_in)) {
            // Go get the positions of these nodes in this path
            
            if (xindex.path_rank(path_name) == 0) {
                // This path doesn't exist, and we'll get a segfault or worse if
                // we go look for positions in it.
                cerr << "[vg find] error, path \"" << path_name << "\" not found in index" << endl;
                exit(1);
            }
            
            // Note: this isn't at all consistent with -P option with rocksdb, which couts a range
            // and then mapping, but need this info right now for scripts/chunked_call
            for (auto node_id : node_ids) {
                cout << node_id;
                for (auto r : (position_in ? xindex.position_in_path(node_id, path_name)
                               : xindex.node_ranks_in_path(node_id, path_name))) {
                    cout << "\t" << r;
                }
                cout << endl;
            }
        }
        if (pairwise_distance) {
            if (node_ids.size() != 2) {
                cerr << "[vg find] error, exactly 2 nodes (-n) required with -D" << endl;
                exit(1);
            }
            cout << xindex.min_approx_path_distance(node_ids[0], node_ids[1]) << endl;
            return 0;
        }
        if (approx_id != 0) {
            cout << xindex.node_start(approx_id) << endl;
            return 0;
        }
        if (!targets.empty()) {
            Graph graph;
            for (auto& target : targets) {
                // Grab each target region
                string name;
                int64_t start, end;
                xg::parse_region(target, name, start, end);
                if(xindex.path_rank(name) == 0) {
                    // Passing a nonexistent path to get_path_range produces Undefined Behavior
                    cerr << "[vg find] error, path " << name << " not found in index" << endl;
                    exit(1);
                }
                // no coordinates given, we do whole thing (0,-1)
                if (start < 0 && end < 0) {
                    start = 0;
                }
                xindex.get_path_range(name, start, end, graph);
            }
            if (context_size > 0) {
                xindex.expand_context(graph, context_size, true, !use_length);
            }
            VG vgg; vgg.extend(graph); // removes dupes
            
            // Order the mappings by rank. TODO: how do we handle breaks between
            // different sections of a path with a single name?
            vgg.paths.sort_by_mapping_rank();
            
            vgg.serialize_to_ostream(cout);
        }
        if (!range.empty()) {
            Graph graph;
            int64_t id_start=0, id_end=0;
            vector<string> parts = split_delims(range, ":");
            if (parts.size() == 1) {
                cerr << "[vg find] error, format of range must be \"N:M\" where start id is N and end id is M, got " << range << endl;
                exit(1);
            }
            convert(parts.front(), id_start);
            convert(parts.back(), id_end);
            if (!use_length) {
                xindex.get_id_range(id_start, id_end, graph);
            } else {
                // treat id_end as length instead.
                xindex.get_id_range_by_length(id_start, id_end, graph, true);
            }
            if (context_size > 0) {
                xindex.expand_context(graph, context_size, true, !use_length);
            }
            VG vgg; vgg.extend(graph); // removes dupes
            vgg.remove_orphan_edges();
            vgg.serialize_to_ostream(cout);
        }
        if(!haplotype_alignments.empty()) {
            // What should we do with each alignment?
            function<void(Alignment&)> lambda = [&xindex](Alignment& aln) {
                // Count the matches to the path. The path might be empty, in
                // which case it will yield the biggest size_t you can have.
                size_t matches = xindex.count_matches(aln.path());

                // We do this single-threaded, at least for now, so we don't
                // need to worry about coordinating output, and we can just
                // spit out the counts as bare numbers.
                cout << matches << endl;
            };
            if (haplotype_alignments == "-") {
                stream::for_each(std::cin, lambda);
            } else {
                ifstream in;
                in.open(haplotype_alignments.c_str());
                if(!in.is_open()) {
                    cerr << "[vg find] error: could not open alignments file " << haplotype_alignments << endl;
                    exit(1);
                }
                stream::for_each(in, lambda);
            }

        }
        if (extract_threads) {
            size_t thread_number = 0;
            bool extract_reverse = false;
            map<string, list<xg::XG::thread_t> > threads;
            if (extract_patterns.empty()) {
                threads = xindex.extract_threads(extract_reverse);
            } else {
                for (auto& pattern : extract_patterns) {
                    for (auto& t : xindex.extract_threads_matching(pattern, extract_reverse)) {
                        threads[t.first] = t.second;
                    }
                }
            }
            for(auto t : threads) {
                // Convert to a Path
                auto& thread = *t.second.begin();
                auto& thread_name = t.first;
                Path path;
                for(xg::XG::ThreadMapping& m : thread) {
                    // Convert all the mappings
                    Mapping mapping;
                    mapping.mutable_position()->set_node_id(m.node_id);
                    mapping.mutable_position()->set_is_reverse(m.is_reverse);
                    
                    *(path.add_mapping()) = mapping;
                }

                // Get each thread's name
                path.set_name(thread_name);
                // Give each thread a name
                //path.set_name("_thread_" + to_string(thread_number++));

                // We need a Graph for serialization purposes. We do one chunk per
                // thread in case the threads are long.
                Graph g;
                *(g.add_path()) = path;

                // Dump the graph with its mappings. TODO: can we restrict these to
                // mappings to nodes we have already pulled out? Or pull out the
                // whole compressed graph?
                vector<Graph> gb = { g };
                stream::write_buffered(cout, gb, 0);
            }
        }
        if (!gam_file.empty()) {
            set<vg::id_t> nodes;
            function<void(Alignment&)> lambda = [&nodes](Alignment& aln) {
                // accumulate nodes matched by the path
                auto& path = aln.path();
                for (int i = 0; i < path.mapping_size(); ++i) {
                    nodes.insert(path.mapping(i).position().node_id());
                }
            };
            if (gam_file == "-") {
                stream::for_each(std::cin, lambda);
            } else {
                ifstream in;
                in.open(gam_file.c_str());
                if(!in.is_open()) {
                    cerr << "[vg find] error: could not open alignments file " << gam_file << endl;
                    exit(1);
                }
                stream::for_each(in, lambda);
            }
            // now we have the nodes to get
            Graph graph;
            for (auto& node : nodes) {
                *graph.add_node() = xindex.node(node);
            }
            xindex.expand_context(graph, max(1, context_size), true); // get connected edges
            VG vgg; vgg.extend(graph);
            vgg.serialize_to_ostream(cout);
        }
    } else if (!db_name.empty()) {
        if (!node_ids.empty() && path_name.empty()) {
            // get the context of the node
            vector<VG> graphs;
            for (auto node_id : node_ids) {
                VG g;
                vindex->get_context(node_id, g);
                if (context_size > 0) {
                    vindex->expand_context(g, context_size);
                }
                graphs.push_back(g);
            }
            VG result_graph;
            for (auto& graph : graphs) {
                // Allow duplicate nodes and edges (from e.g. multiple -n options); silently collapse them.
                result_graph.extend(graph);
            }
            result_graph.remove_orphan_edges();
            // return it
            result_graph.serialize_to_ostream(cout);
        } else if (end_id != 0) {
            vector<Edge> edges;
            vindex->get_edges_on_end(end_id, edges);
            for (vector<Edge>::iterator e = edges.begin(); e != edges.end(); ++e) {
                cout << (e->from_start() ? -1 : 1) * e->from() << "\t" <<  (e->to_end() ? -1 : 1) * e->to() << endl;
            }
        } else if (start_id != 0) {
            vector<Edge> edges;
            vindex->get_edges_on_start(start_id, edges);
            for (vector<Edge>::iterator e = edges.begin(); e != edges.end(); ++e) {
                cout << (e->from_start() ? -1 : 1) * e->from() << "\t" <<  (e->to_end() ? -1 : 1) * e->to() << endl;
            }
        }
        if (!node_ids.empty() && !path_name.empty()) {
            int64_t path_id = vindex->get_path_id(path_name);
            for (auto node_id : node_ids) {
                list<pair<int64_t, bool>> path_prev, path_next;
                int64_t prev_pos=0, next_pos=0;
                bool prev_backward, next_backward;
                if (vindex->get_node_path_relative_position(node_id, false, path_id,
                            path_prev, prev_pos, prev_backward,
                            path_next, next_pos, next_backward)) {

                    // Negate IDs for backward nodes
                    cout << node_id << "\t" << path_prev.front().first * (path_prev.front().second ? -1 : 1) << "\t" << prev_pos
                        << "\t" << path_next.back().first * (path_next.back().second ? -1 : 1) << "\t" << next_pos << "\t";

                    Mapping m = vindex->path_relative_mapping(node_id, false, path_id,
                            path_prev, prev_pos, prev_backward,
                            path_next, next_pos, next_backward);
                    cout << pb2json(m) << endl;
                }
            }
        }
        if (!targets.empty()) {
            VG graph;
            for (auto& target : targets) {
                string name;
                int64_t start, end;
                xg::parse_region(target, name, start, end);
                // end coordinate is exclusive for get_path()
                if (end >= 0) {
                    ++end;
                }
                vindex->get_path(graph, name, start, end);
            }
            if (context_size > 0) {
                vindex->expand_context(graph, context_size);
            }
            graph.remove_orphan_edges();
            graph.serialize_to_ostream(cout);
        }
        if (!range.empty()) {
            VG graph;
            int64_t id_start=0, id_end=0;
            vector<string> parts = split_delims(range, ":");
            if (parts.size() == 1) {
                cerr << "[vg find] error, format of range must be \"N:M\" where start id is N and end id is M, got " << range << endl;
                exit(1);
            }
            convert(parts.front(), id_start);
            convert(parts.back(), id_end);
            vindex->get_range(id_start, id_end, graph);
            if (context_size > 0) {
                vindex->expand_context(graph, context_size);
            }
            graph.remove_orphan_edges();
            graph.serialize_to_ostream(cout);
        }
    }

    // TODO: clean up the if/else logic to allow only one function

    if (!sequence.empty()) {
        if (gcsa_in.empty()) {
            if (get_mems) {
                cerr << "error:[vg find] a GCSA index must be passed to get MEMs" << endl;
                return 1;
            }
            set<int> kmer_sizes = vindex->stored_kmer_sizes();
            if (kmer_sizes.empty()) {
                cerr << "error:[vg find] index does not include kmers, add with vg index -k" << endl;
                return 1;
            }
            if (kmer_size == 0) {
                kmer_size = *kmer_sizes.begin();
            }
            for (size_t i = 0; i + kmer_size <= sequence.size(); i += kmer_stride) {
                kmers.push_back(sequence.substr(i,kmer_size));
            }
        } else {
            // let's use the GCSA index

            // Configure GCSA2 verbosity so it doesn't spit out loads of extra info
            gcsa::Verbosity::set(gcsa::Verbosity::SILENT);
            
            // Configure its temp directory to the system temp directory
            gcsa::TempFile::setDirectory(find_temp_dir());

            // Open it
            ifstream in_gcsa(gcsa_in.c_str());
            gcsa::GCSA gcsa_index;
            gcsa_index.load(in_gcsa);
            gcsa::LCPArray lcp_index;
            // default LCP is the gcsa base name +.lcp
            string lcp_in = gcsa_in + ".lcp";
            ifstream in_lcp(lcp_in.c_str());
            lcp_index.load(in_lcp);
            //range_type find(const char* pattern, size_type length) const;
            //void locate(size_type path, std::vector<node_type>& results, bool append = false, bool sort = true) const;
            //locate(i, results);
            if (!get_mems) {
                auto paths = gcsa_index.find(sequence.c_str(), sequence.length());
                //cerr << paths.first << " - " << paths.second << endl;
                for (gcsa::size_type i = paths.first; i <= paths.second; ++i) {
                    std::vector<gcsa::node_type> ids;
                    gcsa_index.locate(i, ids);
                    for (auto id : ids) {
                        cout << gcsa::Node::decode(id) << endl;
                    }
                }
            } else {
                // for mems we need to load up the gcsa and lcp structures into the mapper
                Mapper mapper(&xindex, &gcsa_index, &lcp_index);
                mapper.fast_reseed = use_fast_reseed;
                // get the mems
                double lcp_max, fraction_filtered;
                auto mems = mapper.find_mems_deep(sequence.begin(), sequence.end(), lcp_max, fraction_filtered, max_mem_length, min_mem_length, mem_reseed_length);

                // dump them to stdout
                cout << mems_to_json(mems) << endl;

            }
        }
    }

    if (!kmers.empty()) {
        if (count_kmers) {
            for (auto& kmer : kmers) {
                cout << kmer << "\t" << vindex->approx_size_of_kmer_matches(kmer) << endl;
            }
        } else if (kmer_table) {
            for (auto& kmer : kmers) {
                map<string, vector<pair<int64_t, int32_t> > > positions;
                vindex->get_kmer_positions(kmer, positions);
                for (auto& k : positions) {
                    for (auto& p : k.second) {
                        cout << k.first << "\t" << p.first << "\t" << p.second << endl;
                    }
                }
            }
        } else {
            vector<VG> graphs;
            for (auto& kmer : kmers) {
                VG g;
                vindex->get_kmer_subgraph(kmer, g);
                if (context_size > 0) {
                    vindex->expand_context(g, context_size);
                }
                graphs.push_back(g);
            }

            VG result_graph;
            for (auto& graph : graphs) {
                // Allow duplicate nodes and edges (from multiple kmers); silently collapse them.
                result_graph.extend(graph);
            }
            result_graph.remove_orphan_edges();
            result_graph.serialize_to_ostream(cout);
        }
    }

    if (vindex) delete vindex;

    return 0;

}
Example No. 9
int main_xg(int argc, char** argv) {

    if (argc == 2) {
        help_xg(argv);
        return 1;
    }

    string vg_in;
    string vg_out;
    string out_name;
    string in_name;
    int64_t node_id = 0;
    bool edges_from = false;
    bool edges_to = false;
    bool edges_of = false;
    bool edges_on_start = false;
    bool edges_on_end = false;
    bool node_sequence = false;
    string pos_for_char;
    string pos_for_substr;
    int context_steps = 0;
    bool node_context = false;
    string target;
    bool print_graph = false;
    bool text_output = false;
    bool validate_graph = false;
    bool extract_threads = false;
    bool store_threads = false;
    bool is_sorted_dag = false;
    string report_name;
    string b_array_name;
    
    int c;
    optind = 2; // force optind past "xg" positional argument
    while (true) {
        static struct option long_options[] =
            {
                {"help", no_argument, 0, 'h'},
                {"vg", required_argument, 0, 'v'},
                {"out", required_argument, 0, 'o'},
                {"in", required_argument, 0, 'i'},
                {"extract-vg", required_argument, 0, 'X'},
                {"node", required_argument, 0, 'n'},
                {"char", required_argument, 0, 'P'},
                {"substr", required_argument, 0, 'F'},
                //{"range", required_argument, 0, 'r'},
                {"context", required_argument, 0, 'c'},
                {"edges-from", required_argument, 0, 'f'},
                {"edges-to", required_argument, 0, 't'},
                {"edges-of", required_argument, 0, 'O'},
                {"edges-on-start", required_argument, 0, 'S'},
                {"edges-on-end", required_argument, 0, 'E'},
                {"node-seq", required_argument, 0, 's'},
                {"path", required_argument, 0, 'p'},
                {"extract-threads", no_argument, 0, 'x'},
                {"store-threads", no_argument, 0, 'r'},
                {"is-sorted-dag", no_argument, 0, 'd'},
                {"report", required_argument, 0, 'R'},
                {"debug", no_argument, 0, 'D'},
                {"text-output", no_argument, 0, 'T'},
                {"validate", no_argument, 0, 'V'},
                {"dump-bs", required_argument, 0, 'b'},
                {0, 0, 0, 0}
            };

        int option_index = 0;
        c = getopt_long (argc, argv, "hv:o:i:X:f:t:s:c:n:p:DxrdTO:S:E:VR:P:F:b:",
                         long_options, &option_index);

        // Detect the end of the options.
        if (c == -1)
            break;

        switch (c)
        {

        case 'v':
            vg_in = optarg;
            break;

        case 'V':
            validate_graph = true;
            break;

        case 'o':
            out_name = optarg;
            break;

        case 'D':
            print_graph = true;
            break;

        case 'T':
            text_output = true;
            break;
            
        case 'x':
            extract_threads = true;
            break;
            
        case 'r':
            store_threads = true;
            break;
            
        case 'd':
            is_sorted_dag = true;
            break;

        case 'i':
            in_name = optarg;
            break;

        case 'X':
            vg_out = optarg;
            break;

        case 'n':
            node_id = parse<int64_t>(optarg);
            node_context = true;
            break;

        case 'c':
            context_steps = parse<int>(optarg);
            break;

        case 'f':
            node_id = parse<int64_t>(optarg);
            edges_from = true;
            break;
            
        case 't':
            node_id = parse<int64_t>(optarg);
            edges_to = true;
            break;

        case 'O':
            node_id = parse<int64_t>(optarg);
            edges_of = true;
            break;

        case 'S':
            node_id = parse<int64_t>(optarg);
            edges_on_start = true;
            break;

        case 'E':
            node_id = parse<int64_t>(optarg);
            edges_on_end = true;
            break;

        case 's':
            node_id = parse<int64_t>(optarg);
            node_sequence = true;
            break;

        case 'p':
            target = optarg;
            break;

        case 'P':
            pos_for_char = optarg;
            break;
            
        case 'F':
            pos_for_substr = optarg;
            break;
            
        case 'R':
            report_name = optarg;
            break;
            
        case 'b':
            b_array_name = optarg;
            break;
            
        case 'h':
        case '?':
            help_xg(argv);
            exit(1);
            break;

        default:
            abort ();
        }
    }

    unique_ptr<XG> graph;
    //string file_name = argv[optind];
    if (in_name.empty()) assert(!vg_in.empty());
    if (vg_in == "-") {
        // Read VG from stdin
        graph = unique_ptr<XG>(new XG());
        graph->from_stream(std::cin, validate_graph, print_graph, store_threads, is_sorted_dag);
    } else if (vg_in.size()) {
        // Read VG from a file
        ifstream in;
        in.open(vg_in.c_str());
        graph = unique_ptr<XG>(new XG());
        graph->from_stream(in, validate_graph, print_graph, store_threads, is_sorted_dag);
    }

    if (in_name.size()) {
        get_input_file(in_name, [&](istream& in) {
            // Load from an XG file or - (stdin)
            graph = stream::VPKG::load_one<XG>(in);
        });
    }

    // Prepare structure tree for serialization
    unique_ptr<sdsl::structure_tree_node> structure;
    
    if (!report_name.empty()) {
        // We need to make a report, so we need the structure. Make a real tree
        // node. The unique_ptr handles deleting.
        structure = unique_ptr<sdsl::structure_tree_node>(new sdsl::structure_tree_node("name", "type"));
    }

    if(!vg_out.empty()) {
        if (graph.get() == nullptr) {
            cerr << "error [vg xg] no xg graph exists to convert; Try: vg xg -i graph.xg -X graph.vg" << endl;
            return 1;
        }
        
        VG converted;
        // Convert the xg graph to vg format
        convert_handle_graph(graph.get(), &converted);
        
        // TODO: The converter doesn't copy circular paths yet.
        // When it does, we can remove all this path copying code.

        // Make a raw Proto Graph to hold Path objects
        Graph path_graph;

        // Since paths are not copied, copy the paths.
        for (size_t rank = 1; rank <= graph->max_path_rank(); rank++) {
            // Extract each path into the path graph
            *path_graph.add_path() = graph->path(graph->path_name(rank));
        }

        // Merge in all the paths
        converted.extend(path_graph);
        
        if (vg_out == "-") {
            converted.serialize_to_ostream(std::cout);
        } else {
            converted.serialize_to_file(vg_out);
        }
    }

    if (!out_name.empty()) {
        // Open a destination file if it is a file we want to write to
        ofstream out_file;
        if (out_name != "-") {
            out_file.open(out_name);
        }
        // Work out where to save to
        ostream& out = (out_name == "-") ? std::cout : out_file;
        
        // Encapsulate output in VPKG
        stream::VPKG::with_save_stream(out, "XG", [&](ostream& tagged) {
            // Serialize to the file while recording space usage to the structure.
            graph->serialize(tagged, structure.get(), "xg");
        });
        
        out.flush();
    }

    if (!report_name.empty()) {
        // Save the report
        ofstream out;
        out.open(report_name.c_str());
        sdsl::write_structure_tree<HTML_FORMAT>(structure.get(), out, 0);
    }

    // queries
    if (node_sequence) {
        cout << node_id << ": " << graph->node_sequence(node_id) << endl;
    }
    if (!pos_for_char.empty()) {
        // extract the position from the string
        int64_t id;
        bool is_rev;
        size_t off;
        extract_pos(pos_for_char, id, is_rev, off);
        // then pick it up from the graph
        cout << graph->pos_char(id, is_rev, off) << endl;
    }
    if (!pos_for_substr.empty()) {
        int64_t id;
        bool is_rev;
        size_t off;
        size_t len;
        extract_pos_substr(pos_for_substr, id, is_rev, off, len);
        cout << graph->pos_substr(id, is_rev, off, len) << endl;
    }
    
    if (edges_from) {
        vector<Edge> edges = graph->edges_from(node_id);
        for (auto& edge : edges) {
            cout << edge.from() << (edge.from_start()?"-":"+")
                 << " -> " << edge.to() << (edge.to_end()?"-":"+") << endl;
        }
    }
    if (edges_to) {
        vector<Edge> edges = graph->edges_to(node_id);
        for (auto& edge : edges) {
            cout << edge.from() << (edge.from_start()?"-":"+")
                 << " -> " << edge.to() << (edge.to_end()?"-":"+") << endl;
        }
    }
    if (edges_of) {
        vector<Edge> edges = graph->edges_of(node_id);
        for (auto& edge : edges) {
            cout << edge.from() << (edge.from_start()?"-":"+")
                 << " -> " << edge.to() << (edge.to_end()?"-":"+") << endl;
        }
    }
    if (edges_on_start) {
        vector<Edge> edges = graph->edges_on_start(node_id);
        for (auto& edge : edges) {
            cout << edge.from() << (edge.from_start()?"-":"+")
                 << " -> " << edge.to() << (edge.to_end()?"-":"+") << endl;
        }
    }
    if (edges_on_end) {
        vector<Edge> edges = graph->edges_on_end(node_id);
        for (auto& edge : edges) {
            cout << edge.from() << (edge.from_start()?"-":"+")
                 << " -> " << edge.to() << (edge.to_end()?"-":"+") << endl;
        }
    }

    if (node_context) {
        Graph g;
        graph->neighborhood(node_id, context_steps, g);
        if (text_output) {
            to_text(cout, g);
        } else {
            vector<Graph> gb = { g };
            stream::write_buffered(cout, gb, 0);
        }
    }

    if (!target.empty()) {
        string name;
        int64_t start, end;
        Graph g;
        parse_region(target, name, start, end);
        graph->get_path_range(name, start, end, g);
        graph->expand_context(g, context_steps);
        if (text_output) {
            to_text(cout, g);
        } else {
            vector<Graph> gb = { g };
            stream::write_buffered(cout, gb, 0);
        }
    }
    
    if (extract_threads) {
        list<XG::thread_t> threads;
        for (auto& p : graph->extract_threads(false)) {
            for (auto& t : p.second) {
                threads.push_back(t);
            }
        }
        for (auto& p : graph->extract_threads(true)) {
            for (auto& t : p.second) {
                threads.push_back(t);
            }
        }

        size_t thread_number = 0;
        for(XG::thread_t& thread : threads) {
            // Convert to a Path
            Path path;
            for(XG::ThreadMapping& m : thread) {
                // Convert all the mappings
                Mapping mapping;
                mapping.mutable_position()->set_node_id(m.node_id);
                mapping.mutable_position()->set_is_reverse(m.is_reverse);
                
                *(path.add_mapping()) = mapping;
            }
        
        
            // Give each thread a name
            path.set_name("_thread_" + to_string(thread_number++));
            
            // We need a Graph for serialization purposes. We do one chunk per
            // thread in case the threads are long.
            Graph g;
            
            *(g.add_path()) = path;
            
            // Dump the graph with its mappings. TODO: can we restrict these to
            // mappings to nodes we have already pulled out? Or pull out the
            // whole compressed graph?
            if (text_output) {
                to_text(cout, g);
            } else {
                vector<Graph> gb = { g };
                stream::write_buffered(cout, gb, 0);
            }
            
        }
    }

    if (!b_array_name.empty()) {
        // Dump B array
        ofstream out;
        out.open(b_array_name.c_str());
        graph->bs_dump(out);
    }

    return 0;
}
Example No. 10
PathIndex::PathIndex(const list<Mapping>& mappings, VG& vg) {
    // Trace the given path in the given VG graph, collecting sequence
    
    // We're going to build the sequence string
    std::stringstream seq_stream;
    
    // What base are we at in the path?
    size_t path_base = 0;
    
    // What was the last rank? Ranks must always go up.
    int64_t last_rank = -1;
    
    for (auto& mapping : mappings) {
    
        if (!by_id.count(mapping.position().node_id())) {
            // This is the first time we have visited this node in the path.
            
            // Add in a mapping.
            by_id[mapping.position().node_id()] = 
                std::make_pair(path_base, mapping.position().is_reverse());
#ifdef debug
            #pragma omp critical (cerr)
            std::cerr << "Node " << mapping.position().node_id() << " rank " << mapping.rank()
                << " starts at base " << path_base << " with "
                << vg.get_node(mapping.position().node_id())->sequence() << std::endl;
#endif
            
            // Make sure ranks are monotonically increasing along the path, or
            // unset.
            assert(mapping.rank() > last_rank || (mapping.rank() == 0 && last_rank == 0));
            last_rank = mapping.rank();
        }
        
        // Say that this node appears here along the reference in this
        // orientation.
        by_start[path_base] = NodeSide(mapping.position().node_id(), mapping.position().is_reverse());
    
        // Remember that occurrence by node ID.
        node_occurrences[mapping.position().node_id()].push_back(by_start.find(path_base));
    
        // Say this Mapping happens at this base along the path
        mapping_positions[&mapping] = path_base;
    
        // Find the node's sequence
        std::string node_sequence = vg.get_node(mapping.position().node_id())->sequence();
    
        while(path_base == 0 && node_sequence.size() > 0 &&
            (node_sequence[0] != 'A' && node_sequence[0] != 'T' && node_sequence[0] != 'C' &&
            node_sequence[0] != 'G' && node_sequence[0] != 'N')) {
            
            // If the path leads with invalid characters (like "X"), throw them
            // out when computing path positions.
            
            // TODO: this is a hack to deal with the debruijn-brca1-k63 graph,
            // which leads with an X.
            #pragma omp critical (cerr)
            std::cerr << "Warning: dropping invalid leading character "
                << node_sequence[0] << " from node " << mapping.position().node_id()
                << std::endl;
                
            node_sequence.erase(node_sequence.begin());
        }
        
        if (mapping.position().is_reverse()) {
            // Put the reverse sequence in the path
            seq_stream << reverse_complement(node_sequence);
        } else {
            // Put the forward sequence in the path
            seq_stream << node_sequence;
        }
        
        // Whether we found the right place for this node in the reference or
        // not, we still need to advance along the reference path. We assume the
        // whole node (except any leading bogus characters) is included in the
        // path (since it sort of has to be, syntactically, unless it's the
        // first or last node).
        path_base += node_sequence.size();
        
        // TODO: handle leading bogus characters in calls on the first node.
    }
    
    // Record the length of the last mapping's node, since there's no next mapping to work it out from
    last_node_length = mappings.empty() ?
        0 : 
        vg.get_node(mappings.back().position().node_id())->sequence().size();
    
    // Create the actual reference sequence we will use
    sequence = seq_stream.str();
    
#ifdef debug
    // Announce progress.
    #pragma omp critical (cerr)
    std::cerr << "Traced " << path_base << " bp path." << std::endl;
    
    if (sequence.size() < 100) {
        #pragma omp critical (cerr)
        std::cerr << "Sequence: " << sequence << std::endl;
    }
#endif

    // Follow the path (again) and place all its Mappings
    
}
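A usage sketch (illustrative, not from the original source): given the mappings of a path and the VG graph it lives in (however they were obtained), the index maps node IDs to their offset and orientation along the path via the members filled in by the constructor above; mappings, vg, and node_id are assumed to be supplied by the caller.

// Sketch only: build an index and look up where one node falls on the path.
PathIndex index(mappings, vg);
auto found = index.by_id.find(node_id);
if (found != index.by_id.end()) {
    size_t offset = found->second.first;    // base offset along the path
    bool is_reverse = found->second.second; // orientation on the path
    std::cout << "node " << node_id << " starts at path base " << offset
              << (is_reverse ? " (reverse)" : " (forward)") << std::endl;
}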
Example No. 11
TEST(inequality, Point)
{
    CHECK(Point(1, 2) != Point(3, 4));
}
Example No. 12
TEST(equality, Point)
{
    CHECK_EQUAL(Point(1, 2), Point(1, 2));
}
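The two tests above assume Point defines equality and inequality; a minimal sketch of the operators they rely on (illustrative, not the original definition):

// Sketch only: the comparison operators CHECK and CHECK_EQUAL need.
struct Point {
    int x, y;
    Point(int x, int y) : x(x), y(y) {}
};

inline bool operator==(const Point& a, const Point& b) {
    return a.x == b.x && a.y == b.y;
}

inline bool operator!=(const Point& a, const Point& b) {
    return !(a == b);
}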
Example No. 13
int main_validate(int argc, char** argv) {

    if (argc <= 2) {
        help_validate(argv);
        return 1;
    }

    bool check_nodes = false;
    bool check_edges = false;
    bool check_orphans = false;
    bool check_paths = false;

    int c;
    optind = 2; // force optind past command positional argument
    while (true) {
        static struct option long_options[] =
        {
            {"help", no_argument, 0, 'h'},
            {"nodes", no_argument, 0, 'n'},
            {"edges", no_argument, 0, 'e'},
            {"paths", no_argument, 0, 'o'},
            {"orphans", no_argument, 0, 'p'},
            {0, 0, 0, 0}
        };

        int option_index = 0;
        c = getopt_long (argc, argv, "hneop",
                long_options, &option_index);

        // Detect the end of the options.
        if (c == -1)
            break;

        switch (c)
        {

            case 'n':
                check_nodes = true;
                break;

            case 'e':
                check_edges = true;
                break;

            case 'o':
                check_orphans = true;
                break;

            case 'p':
                check_paths = true;
                break;

            case 'h':
            case '?':
                help_validate(argv);
                exit(1);
                break;

            default:
                abort ();
        }
    }

    VG* graph;
    get_input_file(optind, argc, argv, [&](istream& in) {
        graph = new VG(in);
    });

    // if we chose a specific subset, do just them
    if (check_nodes || check_edges || check_orphans || check_paths) {
        if (graph->is_valid(check_nodes, check_edges, check_orphans, check_paths)) {
            return 0;
        } else {
            return 1;
        }
        // otherwise do everything
    } else if (graph->is_valid()) {
        return 0;
    } else {
        return 1;
    }
}
Ejemplo n.º 14
0
int main_mod(int argc, char** argv) {

    if (argc == 2) {
        help_mod(argv);
        return 1;
    }

    string path_name;
    bool remove_orphans = false;
    string aln_file;
    string loci_file;
    bool called_genotypes_only = false;
    bool label_paths = false;
    bool compact_ids = false;
    bool prune_complex = false;
    int path_length = 0;
    int edge_max = 0;
    int chop_to = 0;
    bool add_start_and_end_markers = false;
    bool prune_subgraphs = false;
    bool kill_labels = false;
    bool simplify_graph = false;
    bool unchop = false;
    bool normalize_graph = false;
    bool remove_non_path = false;
    bool remove_path = false;
    bool compact_ranks = false;
    bool drop_paths = false;
    set<string> paths_to_retain;
    bool retain_complement = false;
    vector<int64_t> root_nodes;
    int32_t context_steps = 0;
    bool remove_null = false;
    bool strong_connect = false;
    uint32_t unfold_to = 0;
    bool break_cycles = false;
    uint32_t dagify_steps = 0;
    uint32_t dagify_to = 0;
    uint32_t dagify_component_length_max = 0;
    bool orient_forward = false;
    int64_t destroy_node_id = 0;
    bool bluntify = false;
    int until_normal_iter = 0;
    string translation_file;
    bool flip_doubly_reversed_edges = false;
    bool cactus = false;
    string vcf_filename;
    string loci_filename;
    int max_degree = 0;

    int c;
    optind = 2; // force optind past command positional argument
    while (true) {
        static struct option long_options[] =
        {
            {"help", no_argument, 0, 'h'},
            {"include-aln", required_argument, 0, 'i'},
            {"include-loci", required_argument, 0, 'q'},
            {"include-gt", required_argument, 0, 'Q'},
            {"compact-ids", no_argument, 0, 'c'},
            {"compact-ranks", no_argument, 0, 'C'},
            {"drop-paths", no_argument, 0, 'D'},
            {"keep-path", required_argument, 0, 'k'},
            {"remove-orphans", no_argument, 0, 'o'},
            {"prune-complex", no_argument, 0, 'p'},
            {"prune-subgraphs", no_argument, 0, 'S'},
            {"length", required_argument, 0, 'l'},
            {"edge-max", required_argument, 0, 'e'},
            {"chop", required_argument, 0, 'X'},
            {"kill-labels", no_argument, 0, 'K'},
            {"markers", no_argument, 0, 'm'},
            {"threads", no_argument, 0, 't'},
            {"label-paths", no_argument, 0, 'P'},
            {"simplify", no_argument, 0, 's'},
            {"unchop", no_argument, 0, 'u'},
            {"normalize", no_argument, 0, 'n'},
            {"until-normal", required_argument, 0, 'U'},
            {"remove-non-path", no_argument, 0, 'N'},
            {"remove-path", no_argument, 0, 'A'},
            {"orient-forward", no_argument, 0, 'O'},
            {"unfold", required_argument, 0, 'f'},
            {"retain-path", required_argument, 0, 'r'},
            {"retain-complement", no_argument, 0, 'I'},
            {"subgraph", required_argument, 0, 'g'},
            {"context", required_argument, 0, 'x'},
            {"remove-null", no_argument, 0, 'R'},
            {"strong-connect", no_argument, 0, 'T'},
            {"dagify-steps", required_argument, 0, 'd'},
            {"dagify-to", required_argument, 0, 'w'},
            {"dagify-len-max", required_argument, 0, 'L'},
            {"bluntify", no_argument, 0, 'B'},
            {"break-cycles", no_argument, 0, 'b'},
            {"orient-forward", no_argument, 0, 'O'},
            {"destroy-node", required_argument, 0, 'y'},
            {"translation", required_argument, 0, 'Z'},
            {"unreverse-edges", required_argument, 0, 'E'},
            {"cactus", no_argument, 0, 'a'},
            {"sample-vcf", required_argument, 0, 'v'},
            {"sample-graph", required_argument, 0, 'G'},
            {"max-degree", required_argument, 0, 'M'},
            {0, 0, 0, 0}
        };

        int option_index = 0;
        c = getopt_long (argc, argv, "hk:oi:q:Q:cpl:e:mt:SX:KPsunzNAf:CDr:Ig:x:RTU:Bbd:Ow:L:y:Z:Eav:G:M:",
                long_options, &option_index);


        // Detect the end of the options.
        if (c == -1)
            break;

        switch (c)
        {

        case 'i':
            cerr << "[vg mod] warning: vg mod -i is deprecated and will soon be removed.  please switch to vg augment" << endl;
            aln_file = optarg;
            break;

        case 'q':
            cerr << "[vg mod] warning: vg mod -q is deprecated and will soon be removed.  please switch to vg augment -l" << endl;
            loci_file = optarg;
            break;

        case 'Q':
            cerr << "[vg mod] warning: vg mod -l is deprecated and will soon be removed.  please switch to vg augment -L" << endl;
            loci_file = optarg;
            called_genotypes_only = true;
            break;

        case 'Z':
            cerr << "[vg mod] warning: vg mod -Z is deprecated and will soon be removed.  please switch to vg augment -Z" << endl;
            translation_file = optarg;
            break;

        case 'c':
            compact_ids = true;
            break;

        case 'C':
            compact_ranks = true;
            break;

        case 'k':
            path_name = optarg;
            break;

        case 'r':
            paths_to_retain.insert(optarg);
            break;
            
        case 'I':
            retain_complement = true;
            break;

        case 'o':
            remove_orphans = true;
            break;

        case 'p':
            prune_complex = true;
            break;

        case 'S':
            prune_subgraphs = true;
            break;

        case 'l':
            path_length = parse<int>(optarg);
            break;

        case 'X':
            chop_to = parse<int>(optarg);
            break;

        case 'u':
            unchop = true;
            break;

        case 'E':
            flip_doubly_reversed_edges = true;
            break;

        case 'K':
            kill_labels = true;
            break;

        case 'e':
            edge_max = parse<int>(optarg);
            break;

        case 'm':
            add_start_and_end_markers = true;
            break;

        case 't':
            omp_set_num_threads(parse<int>(optarg));
            break;

        case 'f':
            unfold_to = parse<int>(optarg);
            break;

        case 'O':
            orient_forward = true;
            break;

        case 'P':
            cerr << "[vg mod] warning: vg mod -P is deprecated and will soon be removed.  please switch to vg augment -B" << endl;
            label_paths = true;
            break;

        case 'D':
            drop_paths = true;
            break;

        case 's':
            simplify_graph = true;
            break;

        case 'n':
            normalize_graph = true;
            break;

        case 'N':
            remove_non_path = true;
            break;
            
        case 'A':
            remove_path = true;
            break;

        case 'T':
            strong_connect = true;
            break;

        case 'U':
            until_normal_iter = parse<int>(optarg);
            break;

        case 'd':
            dagify_steps = parse<int>(optarg);
            break;

        case 'w':
            dagify_to = parse<int>(optarg);
            break;

        case 'L':
            dagify_component_length_max = parse<int>(optarg);
            break;

        case 'B':
            bluntify = true;
            break;

        case 'b':
            break_cycles = true;
            break;

        case 'g':
            root_nodes.push_back(parse<int>(optarg));
            break;

        case 'x':
            context_steps = parse<int>(optarg);
            break;

        case 'R':
            remove_null = true;
            break;

        case 'y':
            destroy_node_id = parse<int>(optarg);
            break;

        case 'a':
            cactus = true;
            break;

        case 'v':
            vcf_filename = optarg;
            break;
            
        case 'G':
            loci_filename = optarg;
            break;

        case 'M':
            max_degree = parse<int>(optarg);
            break;

        case 'h':
        case '?':
            help_mod(argv);
            exit(1);
            break;

        default:
            abort ();
        }
    }

    VG* graph;
    get_input_file(optind, argc, argv, [&](istream& in) {
        graph = new VG(in);
    });
    
    if (retain_complement) {
        // Compute the actual paths to retain
        set<string> complement;
        graph->paths.for_each_name([&](const string& name) {
            if (!paths_to_retain.count(name)) {
                // Complement the set the user specified by putting in all the
                // paths they didn't mention.
                complement.insert(name);
            }
        });
        
        // Retain the complement of what we were asking for.
        paths_to_retain = complement;
    }

    if (!vcf_filename.empty()) {
        // We need to throw out the parts of the graph that are on alt paths,
        // but not on alt paths for alts used by the first sample in the VCF.

        // This is called with the entire path name string to detect alt
        // paths.
        const function<bool(const string&)>& is_alt = Paths::is_alt;

        // This holds the VCF file we read the variants from. It needs to be the
        // same one used to construct the graph.
        vcflib::VariantCallFile variant_file;
        variant_file.open(vcf_filename);
        if (!variant_file.is_open()) {
            cerr << "error:[vg mod] could not open" << vcf_filename << endl;
            return 1;
        }

        // Now go through the variants and prune the graph accordingly.

        // How many phases are there?
        size_t num_samples = variant_file.sampleNames.size();
        // TODO: we can only handle single-sample VCFs
        assert(num_samples == 1);

        // This will hold the IDs of all nodes visited by alt paths that aren't used.
        set<vg::id_t> alt_path_ids;

        graph->paths.for_each_name([&](const string& alt_path_name) {
            // For every path name in the graph

            if(is_alt(alt_path_name)) {
                // If it's an alt path

                for(auto& mapping : graph->paths.get_path(alt_path_name)) {
                    // Mark all nodes that are part of it as on alt paths
                    alt_path_ids.insert(mapping.node_id());
                }

            }
        });

        // We also have a function to handle each variant as it comes in.
        auto handle_variant = [&](vcflib::Variant& variant) {
            // So we have a variant

            if(variant.alleles.size() < 2) {
                // Skip non-variable variants.
                return;
            }

            // Grab its id, or make one by hashing stuff if it doesn't
            // have an ID.
            string var_name = make_variant_id(variant);

            if(!graph->paths.has_path("_alt_" + var_name + "_0")) {
                // There isn't a reference alt path for this variant. Someone messed up.
                cerr << variant << endl;
                throw runtime_error("Reference alt for " + var_name + " not in graph!");
            }

            // For now always work on sample 0. TODO: let the user specify a
            // name and find it.
            int sample_number = 0;

            // What sample is it?
            string& sample_name = variant_file.sampleNames[sample_number];

            // Parse it out and see if it's phased.
            string genotype = variant.getGenotype(sample_name);

            // Tokenize into allele numbers
            // The token iterator can't hold the regex
            regex allele_separator("[|/]");
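            // (std::sregex_token_iterator stores only a pointer to the regex, so the
            // separator must be a named object that outlives the loop below; passing a
            // temporary regex here would leave the iterator dangling.)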
            for (sregex_token_iterator it(genotype.begin(), genotype.end(), allele_separator, -1);
                it != sregex_token_iterator(); ++it) {
                // For every token separated by / or |
                int allele_number;
                if(it->str() == ".") {
                    // Unknown; pretend it's ref for the purposes of making a
                    // sample graph.
                    allele_number = 0;
                } else {
                    // Parse the allele number
                    allele_number = stoi(it->str());
                }

                // Make the name for its alt path
                string alt_path_name = "_alt_" + var_name + "_" + to_string(allele_number);

                for(auto& mapping : graph->paths.get_path(alt_path_name)) {
                    // Un-mark all nodes that are on this alt path, since it is used by the sample.
                    alt_path_ids.erase(mapping.node_id());
                }
            }

        };


        // Allocate a place to store actual variants
        vcflib::Variant var(variant_file);

        while (variant_file.is_open() && variant_file.getNextVariant(var)) {
            // Check that the ref and alt alleles are pure DNA; calls against N, for example, are skipped.
            bool isDNA = allATGC(var.ref);
            for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
                if (!allATGC(*a)) isDNA = false;
            }
            // only work with DNA sequences
            if (!isDNA) {
                continue;
            }

            // Handle the variant
            handle_variant(var);
        }


        for(auto& node_id : alt_path_ids) {
            // And delete all the nodes that were used by alt paths that weren't
            // in the genotype of the first sample.

            for(auto& path_name : graph->paths.of_node(node_id)) {
                // For every path that touches the node we're destroying,
                // destroy the path. We can't leave it because it won't be the
                // same path without this node.
                graph->paths.remove_path(path_name);
#ifdef debug
                cerr << "Node " << node_id << " was on path " << path_name << endl;
#endif
            }

            // Actually get rid of the node once its paths are gone.
            graph->destroy_node(node_id);
        }

    }
    
    if (!loci_filename.empty()) {
        // Open the file
        ifstream loci_file(loci_filename);
        assert(loci_file.is_open());
    
        // What nodes and edges are called as present by the loci?
        set<Node*> called_nodes;
        set<Edge*> called_edges;
    
        function<void(Locus&)> lambda = [&](Locus& locus) {
            // For each locus
            
            if (locus.genotype_size() == 0) {
                // No call made here. Just remove all the nodes/edges. TODO:
                // should we keep them all if we don't know if they're there or
                // not? Or should the caller call ref with some low confidence?
                return;
            }
            
            const Genotype& gt = locus.genotype(0);
            
            for (size_t j = 0; j < gt.allele_size(); j++) {
                // For every allele called as present
                int allele_number = gt.allele(j);
                const Path& allele = locus.allele(allele_number);
                
                for (size_t i = 0; i < allele.mapping_size(); i++) {
                    // For every Mapping in the allele
                    const Mapping& m = allele.mapping(i);
                    
                    // Remember to keep this node
                    called_nodes.insert(graph->get_node(m.position().node_id()));
                    
                    if (i + 1 < allele.mapping_size()) {
                        // Look at the next mapping, which exists
                        const Mapping& m2 = allele.mapping(i + 1);
                        
                        // Find the edge from the last Mapping's node to this one and mark it as used
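                        // (Assumption about NodeSide's convention: a true second argument
                        // selects the node's end side, so !is_reverse is the side this
                        // mapping exits and is_reverse is the side the next mapping enters.)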
                        called_edges.insert(graph->get_edge(NodeSide(m.position().node_id(), !m.position().is_reverse()),
                            NodeSide(m2.position().node_id(), m2.position().is_reverse())));
                    }
                }
            }
        };
        vg::io::for_each(loci_file, lambda);
        
        // Collect all the unused nodes and edges (so we don't try to delete
        // while iterating...)
        set<Node*> unused_nodes;
        set<Edge*> unused_edges;
        
        graph->for_each_node([&](Node* n) {
            if (!called_nodes.count(n)) {
                unused_nodes.insert(n);
            }
        });
        
        graph->for_each_edge([&](Edge* e) {
            if (!called_edges.count(e)) {
                unused_edges.insert(e);
            }
        });
        
        // Destroy all the extra edges (in case they use extra nodes)
        for (auto* e : unused_edges) {
            graph->destroy_edge(e);
        }
        
        for (auto* n : unused_nodes) {
            graph->destroy_node(n);
        }
    }

    if (bluntify) {
        graph->bluntify();
    }

    if (!path_name.empty()) {
        graph->keep_path(path_name);
    }

    if (!paths_to_retain.empty() || retain_complement) {
        graph->paths.keep_paths(paths_to_retain);
    }

    if (drop_paths) {
        graph->paths.clear();
    }

    if (remove_orphans) {
        graph->remove_orphan_edges();
    }

    if (unchop) {
        graph->unchop();
    }

    if (simplify_graph) {
        graph->simplify_siblings();
    }

    if (normalize_graph) {
        graph->normalize();
    }

    if (until_normal_iter) {
        graph->normalize(until_normal_iter);
    }

    if (strong_connect) {
        graph->keep_multinode_strongly_connected_components();
    }

    if (remove_non_path) {
        graph->remove_non_path();
    }
    
    if (remove_path) {
        graph->remove_path();
    }

    if (orient_forward) {
        algorithms::orient_nodes_forward(graph);
    }

    if (flip_doubly_reversed_edges) {
        graph->flip_doubly_reversed_edges();
    }

    if (dagify_steps) {
        unordered_map<int64_t, pair<int64_t, bool> > node_translation;
        *graph = graph->dagify(dagify_steps, node_translation, 0, dagify_component_length_max);
    }

    if (dagify_to) {
        unordered_map<int64_t, pair<int64_t, bool> > node_translation;
        // use the walk as our maximum number of steps; it's the worst case
        *graph = graph->dagify(dagify_to, node_translation, dagify_to, dagify_component_length_max);
    }

    if (unfold_to) {
        unordered_map<int64_t, pair<int64_t, bool> > node_translation;
        *graph = graph->unfold(unfold_to, node_translation);
    }

    if (remove_null) {
        graph->remove_null_nodes_forwarding_edges();
    }

    if (break_cycles) {
        graph->break_cycles();
    }

    // to subset the graph
    if (!root_nodes.empty()) {
        VG g;
        for (auto root : root_nodes) {
            graph->nonoverlapping_node_context_without_paths(graph->get_node(root), g);
            graph->expand_context(g, max(context_steps, 1));
            g.remove_orphan_edges();
        }
        *graph = g;
    }

    if (!aln_file.empty()) {
        // read in the alignments and save their paths, concatenating them in order where they have the same name
        map<string, Path> paths_map;
        function<void(Alignment&)> lambda = [&graph, &paths_map](Alignment& aln) {
            Path path = simplify(aln.path());
            path.set_name(aln.name());
            auto f = paths_map.find(path.name());
            if (f != paths_map.end()) {
                paths_map[path.name()] = concat_paths(f->second, path);
            } else {
                paths_map[path.name()] = path;
            }
        };
        if (aln_file == "-") {
            vg::io::for_each(std::cin, lambda);
        } else {
            ifstream in;
            in.open(aln_file.c_str());
            vg::io::for_each(in, lambda);
        }
        vector<Path> paths;
        for (auto& p : paths_map) {
            paths.push_back(p.second);
        }
        paths_map.clear();
        if (!label_paths) {
            // execute the edits
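            // (Assumption about VG::edit's parameter order, which isn't visible in this
            // excerpt: the second argument asks edit() to also embed these paths in the
            // graph, matching the explicitly-flagged edit() call in the loci branch below.)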
            auto translation = graph->edit(paths, true);
            if (!translation_file.empty()) {
                ofstream out(translation_file);
                vg::io::write_buffered(out, translation, 0);
                out.close();
            }
        } else {
            // just add the path labels to the graph
            graph->paths.extend(paths);
        }
    }

    if (!loci_file.empty()) {
        // read in the alignments and save their paths
        vector<Path> paths;
        function<void(Locus&)> lambda = [&graph, &paths, &called_genotypes_only](Locus& locus) {
            // if we are only doing called genotypes, record so we can filter alleles
            set<int> alleles_in_genotype;
            if (called_genotypes_only) {
                for (int i = 0; i < locus.genotype_size(); ++i) {
                    for (int j = 0; j < locus.genotype(i).allele_size(); ++j) {
                        alleles_in_genotype.insert(locus.genotype(i).allele(j));
                    }
                }
            }
            for (int i = 0; i < locus.allele_size(); ++i) {
                // skip alleles not in the genotype if using only called genotypes
                if (!alleles_in_genotype.empty()) {
                    if (!alleles_in_genotype.count(i)) continue;
                }
                Path path = simplify(locus.allele(i));
                stringstream name;
                name << locus.name() << ":" << i;
                path.set_name(name.str());
                paths.push_back(path);
            }
        };
        if (loci_file == "-") {
            vg::io::for_each(std::cin, lambda);
        } else {
            ifstream in;
            in.open(loci_file.c_str());
            vg::io::for_each(in, lambda);
        }
        // execute the edits and produce the translation if requested.
        // Make sure to break at node ends, but don't add any paths because they're just loci alleles and not real paths.
        auto translation = graph->edit(paths, false, false, true);
        if (!translation_file.empty()) {
            ofstream out(translation_file);
            vg::io::write_buffered(out, translation, 0);
            out.close();
        }
    }

    // and optionally compact ids
    if (compact_ids) {
        graph->sort();
        graph->compact_ids();
    }

    if (compact_ranks) {
        graph->paths.compact_ranks();
    }

    if (prune_complex) {
        if (!(path_length > 0 && edge_max > 0)) {
            cerr << "[vg mod]: when pruning complex regions you must specify a --path-length and --edge-max" << endl;
            return 1;
        }
        graph->prune_complex_with_head_tail(path_length, edge_max);
    }

    if (max_degree) {
        algorithms::remove_high_degree_nodes(*graph, max_degree);
    }

    if (prune_subgraphs) {
        graph->prune_short_subgraphs(path_length);
    }

    if (chop_to) {
        graph->dice_nodes(chop_to);
        graph->paths.compact_ranks();
    }

    if (kill_labels) {
        graph->for_each_node([](Node* n) { n->clear_sequence(); });
    }

    if (add_start_and_end_markers) {
        if (!(path_length > 0)) {
            cerr << "[vg mod]: when adding start and end markers you must provide a --path-length" << endl;
            return 1;
        }
        // TODO: replace this with the SourceSinkOverlay somehow?
        Node* head_node = NULL;
        Node* tail_node = NULL;
        vg::id_t head_id = 0, tail_id = 0;
        graph->add_start_end_markers(path_length, '#', '$', head_node, tail_node, head_id, tail_id);
    }

    if (destroy_node_id > 0) {
        graph->destroy_node(destroy_node_id);
    }

    if (cactus) {
        // ensure we're sorted
        graph->sort();
        *graph = cactusify(*graph);
        // no paths survive, make sure they are erased
        graph->paths = Paths();
    }

    graph->serialize_to_ostream(std::cout);

    delete graph;

    return 0;
}
Ejemplo n.º 15
0
CactusSiteFinder::CactusSiteFinder(VG& graph, const string& hint_path_name): graph(graph), hint_path_name(hint_path_name) {
    // Make sure the graph is sorted.
    // cactus needs the nodes to be sorted in order to find a source and sink.
    graph.sort();
}
Ejemplo n.º 16
0
void renderer::draw_objects(VG& v) {
    VG::iterator it;
    for (it = v.begin(); it != v.end(); ++it) {
        draw(*it);
    }
}
Ejemplo n.º 17
0
int main_augment(int argc, char** argv) {

    // augmentation mode
    string augmentation_mode = "direct";
    
    // load pileups from here
    string pileup_file_name;

    // minimum support to consider adding a variant to the graph
    int min_aug_support = PileupAugmenter::Default_min_aug_support;
        
    // Should we expect a subgraph and ignore pileups for missing nodes/edges?
    bool expect_subgraph = false;

    // Write the translations (as protobuf) to this path
    string translation_file_name;

    // Include a path in the graph for each GAM
    bool include_paths = false;

    // Just label the paths with the GAM
    bool label_paths = false;

    // Merge alleles from this loci file instead of GAM
    string loci_filename;

    // Merge only alleles from called genotypes in the loci file
    bool called_genotypes_only = false;

    // Write the supports (as protobuf) to this path
    string support_file_name;
    
    // Load in GAM alignments to map over to the augmented graph from here
    string gam_in_file_name;

    // Write the GAM alignments (from gam_in_file_name) projected on the augmented graph here
    string gam_out_file_name;

    // Print some progress messages to screen
    bool show_progress = false;

    // Print verbose message
    bool verbose = false;

    // Number of threads to use (will default to all if not specified)
    int thread_count = 0;

    // Bases with quality less than min_quality (default 10) will not be added to the pileup
    int min_quality = 10;

    // Bases with more than this many mismatches within the window_size not added
    int max_mismatches = 1;

    // Window size for above (0 effectively turns this check off)
    int window_size = 0;

    // Hack to prevent protobuf messages from getting too big by limiting depth at
    // any given position to max_depth
    int max_depth = 1000;
    
    // Combine MAPQ and PHRED base qualities to determine quality at each position
    // If false, only PHRED base quality will be used. 
    bool use_mapq = true;


    static const struct option long_options[] = {
        // General Options
        {"augmentation-mode", required_argument, 0, 'a'},
        {"translation", required_argument, 0, 'Z'},
        {"alignment-out", required_argument, 0, 'A'},
        {"include-paths", no_argument, 0, 'i'},
        {"label-paths", no_argument, 0, 'B'},
        {"help", no_argument, 0, 'h'},
        {"progress", required_argument, 0, 'p'},
        {"verbose", no_argument, 0, 'v'},
        {"threads", required_argument, 0, 't'},
        // Loci Options
        {"include-loci", required_argument, 0, 'l'},
        {"include-gt", required_argument, 0, 'L'},
        // Pileup Options
        {"pileup", required_argument, 0, 'P'},        
        {"support", required_argument, 0, 'S'},
        {"min-quality", required_argument, 0, 'q'},
        {"max-mismatches", required_argument, 0, 'm'},
        {"window-size", required_argument, 0, 'w'},
        {"ignore-mapq", no_argument, 0, 'M'},
        {"min-aug-support", required_argument, 0, 'g'},
        {"subgraph", no_argument, 0, 'U'},
        {0, 0, 0, 0}
    };
    static const char* short_options = "a:Z:A:iBhpvt:l:L:P:S:q:m:w:Mg:U";
    optind = 2; // force optind past command positional arguments

    // This is our command-line parser
    ConfigurableParser parser(short_options, long_options, [&](int c) {
        // Parse all the options we have defined here.
        switch (c)
        {
            // General Options
        case 'a':
            augmentation_mode = optarg;
            break;
        case 'Z':
            translation_file_name = optarg;
            break;
        case 'A':
            gam_out_file_name = optarg;
            break;
        case 'i':
            include_paths = true;
            break;
        case 'B':
            label_paths = true;
            break;
        case 'h':
        case '?':
            /* getopt_long already printed an error message. */
            help_augment(argv, parser);
            exit(1);
            break;
        case 'p':
            show_progress = true;
            break;
        case 'v':
            verbose = true;
            break;            
        case 't':
            thread_count = parse<int>(optarg);
            break;

            // Loci Options
        case 'l':
            loci_filename = optarg;
            break;
        case 'L':
            loci_filename = optarg;
            called_genotypes_only = true;
            break;
            
            // Pileup Options
        case 'P':
            pileup_file_name = optarg;
            break;
        case 'S':
            support_file_name = optarg;
            break;            
        case 'q':
            min_quality = parse<int>(optarg);
            break;
        case 'm':
            max_mismatches = parse<int>(optarg);
            break;
        case 'w':
            window_size = parse<int>(optarg);
            break;
        case 'M':
            use_mapq = false;
            break;            
        case 'g':
            min_aug_support = parse<int>(optarg);
            break;            
        case 'U':
            expect_subgraph = true;
            break;
            
        default:
          abort ();
        }
    });

    // Parse the command line options, updating optind.
    parser.parse(argc, argv);

    if (thread_count != 0) {
        // Use a non-default number of threads
        omp_set_num_threads(thread_count);
    }
    thread_count = get_thread_count();

    // Parse the positional arguments: the graph file is required, the GAM is optional
    if (optind + 1 > argc) {
        cerr << "[vg augment] error: too few arguments" << endl;
        help_augment(argv, parser);
        return 1;
    }

    string graph_file_name = get_input_file_name(optind, argc, argv);
    if (optind < argc) {
        gam_in_file_name = get_input_file_name(optind, argc, argv);
    }

    if (gam_in_file_name.empty() && loci_filename.empty()) {
        cerr << "[vg augment] error: gam file argument required" << endl;
        return 1;
    }
    if (gam_in_file_name == "-" && graph_file_name == "-") {
        cerr << "[vg augment] error: graph and gam can't both be from stdin." << endl;
        return 1;
    }
    if (gam_in_file_name == "-" && !gam_out_file_name.empty()) {
        cerr << "[vg augment] error: cannot stream input gam when using -A option (as it requires 2 passes)" << endl;
        return 1;
    }

    if (augmentation_mode != "pileup" && augmentation_mode != "direct") {
        cerr << "[vg augment] error: pileup and direct are currently the only supported augmentation modes (-a)" << endl;
        return 1;
    }

    if (augmentation_mode != "direct" and !gam_out_file_name.empty()) {
        cerr << "[vg augment] error: GAM output only works with \"direct\" augmentation mode" << endl;
        return 1;
    }

    if (augmentation_mode != "pileup" and (!support_file_name.empty() || !pileup_file_name.empty())) {
        cerr << "[vg augment] error: Pileup (-P) and Support (-S) output only work with  \"pileup\" augmentation mode" << endl;
        return 1;
    }

    if (label_paths && (!gam_out_file_name.empty() || !translation_file_name.empty())) {
        cerr << "[vg augment] error: Translation (-Z) and GAM (-A) output do not work with \"label-only\" (-B) mode" << endl;
        return 1;
    }
    
    // read the graph
    if (show_progress) {
        cerr << "Reading input graph" << endl;
    }
    VG* graph;
    get_input_file(graph_file_name, [&](istream& in) {
        graph = new VG(in);
    });
    
    
    Pileups* pileups = nullptr;
    
    if (!pileup_file_name.empty() || augmentation_mode == "pileup") {
        // We will need the computed pileups
        
        // compute the pileups from the graph and gam
        pileups = compute_pileups(graph, gam_in_file_name, thread_count, min_quality, max_mismatches,
                                  window_size, max_depth, use_mapq, show_progress);
    }
        
    if (!pileup_file_name.empty()) {
        // We want to write out pileups.
        if (show_progress) {
            cerr << "Writing pileups" << endl;
        }
        ofstream pileup_file(pileup_file_name);
        if (!pileup_file) {
            cerr << "[vg augment] error: unable to open output pileup file: " << pileup_file_name << endl;
            exit(1);
        }
        pileups->write(pileup_file);
    }

    if (augmentation_mode == "direct" && !gam_in_file_name.empty()) {
        // Augment with the reads
        
        if (!support_file_name.empty()) {
            cerr << "[vg augment] error: support calculation in direct augmentation mode is unimplemented" << endl;
            exit(1);
        }
        
        // We don't need any pileups
        if (pileups != nullptr) {
            delete pileups;
            pileups = nullptr;
        }
    
        // Load all the reads
        vector<Alignment> reads;
        // And pull out their paths
        vector<Path> read_paths;

        if (include_paths) {
            // verbatim from vg mod -i
            map<string, Path> paths_map;
            function<void(Alignment&)> lambda = [&](Alignment& aln) {
                Path path = simplify(aln.path());
                path.set_name(aln.name());
                auto f = paths_map.find(path.name());
                if (f != paths_map.end()) {
                    paths_map[path.name()] = concat_paths(f->second, path);
                } else {
                    paths_map[path.name()] = path;
                }
                if (!gam_out_file_name.empty()) {
                    reads.push_back(aln);
                }
            };
            if (gam_in_file_name == "-") {
                stream::for_each(std::cin, lambda);
            } else {
                ifstream in;
                in.open(gam_in_file_name.c_str());
                stream::for_each(in, lambda);
            }
            for (auto& p : paths_map) {
                read_paths.push_back(p.second);
            }
            paths_map.clear();
        }
        else {
            get_input_file(gam_in_file_name, [&](istream& alignment_stream) {
                    stream::for_each<Alignment>(alignment_stream, [&](Alignment& alignment) {
                            // Trim the softclips off of every read
                            // Work out where to cut
                            int cut_start = softclip_start(alignment);
                            int cut_end = softclip_end(alignment);
                            // Cut the sequence and quality
                            alignment.set_sequence(alignment.sequence().substr(cut_start, alignment.sequence().size() - cut_start - cut_end));
                            if (alignment.quality().size() != 0) {
                                alignment.set_quality(alignment.quality().substr(cut_start, alignment.quality().size() - cut_start - cut_end));
                            }
                            // Trim the path
                            *alignment.mutable_path() = trim_hanging_ends(alignment.path());
                
                            // Save every read
                            if (!gam_out_file_name.empty()) {
                                reads.push_back(alignment);
                            }
                            // And the path for the read, separately
                            // TODO: Make edit use callbacks or something so it doesn't need a vector of paths necessarily
                            read_paths.push_back(alignment.path());
                        });
                });
        }
        
        // Augment the graph, rewriting the paths.
        vector<Translation> translation;
        if (!label_paths) {
            translation = graph->edit(read_paths, include_paths, !gam_out_file_name.empty(), false);
        } else {
            // just add the path labels to the graph
            graph->paths.extend(read_paths);
        }
        
        // Write the augmented graph
        if (show_progress) {
            cerr << "Writing augmented graph" << endl;
        }
        graph->serialize_to_ostream(cout);
        
        if (!translation_file_name.empty()) {
            // Write the translations
            if (show_progress) {
                cerr << "Writing translation table" << endl;
            }
            ofstream translation_file(translation_file_name);
            if (!translation_file) {
                cerr << "[vg augment]: Error opening translation file: " << translation_file_name << endl;
                return 1;
            }
            stream::write_buffered(translation_file, translation, 0);
            translation_file.close();
        }        
        
        if (!gam_out_file_name.empty() && reads.size() == read_paths.size()) {
            // Write out the modified GAM
            
            ofstream gam_out_file(gam_out_file_name);
            if (!gam_out_file) {
                cerr << "[vg augment]: Error opening output GAM file: " << gam_out_file_name << endl;
                return 1;
            }
            
            // We use this buffer and do a buffered write
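            // (Assumption about the stream helper: write_buffered() only emits and clears
            // the buffer once it grows past the given count, so the calls inside the loop
            // pass 100 and the final call passes 0 to force a flush of whatever remains.)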
            vector<Alignment> gam_buffer;
            for (size_t i = 0; i < reads.size(); i++) {
                // Say we are going to write out the alignment
                gam_buffer.push_back(reads[i]);
                
                // Set its path to the corrected embedded path
                *gam_buffer.back().mutable_path() = read_paths[i];
                
                // Write it back out
                stream::write_buffered(gam_out_file, gam_buffer, 100);
            }
            // Flush the buffer
            stream::write_buffered(gam_out_file, gam_buffer, 0);
        }
    } else if (augmentation_mode == "pileup") {
        // We want to augment with pileups
        
        // The PileupAugmenter object will take care of all augmentation
        PileupAugmenter augmenter(graph, PileupAugmenter::Default_default_quality, min_aug_support);    

        // compute the augmented graph from the pileup
        // Note: we can save a fair bit of memory by clearing pileups, and re-reading off of
        //       pileup_file_name
        augment_with_pileups(augmenter, *pileups, expect_subgraph, show_progress);
        delete pileups;
        pileups = nullptr;

        // write the augmented graph
        if (show_progress) {
            cerr << "Writing augmented graph" << endl;
        }
        augmenter.write_augmented_graph(cout, false);

        // write the augmented GAM
        if (!gam_out_file_name.empty()) {
            ofstream gam_out_file(gam_out_file_name);
            if (!gam_out_file) {
                cerr << "[vg augment]: Error opening output GAM file: " << gam_out_file_name << endl;
                return 1;
            }
            get_input_file(gam_in_file_name, [&](istream& alignment_stream) {
                    vector<Alignment> gam_buffer;
                    function<void(Alignment&)> lambda = [&gam_out_file, &gam_buffer, &augmenter](Alignment& alignment) {
                        list<mapping_t> aug_path;
                        augmenter.map_path(alignment.path(), aug_path, true);
                        alignment.mutable_path()->clear_mapping();
                        for (auto& aug_mapping : aug_path) {
                            *alignment.mutable_path()->add_mapping() = aug_mapping.to_mapping();
                        }
                        gam_buffer.push_back(alignment);
                        stream::write_buffered(gam_out_file, gam_buffer, 100);
                    };
                    stream::for_each(alignment_stream, lambda);
                    stream::write_buffered(gam_out_file, gam_buffer, 0);
                });
        }

        // write the translation
        if (!translation_file_name.empty()) {
            // write the translations
            if (show_progress) {
                cerr << "Writing translation table" << endl;
            }
            ofstream translation_file(translation_file_name);
            if (!translation_file) {
                cerr << "[vg augment] error: error opening translation file: " << translation_file_name << endl;
                return 1;
            }
            augmenter._augmented_graph.write_translations(translation_file);
            translation_file.close();
        }

        // write the supports
        if (!support_file_name.empty()) {
            // write the supports
            if (show_progress) {
                cerr << "Writing supports" << endl;
            }
            ofstream support_file(support_file_name);
            if (!support_file) {
                cerr << "[vg augment] error: error opening supports file: " << support_file_name << endl;
                return 1;
            }
            augmenter._augmented_graph.write_supports(support_file);
            support_file.close();
        }       
    } else if (!loci_filename.empty()) {
        // Open the file
        ifstream loci_file(loci_filename);
        assert(loci_file.is_open());
    
        // What nodes and edges are called as present by the loci?
        set<Node*> called_nodes;
        set<Edge*> called_edges;
    
        function<void(Locus&)> lambda = [&](Locus& locus) {
            // For each locus
            
            if (locus.genotype_size() == 0) {
                // No call made here. Just remove all the nodes/edges. TODO:
                // should we keep them all if we don't know if they're there or
                // not? Or should the caller call ref with some low confidence?
                return;
            }
            
            const Genotype& gt = locus.genotype(0);
            
            for (size_t j = 0; j < gt.allele_size(); j++) {
                // For every allele called as present
                int allele_number = gt.allele(j);
                const Path& allele = locus.allele(allele_number);
                
                for (size_t i = 0; i < allele.mapping_size(); i++) {
                    // For every Mapping in the allele
                    const Mapping& m = allele.mapping(i);
                    
                    // Remember to keep this node
                    called_nodes.insert(graph->get_node(m.position().node_id()));
                    
                    if (i + 1 < allele.mapping_size()) {
                        // Look at the next mapping, which exists
                        const Mapping& m2 = allele.mapping(i + 1);
                        
                        // Find the edge from the last Mapping's node to this one and mark it as used
                        called_edges.insert(graph->get_edge(NodeSide(m.position().node_id(), !m.position().is_reverse()),
                            NodeSide(m2.position().node_id(), m2.position().is_reverse())));
                    }
                }
            }
        };
        stream::for_each(loci_file, lambda);
        
        // Collect all the unused nodes and edges (so we don't try to delete
        // while iterating...)
        set<Node*> unused_nodes;
        set<Edge*> unused_edges;
        
        graph->for_each_node([&](Node* n) {
            if (!called_nodes.count(n)) {
                unused_nodes.insert(n);
            }
        });
        
        graph->for_each_edge([&](Edge* e) {
            if (!called_edges.count(e)) {
                unused_edges.insert(e);
            }
        });
        
        // Destroy all the extra edges (in case they use extra nodes)
        for (auto* e : unused_edges) {
            graph->destroy_edge(e);
        }
        
        for (auto* n : unused_nodes) {
            graph->destroy_node(n);
        }
    }

    if (pileups != nullptr) {
        delete pileups;
        pileups = nullptr;
    }    
    
    delete graph;

    return 0;
}