// Add one alignment's contribution to the per-node pileups.
// The alignment is first normalized onto the forward strand (mutating the
// caller's Alignment), then walked mapping-by-mapping and edit-by-edit,
// with compute_from_edit advancing both the node offset and read offset.
// Mappings to nodes absent from `graph` are silently skipped.
void Pileups::compute_from_alignment(VG& graph, Alignment& alignment) {
    // if we start reversed, flip the whole alignment onto the forward strand
    // so the offset arithmetic below can run left-to-right
    if (alignment.has_path() && alignment.path().mapping(0).position().is_reverse()) {
        alignment = reverse_alignment(alignment,
                                      (function<int64_t(int64_t)>) ([&graph](int64_t id) {
                                          return graph.get_node(id)->sequence().size();
                                      }));
    }
    const Path& path = alignment.path();
    int64_t read_offset = 0;
    for (int i = 0; i < path.mapping_size(); ++i) {
        const Mapping& mapping = path.mapping(i);
        if (graph.has_node(mapping.position().node_id())) {
            const Node* node = graph.get_node(mapping.position().node_id());
            NodePileup* pileup = get_create(node->id());
            int64_t node_offset = mapping.position().offset();
            for (int j = 0; j < mapping.edit_size(); ++j) {
                const Edit& edit = mapping.edit(j);
                // process all pileups in edit.
                // update the offsets as we go
                compute_from_edit(*pileup, node_offset, read_offset, *node,
                                  alignment, mapping, edit);
            }
        }
    }
    // after a full walk the read offset must equal the read length
    // (unless the read or path is empty)
    assert(alignment.sequence().empty() ||
           alignment.path().mapping_size() == 0 ||
           read_offset == alignment.sequence().length());
}
int main_concat(int argc, char** argv) { if (argc == 2) { help_concat(argv); return 1; } int c; optind = 2; // force optind past command positional argument while (true) { static struct option long_options[] = { {"help", no_argument, 0, 'h'}, {0, 0, 0, 0} }; int option_index = 0; c = getopt_long (argc, argv, "h", long_options, &option_index); // Detect the end of the options. if (c == -1) break; switch (c) { case 'h': case '?': help_concat(argv); exit(1); break; default: abort (); } } list<VG*> graphs; while (optind < argc) { VG* graph; get_input_file(optind, argc, argv, [&](istream& in) { graph = new VG(in); }); graphs.push_back(graph); } VG merged; for (list<VG*>::iterator g = graphs.begin(); g != graphs.end(); ++g) { merged.append(**g); } // output merged.serialize_to_ostream(std::cout); return 0; }
// Reconstruct the sequence spelled by an alignment's path: pull a small
// (radius-2) neighborhood around every node the path touches out of the xg
// index, materialize it as a VG graph, and spell the path against it.
string Sampler::alignment_seq(const Alignment& aln) {
    // get the graph corresponding to the alignment path
    Graph sub;
    for (int i = 0; i < aln.path().mapping_size(); ++ i) {
        auto& m = aln.path().mapping(i);
        if (m.has_position() && m.position().node_id()) {
            // use the mapping already in hand rather than re-indexing the
            // path a second time (was: aln.path().mapping(i).position()...)
            xgidx->neighborhood(m.position().node_id(), 2, sub);
        }
    }
    VG g;
    g.extend(sub);
    return g.path_string(aln.path());
}
// Apply `lambda` to every graph in the set, writing each transformed graph
// back over the file it was read from. A filename of "-" reads the graph
// from stdin; NOTE(review): the result of a stdin-loaded graph is then
// written to a file literally named "-" — confirm that is intended.
void VGset::transform(std::function<void(VG*)> lambda) {
    for (auto& filename : filenames) {
        // load the graph from its source
        VG* graph = nullptr;
        if (filename == "-") {
            graph = new VG(std::cin, show_progress);
        } else {
            ifstream in(filename.c_str());
            graph = new VG(in, show_progress);
            in.close();
        }
        graph->name = filename;
        // transform it in place
        lambda(graph);
        // overwrite the original file with the transformed graph
        ofstream out(filename.c_str());
        graph->serialize_to_ostream(out);
        out.close();
        delete graph;
    }
}
// Convert any HandleGraph into a materialized VG graph.
// Two passes: first copy every node (same id and sequence), then copy the
// edges by following each node's right-side and left-side edges.
// Throws runtime_error if the input pointer is null.
VG handle_to_vg(const HandleGraph* xg) {
    // If xg is a null pointer, throw a runtime error
    if (xg == nullptr) {
        throw runtime_error("There is no xg to convert");
    }
    // Initialize the VG graph
    VG vg;
    // Iterate through each handle in xg and create the same handle in vg
    xg->for_each_handle([&](const handle_t& here) {
        // Get the id of the xg handle
        id_t xg_id = xg->get_id(here);
        // Get the sequence of the xg handle
        string xg_seq = xg->get_sequence(here);
        // Create a handle in vg using the xg id and sequence
        vg.create_handle(xg_seq,xg_id);
    });
    // Iterate through each handle in xg
    xg->for_each_handle([&](const handle_t& handle) {
        id_t id = xg->get_id(handle);
        bool rev = xg->get_is_reverse(handle);
        // Get the vg handle corresponding to this xg handle
        handle_t current = vg.get_handle(id,rev);
        // Follow the right edges of the xg handle
        xg->follow_edges(handle, false, [&](const handle_t& r) {
            id_t id_r = xg->get_id(r);
            bool rev_r = xg->get_is_reverse(r);
            // The vg handle matching the xg neighbor's id and orientation
            handle_t next = vg.get_handle(id_r, rev_r);
            // Create an edge in vg using the handles
            vg.create_edge(current,next);
        });
        // Follow the left edges of the xg handle
        xg->follow_edges(handle, true, [&](const handle_t& l) {
            id_t id_l = xg->get_id(l);
            bool rev_l = xg->get_is_reverse(l);
            // The vg handle matching the xg neighbor's id and orientation
            handle_t prev = vg.get_handle(id_l, rev_l);
            // NOTE(review): most edges are reachable from both endpoints, so
            // this pass re-creates edges already made by the right-edge pass
            // above (the left pass is still needed to reach edges only
            // visible from a node's reverse orientation). This presumably
            // relies on VG::create_edge deduplicating — confirm.
            vg.create_edge(prev,current);
        });
    });
    return vg;
}
// add all node traversals that valid walks from this one onto a stack void stack_up_valid_walks(VG& graph, NodeTraversal walk_head, vector<NodeTraversal>& stack) { id_t head_id = walk_head.node->id(); if (walk_head.backward) { // we are leaving from the start of the node // get all edges involving this node so we can filter them down to valid walks for (Edge* edge : graph.edges_of(walk_head.node)) { if (edge->from() == head_id && edge->from_start()) { // the edge is part of a valid walk Node* next_node = graph.get_node(edge->to()); bool next_backward = edge->to_end(); // add the next traversal in the walk to the stack stack.push_back(NodeTraversal(next_node, next_backward)); } else if (edge->to() == head_id && !edge->to_end()) { // the edge is part of a valid walk in the opposite orientation Node* next_node = graph.get_node(edge->from()); bool next_backward = edge->from_start(); // add the next traversal in the walk to the stack stack.push_back(NodeTraversal(next_node, next_backward)); } } } else { // we are leaving from the end of the node // get all edges involving this node so we can filter them down to valid walks for (Edge* edge : graph.edges_of(walk_head.node)) { if (edge->from() == head_id && !edge->from_start()) { // the edge is part of a valid walk Node* next_node = graph.get_node(edge->to()); bool next_backward = edge->to_end(); // add the next traversal in the walk to the stack stack.push_back(NodeTraversal(next_node, next_backward)); } else if (edge->to() == head_id && edge->to_end()) { // the edge is part of a valid walk in the opposite orientation Node* next_node = graph.get_node(edge->from()); bool next_backward = edge->from_start(); // add the next traversal in the walk to the stack stack.push_back(NodeTraversal(next_node, next_backward)); } } } }
int main_validate(int argc, char** argv) { if (argc <= 2) { help_validate(argv); return 1; } bool check_nodes = false; bool check_edges = false; bool check_orphans = false; bool check_paths = false; string xg_path; string gam_path; int c; optind = 2; // force optind past command positional argument while (true) { static struct option long_options[] = { {"help", no_argument, 0, 'h'}, {"nodes", no_argument, 0, 'n'}, {"edges", no_argument, 0, 'e'}, {"paths", no_argument, 0, 'o'}, {"orphans", no_argument, 0, 'p'}, {"gam", required_argument, 0, 'a'}, {"xg", required_argument, 0, 'x'}, {0, 0, 0, 0} }; int option_index = 0; c = getopt_long (argc, argv, "hneopa:x:", long_options, &option_index); // Detect the end of the options. if (c == -1) break; switch (c) { case 'n': check_nodes = true; break; case 'e': check_edges = true; break; case 'o': check_orphans = true; break; case 'p': check_paths = true; break; case 'a': gam_path = optarg; break; case 'x': xg_path= optarg; break; case 'h': case '?': help_validate(argv); exit(1); break; default: abort (); } } if (!gam_path.empty() || !xg_path.empty()) { // GAM validation is its entirely own thing if (xg_path.empty()) { cerr << "error:[vg validate] xg index (-x) required with (-a)" << endl; return 1; } else if (gam_path.empty()) { cerr << "error:[vg validate] gam alignment (-a) required with (-x)" << endl; return 1; } else if (check_nodes || check_edges || check_orphans || check_paths) { cerr << "error:[vg validate] -n, -e -o, -p cannot be used with -a and -x" << endl; return 1; } ifstream in(xg_path.c_str()); unique_ptr<xg::XG> xindex = stream::VPKG::load_one<xg::XG>(in); in.close(); get_input_file(gam_path, [&](istream& in) { stream::for_each<Alignment>(in, [&](Alignment& aln) { if (!alignment_is_valid(aln, xindex.get())) { exit(1); } }); }); return 0; } else { VG* graph; get_input_file(optind, argc, argv, [&](istream& in) { graph = new VG(in); }); // if we chose a specific subset, do just them if (check_nodes || check_edges 
|| check_orphans || check_paths) { if (graph->is_valid(check_nodes, check_edges, check_orphans, check_paths)) { return 0; } else { return 1; } // otherwise do everything } else if (graph->is_valid()) { return 0; } else { return 1; } } }
// vg find: query subgraphs, alignments, kmers, MEMs, thread paths, and
// path positions out of a rocksdb index (-d), a GCSA2 index (-g), and/or
// an xg index (-x). The query modes below are largely independent and
// several may run in a single invocation; xg-backed queries come first,
// then rocksdb-backed ones, then sequence/kmer queries.
int main_find(int argc, char** argv) {

    if (argc == 2) {
        help_find(argv);
        return 1;
    }

    // query state, filled in by option parsing below
    string db_name;
    string sequence;
    int kmer_size=0;
    int kmer_stride = 1;
    vector<string> kmers;
    vector<vg::id_t> node_ids;
    string node_list_file;
    int context_size=0;
    bool use_length = false;
    bool count_kmers = false;
    bool kmer_table = false;
    vector<string> targets;
    string path_name;
    bool position_in = false;
    bool rank_in = false;
    string range;
    string gcsa_in;
    string xg_name;
    bool get_mems = false;
    int mem_reseed_length = 0;
    bool use_fast_reseed = true;
    bool get_alignments = false;
    // NOTE(review): get_mappings (-m) is set below but never read in this
    // function — confirm whether the option is dead.
    bool get_mappings = false;
    string node_id_range;
    string aln_on_id_range;
    vg::id_t start_id = 0;
    vg::id_t end_id = 0;
    bool pairwise_distance = false;
    string haplotype_alignments;
    string gam_file;
    int max_mem_length = 0;
    int min_mem_length = 1;
    string to_graph_file;
    bool extract_threads = false;
    vector<string> extract_patterns;
    vg::id_t approx_id = 0;

    int c;
    optind = 2; // force optind past command positional argument
    while (true) {
        static struct option long_options[] =
            {
                //{"verbose", no_argument, &verbose_flag, 1},
                {"db-name", required_argument, 0, 'd'},
                {"xg-name", required_argument, 0, 'x'},
                {"gcsa", required_argument, 0, 'g'},
                {"node", required_argument, 0, 'n'},
                {"node-list", required_argument, 0, 'N'},
                {"edges-end", required_argument, 0, 'e'},
                {"edges-start", required_argument, 0, 's'},
                {"kmer", required_argument, 0, 'k'},
                {"table", no_argument, 0, 'T'},
                {"sequence", required_argument, 0, 'S'},
                {"mems", required_argument, 0, 'M'},
                {"reseed-length", required_argument, 0, 'B'},
                {"fast-reseed", no_argument, 0, 'f'},
                {"kmer-stride", required_argument, 0, 'j'},
                {"kmer-size", required_argument, 0, 'z'},
                {"context", required_argument, 0, 'c'},
                {"use-length", no_argument, 0, 'L'},
                {"kmer-count", no_argument, 0, 'C'},
                {"path", required_argument, 0, 'p'},
                {"position-in", required_argument, 0, 'P'},
                {"rank-in", required_argument, 0, 'R'},
                {"node-range", required_argument, 0, 'r'},
                {"alignments", no_argument, 0, 'a'},
                {"mappings", no_argument, 0, 'm'},
                {"alns-in", required_argument, 0, 'i'},
                {"alns-on", required_argument, 0, 'o'},
                {"distance", no_argument, 0, 'D'},
                {"haplotypes", required_argument, 0, 'H'},
                {"gam", required_argument, 0, 'G'},
                {"to-graph", required_argument, 0, 'A'},
                {"max-mem", required_argument, 0, 'Y'},
                {"min-mem", required_argument, 0, 'Z'},
                {"extract-threads", no_argument, 0, 't'},
                {"threads-named", required_argument, 0, 'q'},
                {"approx-pos", required_argument, 0, 'X'},
                {0, 0, 0, 0}
            };

        int option_index = 0;
        c = getopt_long (argc, argv, "d:x:n:e:s:o:k:hc:LS:z:j:CTp:P:r:amg:M:R:B:fi:DH:G:N:A:Y:Z:tq:X:",
                         long_options, &option_index);

        // Detect the end of the options.
        if (c == -1) break;

        switch (c)
        {
        case 'd': db_name = optarg; break;
        case 'x': xg_name= optarg; break;
        case 'g': gcsa_in = optarg; break;
        case 'k': kmers.push_back(optarg); break;
        case 'S': sequence = optarg; break;
        // -M implies MEM mode for the same sequence input
        case 'M': sequence = optarg; get_mems = true; break;
        case 'B': mem_reseed_length = atoi(optarg); break;
        case 'f': use_fast_reseed = true; break;
        case 'Y': max_mem_length = atoi(optarg); break;
        case 'Z': min_mem_length = atoi(optarg); break;
        case 'j': kmer_stride = atoi(optarg); break;
        case 'z': kmer_size = atoi(optarg); break;
        case 'C': count_kmers = true; break;
        case 'p': targets.push_back(optarg); break;
        case 'P': path_name = optarg; position_in = true; break;
        case 'R': path_name = optarg; rank_in = true; break;
        case 'c': context_size = atoi(optarg); break;
        case 'L': use_length = true; break;
        case 'n': node_ids.push_back(atoi(optarg)); break;
        case 'N': node_list_file = optarg; break;
        case 'e': end_id = atoi(optarg); break;
        case 's': start_id = atoi(optarg); break;
        case 'T': kmer_table = true; break;
        case 'r': range = optarg; break;
        case 'a': get_alignments = true; break;
        case 'i': node_id_range = optarg; break;
        case 'm': get_mappings = true; break;
        case 'o': aln_on_id_range = optarg; break;
        case 'D': pairwise_distance = true; break;
        case 'H': haplotype_alignments = optarg; break;
        case 't': extract_threads = true; break;
        // -q is thread extraction restricted to matching names
        case 'q': extract_threads = true; extract_patterns.push_back(optarg); break;
        case 'X': approx_id = atoi(optarg); break;
        case 'G': gam_file = optarg; break;
        case 'A': to_graph_file = optarg; break;
        case 'h':
        case '?':
            help_find(argv);
            exit(1);
            break;
        default:
            abort ();
        }
    }

    // argument sanity checks
    if (optind < argc) {
        cerr << "[vg find] find does not accept positional arguments" << endl;
        return 1;
    }
    if (db_name.empty() && gcsa_in.empty() && xg_name.empty()) {
        cerr << "[vg find] find requires -d, -g, or -x to know where to find its database" << endl;
        return 1;
    }
    if (context_size > 0 && use_length == true && xg_name.empty()) {
        cerr << "[vg find] error, -L not supported without -x" << endl;
        exit(1);
    }
    if (xg_name.empty() && mem_reseed_length) {
        cerr << "error:[vg find] SMEM reseeding requires an XG index. Provide XG index with -x." << endl;
        exit(1);
    }

    // process input node list
    if (!node_list_file.empty()) {
        ifstream nli;
        nli.open(node_list_file);
        if (!nli.good()){
            cerr << "[vg find] error, unable to open the node list input file." << endl;
            exit(1);
        }
        string line;
        while (getline(nli, line)){
            for (auto& idstr : split_delims(line, " \t")) {
                node_ids.push_back(atol(idstr.c_str()));
            }
        }
        nli.close();
    }

    // open index
    Index* vindex = nullptr;
    if (db_name.empty()) {
        assert(!gcsa_in.empty() || !xg_name.empty());
    } else {
        vindex = new Index;
        vindex->open_read_only(db_name);
    }

    xg::XG xindex;
    if (!xg_name.empty()) {
        ifstream in(xg_name.c_str());
        xindex.load(in);
    }

    // -a: dump every stored alignment (rocksdb only)
    if (get_alignments) {
        assert(!db_name.empty());
        vector<Alignment> output_buf;
        auto lambda = [&output_buf](const Alignment& aln) {
            output_buf.push_back(aln);
            stream::write_buffered(cout, output_buf, 100);
        };
        vindex->for_each_alignment(lambda);
        stream::write_buffered(cout, output_buf, 0);
    }

    // -i: alignments whose sorted position falls in a node id range
    if (!node_id_range.empty()) {
        assert(!db_name.empty());
        vector<string> parts = split_delims(node_id_range, ":");
        if (parts.size() == 1) {
            convert(parts.front(), start_id);
            end_id = start_id;
        } else {
            convert(parts.front(), start_id);
            convert(parts.back(), end_id);
        }
        vector<Alignment> output_buf;
        auto lambda = [&output_buf](const Alignment& aln) {
            output_buf.push_back(aln);
            stream::write_buffered(cout, output_buf, 100);
        };
        vindex->for_alignment_in_range(start_id, end_id, lambda);
        stream::write_buffered(cout, output_buf, 0);
    }

    // -o: alignments touching any node in an id range
    if (!aln_on_id_range.empty()) {
        assert(!db_name.empty());
        vector<string> parts = split_delims(aln_on_id_range, ":");
        if (parts.size() == 1) {
            convert(parts.front(), start_id);
            end_id = start_id;
        } else {
            convert(parts.front(), start_id);
            convert(parts.back(), end_id);
        }
        vector<vg::id_t> ids;
        for (auto i = start_id; i <= end_id; ++i) {
            ids.push_back(i);
        }
        vector<Alignment> output_buf;
        auto lambda = [&output_buf](const Alignment& aln) {
            output_buf.push_back(aln);
            stream::write_buffered(cout, output_buf, 100);
        };
        vindex->for_alignment_to_nodes(ids, lambda);
        stream::write_buffered(cout, output_buf, 0);
    }

    // -A: alignments touching any node of the given graph file
    if (!to_graph_file.empty()) {
        assert(vindex != nullptr);
        ifstream tgi(to_graph_file);
        VG graph(tgi);
        vector<vg::id_t> ids;
        graph.for_each_node([&](Node* n) { ids.push_back(n->id()); });
        vector<Alignment> output_buf;
        auto lambda = [&output_buf](const Alignment& aln) {
            output_buf.push_back(aln);
            stream::write_buffered(cout, output_buf, 100);
        };
        vindex->for_alignment_to_nodes(ids, lambda);
        stream::write_buffered(cout, output_buf, 0);
    }

    // ----- xg-backed queries -----
    if (!xg_name.empty()) {
        if (!node_ids.empty() && path_name.empty() && !pairwise_distance) {
            // get the context of the node
            vector<Graph> graphs;
            set<vg::id_t> ids;
            for (auto node_id : node_ids) ids.insert(node_id);
            for (auto node_id : node_ids) {
                Graph g;
                xindex.neighborhood(node_id, context_size, g, !use_length);
                if (context_size == 0) {
                    for (auto& edge : xindex.edges_of(node_id)) {
                        // if both ends of the edge are in our targets, keep them
                        if (ids.count(edge.to()) && ids.count(edge.from())) {
                            *g.add_edge() = edge;
                        }
                    }
                }
                graphs.push_back(g);
            }
            VG result_graph;
            for (auto& graph : graphs) {
                // Allow duplicate nodes and edges (from e.g. multiple -n options); silently collapse them.
                result_graph.extend(graph);
            }
            result_graph.remove_orphan_edges();
            // Order the mappings by rank. TODO: how do we handle breaks between
            // different sections of a path with a single name?
            result_graph.paths.sort_by_mapping_rank();
            // return it
            result_graph.serialize_to_ostream(cout);
        } else if (end_id != 0) {
            // -e: edges on the end of a node, as signed id pairs
            for (auto& e : xindex.edges_on_end(end_id)) {
                cout << (e.from_start() ? -1 : 1) * e.from() << "\t"
                     << (e.to_end() ? -1 : 1) * e.to() << endl;
            }
        } else if (start_id != 0) {
            // -s: edges on the start of a node, as signed id pairs
            for (auto& e : xindex.edges_on_start(start_id)) {
                cout << (e.from_start() ? -1 : 1) * e.from() << "\t"
                     << (e.to_end() ? -1 : 1) * e.to() << endl;
            }
        }
        // -P/-R: positions or ranks of nodes within a named path
        if (!node_ids.empty() && !path_name.empty() && !pairwise_distance && (position_in || rank_in)) {
            // Go get the positions of these nodes in this path
            if (xindex.path_rank(path_name) == 0) {
                // This path doesn't exist, and we'll get a segfault or worse if
                // we go look for positions in it.
                cerr << "[vg find] error, path \"" << path_name << "\" not found in index" << endl;
                exit(1);
            }
            // Note: this isn't at all consistent with -P option with rocksdb, which couts a range
            // and then mapping, but need this info right now for scripts/chunked_call
            for (auto node_id : node_ids) {
                cout << node_id;
                for (auto r : (position_in ? xindex.position_in_path(node_id, path_name)
                                           : xindex.node_ranks_in_path(node_id, path_name))) {
                    cout << "\t" << r;
                }
                cout << endl;
            }
        }
        // -D: approximate path distance between exactly two nodes
        if (pairwise_distance) {
            if (node_ids.size() != 2) {
                cerr << "[vg find] error, exactly 2 nodes (-n) required with -D" << endl;
                exit(1);
            }
            cout << xindex.min_approx_path_distance(node_ids[0], node_ids[1]) << endl;
            return 0;
        }
        // -X: approximate linear start position of a node
        if (approx_id != 0) {
            cout << xindex.node_start(approx_id) << endl;
            return 0;
        }
        // -p: extract path regions (name[:start-end])
        if (!targets.empty()) {
            Graph graph;
            for (auto& target : targets) {
                // Grab each target region
                string name;
                int64_t start, end;
                xg::parse_region(target, name, start, end);
                if(xindex.path_rank(name) == 0) {
                    // Passing a nonexistent path to get_path_range produces Undefined Behavior
                    cerr << "[vg find] error, path " << name << " not found in index" << endl;
                    exit(1);
                }
                // no coordinates given, we do whole thing (0,-1)
                if (start < 0 && end < 0) {
                    start = 0;
                }
                xindex.get_path_range(name, start, end, graph);
            }
            if (context_size > 0) {
                xindex.expand_context(graph, context_size, true, !use_length);
            }
            VG vgg;
            vgg.extend(graph); // removes dupes
            // Order the mappings by rank. TODO: how do we handle breaks between
            // different sections of a path with a single name?
            vgg.paths.sort_by_mapping_rank();
            vgg.serialize_to_ostream(cout);
        }
        // -r: extract an id range (or, with -L, a range by length)
        if (!range.empty()) {
            Graph graph;
            int64_t id_start=0, id_end=0;
            vector<string> parts = split_delims(range, ":");
            if (parts.size() == 1) {
                cerr << "[vg find] error, format of range must be \"N:M\" where start id is N and end id is M, got "
                     << range << endl;
                exit(1);
            }
            convert(parts.front(), id_start);
            convert(parts.back(), id_end);
            if (!use_length) {
                xindex.get_id_range(id_start, id_end, graph);
            } else {
                // treat id_end as length instead.
                xindex.get_id_range_by_length(id_start, id_end, graph, true);
            }
            if (context_size > 0) {
                xindex.expand_context(graph, context_size, true, !use_length);
            }
            VG vgg;
            vgg.extend(graph); // removes dupes
            vgg.remove_orphan_edges();
            vgg.serialize_to_ostream(cout);
        }
        // -H: count thread matches for each alignment in a GAM
        if(!haplotype_alignments.empty()) {
            // What should we do with each alignment?
            function<void(Alignment&)> lambda = [&xindex](Alignment& aln) {
                // Count the matches to the path. The path might be empty, in
                // which case it will yield the biggest size_t you can have.
                size_t matches = xindex.count_matches(aln.path());
                // We do this single-threaded, at least for now, so we don't
                // need to worry about coordinating output, and we can just
                // spit out the counts as bare numbers.
                cout << matches << endl;
            };
            if (haplotype_alignments == "-") {
                stream::for_each(std::cin, lambda);
            } else {
                ifstream in;
                in.open(haplotype_alignments.c_str());
                if(!in.is_open()) {
                    cerr << "[vg find] error: could not open alignments file " << haplotype_alignments << endl;
                    exit(1);
                }
                stream::for_each(in, lambda);
            }
        }
        // -t/-q: extract stored threads as named Paths
        if (extract_threads) {
            size_t thread_number = 0;
            bool extract_reverse = false;
            map<string, list<xg::XG::thread_t> > threads;
            if (extract_patterns.empty()) {
                threads = xindex.extract_threads(extract_reverse);
            } else {
                for (auto& pattern : extract_patterns) {
                    for (auto& t : xindex.extract_threads_matching(pattern, extract_reverse)) {
                        threads[t.first] = t.second;
                    }
                }
            }
            for(auto t : threads) {
                // Convert to a Path
                auto& thread = *t.second.begin();
                auto& thread_name = t.first;
                Path path;
                for(xg::XG::ThreadMapping& m : thread) {
                    // Convert all the mappings
                    Mapping mapping;
                    mapping.mutable_position()->set_node_id(m.node_id);
                    mapping.mutable_position()->set_is_reverse(m.is_reverse);
                    *(path.add_mapping()) = mapping;
                }
                // Get each thread's name
                path.set_name(thread_name);
                // Give each thread a name
                //path.set_name("_thread_" + to_string(thread_number++));
                // We need a Graph for serialization purposes. We do one chunk per
                // thread in case the threads are long.
                Graph g;
                *(g.add_path()) = path;
                // Dump the graph with its mappings. TODO: can we restrict these to
                // smaller chunks?
                vector<Graph> gb = { g };
                stream::write_buffered(cout, gb, 0);
            }
        }
        // -G: subgraph covering all nodes touched by a GAM's alignments
        if (!gam_file.empty()) {
            set<vg::id_t> nodes;
            function<void(Alignment&)> lambda = [&nodes](Alignment& aln) {
                // accumulate nodes matched by the path
                auto& path = aln.path();
                for (int i = 0; i < path.mapping_size(); ++i) {
                    nodes.insert(path.mapping(i).position().node_id());
                }
            };
            if (gam_file == "-") {
                stream::for_each(std::cin, lambda);
            } else {
                ifstream in;
                in.open(gam_file.c_str());
                if(!in.is_open()) {
                    cerr << "[vg find] error: could not open alignments file " << gam_file << endl;
                    exit(1);
                }
                stream::for_each(in, lambda);
            }
            // now we have the nodes to get
            Graph graph;
            for (auto& node : nodes) {
                *graph.add_node() = xindex.node(node);
            }
            // get connected edges
            xindex.expand_context(graph, max(1, context_size), true);
            VG vgg;
            vgg.extend(graph);
            vgg.serialize_to_ostream(cout);
        }
    } else if (!db_name.empty()) {
        // ----- rocksdb-backed queries (no xg) -----
        if (!node_ids.empty() && path_name.empty()) {
            // get the context of the node
            vector<VG> graphs;
            for (auto node_id : node_ids) {
                VG g;
                vindex->get_context(node_id, g);
                if (context_size > 0) {
                    vindex->expand_context(g, context_size);
                }
                graphs.push_back(g);
            }
            VG result_graph;
            for (auto& graph : graphs) {
                // Allow duplicate nodes and edges (from e.g. multiple -n options); silently collapse them.
                result_graph.extend(graph);
            }
            result_graph.remove_orphan_edges();
            // return it
            result_graph.serialize_to_ostream(cout);
        } else if (end_id != 0) {
            vector<Edge> edges;
            vindex->get_edges_on_end(end_id, edges);
            for (vector<Edge>::iterator e = edges.begin(); e != edges.end(); ++e) {
                cout << (e->from_start() ? -1 : 1) * e->from() << "\t"
                     << (e->to_end() ? -1 : 1) * e->to() << endl;
            }
        } else if (start_id != 0) {
            vector<Edge> edges;
            vindex->get_edges_on_start(start_id, edges);
            for (vector<Edge>::iterator e = edges.begin(); e != edges.end(); ++e) {
                cout << (e->from_start() ? -1 : 1) * e->from() << "\t"
                     << (e->to_end() ? -1 : 1) * e->to() << endl;
            }
        }
        // node positions relative to a path, rocksdb flavor
        if (!node_ids.empty() && !path_name.empty()) {
            int64_t path_id = vindex->get_path_id(path_name);
            for (auto node_id : node_ids) {
                list<pair<int64_t, bool>> path_prev, path_next;
                int64_t prev_pos=0, next_pos=0;
                bool prev_backward, next_backward;
                if (vindex->get_node_path_relative_position(node_id, false, path_id,
                                                            path_prev, prev_pos, prev_backward,
                                                            path_next, next_pos, next_backward)) {
                    // Negate IDs for backward nodes
                    cout << node_id << "\t"
                         << path_prev.front().first * (path_prev.front().second ? -1 : 1) << "\t" << prev_pos << "\t"
                         << path_next.back().first * (path_next.back().second ? -1 : 1) << "\t" << next_pos << "\t";
                    Mapping m = vindex->path_relative_mapping(node_id, false, path_id,
                                                              path_prev, prev_pos, prev_backward,
                                                              path_next, next_pos, next_backward);
                    cout << pb2json(m) << endl;
                }
            }
        }
        // -p: path regions, rocksdb flavor
        if (!targets.empty()) {
            VG graph;
            for (auto& target : targets) {
                string name;
                int64_t start, end;
                xg::parse_region(target, name, start, end);
                // end coordinate is exclusive for get_path()
                if (end >= 0) {
                    ++end;
                }
                vindex->get_path(graph, name, start, end);
            }
            if (context_size > 0) {
                vindex->expand_context(graph, context_size);
            }
            graph.remove_orphan_edges();
            graph.serialize_to_ostream(cout);
        }
        // -r: id range, rocksdb flavor
        if (!range.empty()) {
            VG graph;
            int64_t id_start=0, id_end=0;
            vector<string> parts = split_delims(range, ":");
            if (parts.size() == 1) {
                cerr << "[vg find] error, format of range must be \"N:M\" where start id is N and end id is M, got "
                     << range << endl;
                exit(1);
            }
            convert(parts.front(), id_start);
            convert(parts.back(), id_end);
            vindex->get_range(id_start, id_end, graph);
            if (context_size > 0) {
                vindex->expand_context(graph, context_size);
            }
            graph.remove_orphan_edges();
            graph.serialize_to_ostream(cout);
        }
    }

    // todo cleanup if/else logic to allow only one function

    // -S/-M: sequence search, via stored kmers (rocksdb) or GCSA2
    if (!sequence.empty()) {
        if (gcsa_in.empty()) {
            if (get_mems) {
                cerr << "error:[vg find] a GCSA index must be passed to get MEMs" << endl;
                return 1;
            }
            set<int> kmer_sizes = vindex->stored_kmer_sizes();
            if (kmer_sizes.empty()) {
                cerr << "error:[vg find] index does not include kmers, add with vg index -k" << endl;
                return 1;
            }
            if (kmer_size == 0) {
                kmer_size = *kmer_sizes.begin();
            }
            // NOTE(review): sequence.size() is unsigned; if kmer_size >
            // sequence.size() the subtraction wraps to a huge value and this
            // loop runs far past the sequence — confirm inputs are always at
            // least kmer_size long.
            for (int i = 0; i <= sequence.size()-kmer_size; i+=kmer_stride) {
                kmers.push_back(sequence.substr(i,kmer_size));
            }
        } else {
            // let's use the GCSA index
            // Configure GCSA2 verbosity so it doesn't spit out loads of extra info
            gcsa::Verbosity::set(gcsa::Verbosity::SILENT);
            // Configure its temp directory to the system temp directory
            gcsa::TempFile::setDirectory(find_temp_dir());
            // Open it
            ifstream in_gcsa(gcsa_in.c_str());
            gcsa::GCSA gcsa_index;
            gcsa_index.load(in_gcsa);
            gcsa::LCPArray lcp_index;
            // default LCP is the gcsa base name +.lcp
            string lcp_in = gcsa_in + ".lcp";
            ifstream in_lcp(lcp_in.c_str());
            lcp_index.load(in_lcp);
            //range_type find(const char* pattern, size_type length) const;
            //void locate(size_type path, std::vector<node_type>& results, bool append = false, bool sort = true) const;
            //locate(i, results);
            if (!get_mems) {
                // exact-match lookup: print every graph position of the hit
                auto paths = gcsa_index.find(sequence.c_str(), sequence.length());
                //cerr << paths.first << " - " << paths.second << endl;
                for (gcsa::size_type i = paths.first; i <= paths.second; ++i) {
                    std::vector<gcsa::node_type> ids;
                    gcsa_index.locate(i, ids);
                    for (auto id : ids) {
                        cout << gcsa::Node::decode(id) << endl;
                    }
                }
            } else {
                // for mems we need to load up the gcsa and lcp structures into the mapper
                Mapper mapper(&xindex, &gcsa_index, &lcp_index);
                mapper.fast_reseed = use_fast_reseed;
                // get the mems
                double lcp_max, fraction_filtered;
                auto mems = mapper.find_mems_deep(sequence.begin(), sequence.end(),
                                                  lcp_max, fraction_filtered,
                                                  max_mem_length, min_mem_length, mem_reseed_length);
                // dump them to stdout
                cout << mems_to_json(mems) << endl;
            }
        }
    }

    // -k (or kmers derived from -S): counts, position table, or subgraphs
    if (!kmers.empty()) {
        if (count_kmers) {
            for (auto& kmer : kmers) {
                cout << kmer << "\t" << vindex->approx_size_of_kmer_matches(kmer) << endl;
            }
        } else if (kmer_table) {
            for (auto& kmer : kmers) {
                map<string, vector<pair<int64_t, int32_t> > > positions;
                vindex->get_kmer_positions(kmer, positions);
                for (auto& k : positions) {
                    for (auto& p : k.second) {
                        cout << k.first << "\t" << p.first << "\t" << p.second << endl;
                    }
                }
            }
        } else {
            vector<VG> graphs;
            for (auto& kmer : kmers) {
                VG g;
                vindex->get_kmer_subgraph(kmer, g);
                if (context_size > 0) {
                    vindex->expand_context(g, context_size);
                }
                graphs.push_back(g);
            }
            VG result_graph;
            for (auto& graph : graphs) {
                // Allow duplicate nodes and edges (from multiple kmers); silently collapse them.
                result_graph.extend(graph);
            }
            result_graph.remove_orphan_edges();
            result_graph.serialize_to_ostream(cout);
        }
    }

    if (vindex) delete vindex;

    return 0;
}
int main_xg(int argc, char** argv) {
    // Entry point for "vg xg": build an xg succinct index from a vg graph,
    // load an existing index, convert it back to vg, and/or run queries
    // (node sequences, edges, path regions, threads) against it.
    if (argc == 2) {
        help_xg(argv);
        return 1;
    }

    // Inputs/outputs selected on the command line.
    string vg_in;      // vg-format graph to index ("-" reads stdin)
    string vg_out;     // destination for a vg-format conversion of the xg
    string out_name;   // destination for the serialized xg index
    string in_name;    // existing xg index to load
    // Target node for the single-node queries below. Only read when one of
    // the query flags that also sets it is given.
    int64_t node_id;
    bool edges_from = false;
    bool edges_to = false;
    bool edges_of = false;
    bool edges_on_start = false;
    bool edges_on_end = false;
    bool node_sequence = false;
    string pos_for_char;    // position string for a single-character lookup
    string pos_for_substr;  // position string for a substring lookup
    int context_steps = 0;  // expansion steps for -c, used by -n and -p
    bool node_context = false;
    string target;          // path region query, e.g. name:start-end
    bool print_graph = false;
    bool text_output = false;
    bool validate_graph = false;
    bool extract_threads = false;
    bool store_threads = false;
    bool is_sorted_dag = false;
    string report_name;     // HTML space-usage report destination
    string b_array_name;    // destination file for the B array dump
    int c;
    optind = 2; // force optind past "xg" positional argument
    while (true) {
        static struct option long_options[] = {
            {"help", no_argument, 0, 'h'},
            {"vg", required_argument, 0, 'v'},
            {"out", required_argument, 0, 'o'},
            {"in", required_argument, 0, 'i'},
            {"extract-vg", required_argument, 0, 'X'},
            {"node", required_argument, 0, 'n'},
            {"char", required_argument, 0, 'P'},
            {"substr", required_argument, 0, 'F'},
            //{"range", required_argument, 0, 'r'},
            {"context", required_argument, 0, 'c'},
            {"edges-from", required_argument, 0, 'f'},
            {"edges-to", required_argument, 0, 't'},
            {"edges-of", required_argument, 0, 'O'},
            {"edges-on-start", required_argument, 0, 'S'},
            {"edges-on-end", required_argument, 0, 'E'},
            {"node-seq", required_argument, 0, 's'},
            {"path", required_argument, 0, 'p'},
            {"extract-threads", no_argument, 0, 'x'},
            {"store-threads", no_argument, 0, 'r'},
            {"is-sorted-dag", no_argument, 0, 'd'},
            {"report", required_argument, 0, 'R'},
            {"debug", no_argument, 0, 'D'},
            {"text-output", no_argument, 0, 'T'},
            {"validate", no_argument, 0, 'V'},
            {"dump-bs", required_argument, 0, 'b'},
            {0, 0, 0, 0}
        };

        int option_index = 0;
        c = getopt_long (argc, argv, "hv:o:i:X:f:t:s:c:n:p:DxrdTO:S:E:VR:P:F:b:",
                         long_options, &option_index);

        // Detect the end of the options.
        if (c == -1) break;

        switch (c) {
        case 'v': vg_in = optarg; break;
        case 'V': validate_graph = true; break;
        case 'o': out_name = optarg; break;
        case 'D': print_graph = true; break;
        case 'T': text_output = true; break;
        case 'x': extract_threads = true; break;
        case 'r': store_threads = true; break;
        case 'd': is_sorted_dag = true; break;
        case 'i': in_name = optarg; break;
        case 'X': vg_out = optarg; break;
        case 'n': node_id = parse<int64_t>(optarg); node_context = true; break;
        case 'c': context_steps = parse<int>(optarg); break;
        // Each of the single-node queries records the node and its own flag.
        case 'f': node_id = parse<int64_t>(optarg); edges_from = true; break;
        case 't': node_id = parse<int64_t>(optarg); edges_to = true; break;
        case 'O': node_id = parse<int64_t>(optarg); edges_of = true; break;
        case 'S': node_id = parse<int64_t>(optarg); edges_on_start = true; break;
        case 'E': node_id = parse<int64_t>(optarg); edges_on_end = true; break;
        case 's': node_id = parse<int64_t>(optarg); node_sequence = true; break;
        case 'p': target = optarg; break;
        case 'P': pos_for_char = optarg; break;
        case 'F': pos_for_substr = optarg; break;
        case 'R': report_name = optarg; break;
        case 'b': b_array_name = optarg; break;
        case 'h':
        case '?':
            help_xg(argv);
            exit(1);
            break;
        default:
            abort ();
        }
    }

    unique_ptr<XG> graph;
    //string file_name = argv[optind];
    // We must have either an index to load or a vg graph to build one from.
    if (in_name.empty()) assert(!vg_in.empty());
    if (vg_in == "-") {
        // Read VG from stdin
        graph = unique_ptr<XG>(new XG());
        graph->from_stream(std::cin, validate_graph, print_graph, store_threads, is_sorted_dag);
    } else if (vg_in.size()) {
        // Read VG from a file
        ifstream in;
        in.open(vg_in.c_str());
        graph = unique_ptr<XG>(new XG());
        graph->from_stream(in, validate_graph, print_graph, store_threads, is_sorted_dag);
    }

    if (in_name.size()) {
        get_input_file(in_name, [&](istream& in) {
            // Load from an XG file or - (stdin)
            graph = stream::VPKG::load_one<XG>(in);
        });
    }

    // Prepare structure tree for serialization
    unique_ptr<sdsl::structure_tree_node> structure;

    if (!report_name.empty()) {
        // We need to make a report, so we need the structure. Make a real tree
        // node. The unique_ptr handles deleting.
        structure = unique_ptr<sdsl::structure_tree_node>(new sdsl::structure_tree_node("name", "type"));
    }

    if(!vg_out.empty()) {
        if (graph.get() == nullptr) {
             cerr << "error [vg xg] no xg graph exists to convert; Try: vg xg -i graph.xg -X graph.vg" << endl;
             return 1;
        }

        VG converted;
        // Convert the xg graph to vg format
        convert_handle_graph(graph.get(), &converted);

        // TODO: The converter doesn't copy circular paths yet.
        // When it does, we can remove all this path copying code.

        // Make a raw Proto Graph to hold Path objects
        Graph path_graph;

        // Since paths are not copied, copy the paths.
        for (size_t rank = 1; rank <= graph->max_path_rank(); rank++) {
            // Extract each path into the path graph
            *path_graph.add_path() = graph->path(graph->path_name(rank));
        }

        // Merge in all the paths
        converted.extend(path_graph);

        if (vg_out == "-") {
            converted.serialize_to_ostream(std::cout);
        } else {
            converted.serialize_to_file(vg_out);
        }
    }

    if (!out_name.empty()) {
        // Open a destination file if it is a file we want to write to
        ofstream out_file;
        if (out_name != "-") {
            out_file.open(out_name);
        }
        // Work out where to save to
        ostream& out = (out_name == "-") ? std::cout : out_file;

        // Encapsulate output in VPKG
        stream::VPKG::with_save_stream(out, "XG", [&](ostream& tagged) {
            // Serialize to the file while recording space usage to the structure.
            graph->serialize(tagged, structure.get(), "xg");
        });

        out.flush();
    }

    if (!report_name.empty()) {
        // Save the report
        ofstream out;
        out.open(report_name.c_str());
        sdsl::write_structure_tree<HTML_FORMAT>(structure.get(), out, 0);
    }

    // queries
    if (node_sequence) {
        cout << node_id << ": " << graph->node_sequence(node_id) << endl;
    }
    if (!pos_for_char.empty()) {
        // extract the position from the string
        int64_t id;
        bool is_rev;
        size_t off;
        extract_pos(pos_for_char, id, is_rev, off);
        // then pick it up from the graph
        cout << graph->pos_char(id, is_rev, off) << endl;
    }
    if (!pos_for_substr.empty()) {
        int64_t id;
        bool is_rev;
        size_t off;
        size_t len;
        extract_pos_substr(pos_for_substr, id, is_rev, off, len);
        cout << graph->pos_substr(id, is_rev, off, len) << endl;
    }

    // Each edge query prints edges as "from±  -> to±" where '-' marks
    // from_start/to_end orientation.
    if (edges_from) {
        vector<Edge> edges = graph->edges_from(node_id);
        for (auto& edge : edges) {
            cout << edge.from() << (edge.from_start()?"-":"+") << " -> " << edge.to() << (edge.to_end()?"-":"+") << endl;
        }
    }
    if (edges_to) {
        vector<Edge> edges = graph->edges_to(node_id);
        for (auto& edge : edges) {
            cout << edge.from() << (edge.from_start()?"-":"+") << " -> " << edge.to() << (edge.to_end()?"-":"+") << endl;
        }
    }
    if (edges_of) {
        vector<Edge> edges = graph->edges_of(node_id);
        for (auto& edge : edges) {
            cout << edge.from() << (edge.from_start()?"-":"+") << " -> " << edge.to() << (edge.to_end()?"-":"+") << endl;
        }
    }
    if (edges_on_start) {
        vector<Edge> edges = graph->edges_on_start(node_id);
        for (auto& edge : edges) {
            cout << edge.from() << (edge.from_start()?"-":"+") << " -> " << edge.to() << (edge.to_end()?"-":"+") << endl;
        }
    }
    if (edges_on_end) {
        vector<Edge> edges = graph->edges_on_end(node_id);
        for (auto& edge : edges) {
            cout << edge.from() << (edge.from_start()?"-":"+") << " -> " << edge.to() << (edge.to_end()?"-":"+") << endl;
        }
    }

    if (node_context) {
        // Dump the neighborhood of the node, expanded by context_steps.
        Graph g;
        graph->neighborhood(node_id, context_steps, g);
        if (text_output) {
            to_text(cout, g);
        } else {
            vector<Graph> gb = { g };
            stream::write_buffered(cout, gb, 0);
        }
    }

    if (!target.empty()) {
        // Dump a region of a path, expanded by context_steps.
        string name;
        int64_t start, end;
        Graph g;
        parse_region(target, name, start, end);
        graph->get_path_range(name, start, end, g);
        graph->expand_context(g, context_steps);
        if (text_output) {
            to_text(cout, g);
        } else {
            vector<Graph> gb = { g };
            stream::write_buffered(cout, gb, 0);
        }
    }

    if (extract_threads) {
        // Pull out all stored threads in both orientations.
        list<XG::thread_t> threads;
        for (auto& p : graph->extract_threads(false)) {
            for (auto& t : p.second) {
                threads.push_back(t);
            }
        }
        for (auto& p : graph->extract_threads(true)) {
            for (auto& t : p.second) {
                threads.push_back(t);
            }
        }

        size_t thread_number = 0;
        for(XG::thread_t& thread : threads) {
            // Convert to a Path
            Path path;
            for(XG::ThreadMapping& m : thread) {
                // Convert all the mappings
                Mapping mapping;
                mapping.mutable_position()->set_node_id(m.node_id);
                mapping.mutable_position()->set_is_reverse(m.is_reverse);

                *(path.add_mapping()) = mapping;
            }

            // Give each thread a name
            path.set_name("_thread_" + to_string(thread_number++));

            // We need a Graph for serialization purposes. We do one chunk per
            // thread in case the threads are long.
            Graph g;

            *(g.add_path()) = path;

            // Dump the graph with its mappings. TODO: can we restrict these to
            // mappings to nodes we have already pulled out? Or pull out the
            // whole compressed graph?
            if (text_output) {
                to_text(cout, g);
            } else {
                vector<Graph> gb = { g };
                stream::write_buffered(cout, gb, 0);
            }
        }
    }

    if (!b_array_name.empty()) {
        // Dump B array
        ofstream out;
        out.open(b_array_name.c_str());
        graph->bs_dump(out);
    }

    return 0;
}
PathIndex::PathIndex(const list<Mapping>& mappings, VG& vg) {
    // Trace the given path in the given VG graph, collecting sequence
    // and building the by_id / by_start / node_occurrences / mapping_positions
    // indexes as we go.

    // We're going to build the sequence string
    std::stringstream seq_stream;

    // What base are we at in the path?
    size_t path_base = 0;

    // What was the last rank? Ranks must always go up.
    int64_t last_rank = -1;

    for (auto& mapping : mappings) {

        if (!by_id.count(mapping.position().node_id())) {
            // This is the first time we have visited this node in the path.

            // Add in a mapping.
            by_id[mapping.position().node_id()] =
                std::make_pair(path_base, mapping.position().is_reverse());
#ifdef debug
            #pragma omp critical (cerr)
            std::cerr << "Node " << mapping.position().node_id() << " rank " << mapping.rank()
                << " starts at base " << path_base << " with "
                << vg.get_node(mapping.position().node_id())->sequence() << std::endl;
#endif

            // Make sure ranks are monotonically increasing along the path, or
            // unset. NOTE(review): this check only runs the first time a node
            // is seen; repeat visits to a node skip it — confirm that is
            // intentional.
            assert(mapping.rank() > last_rank || (mapping.rank() == 0 && last_rank == 0));
            last_rank = mapping.rank();
        }

        // Say that this node appears here along the reference in this
        // orientation.
        by_start[path_base] = NodeSide(mapping.position().node_id(), mapping.position().is_reverse());

        // Remember that occurrence by node ID.
        node_occurrences[mapping.position().node_id()].push_back(by_start.find(path_base));

        // Say this Mapping happens at this base along the path.
        // NOTE(review): this stores a pointer into the caller's list; it is
        // only valid while the caller keeps `mappings` alive and unmodified —
        // confirm callers guarantee that lifetime.
        mapping_positions[&mapping] = path_base;

        // Find the node's sequence
        std::string node_sequence = vg.get_node(mapping.position().node_id())->sequence();

        while(path_base == 0 && node_sequence.size() > 0 &&
            (node_sequence[0] != 'A' && node_sequence[0] != 'T' && node_sequence[0] != 'C' &&
            node_sequence[0] != 'G' && node_sequence[0] != 'N')) {

            // If the path leads with invalid characters (like "X"), throw them
            // out when computing path positions.

            // TODO: this is a hack to deal with the debruijn-brca1-k63 graph,
            // which leads with an X.
            #pragma omp critical (cerr)
            std::cerr << "Warning: dropping invalid leading character "
                << node_sequence[0] << " from node " << mapping.position().node_id()
                << std::endl;

            node_sequence.erase(node_sequence.begin());
        }

        if (mapping.position().is_reverse()) {
            // Put the reverse sequence in the path
            seq_stream << reverse_complement(node_sequence);
        } else {
            // Put the forward sequence in the path
            seq_stream << node_sequence;
        }

        // Whether we found the right place for this node in the reference or
        // not, we still need to advance along the reference path. We assume the
        // whole node (except any leading bogus characters) is included in the
        // path (since it sort of has to be, syntactically, unless it's the
        // first or last node).
        path_base += node_sequence.size();

        // TODO: handle leading bogus characters in calls on the first node.
    }

    // Record the length of the last mapping's node, since there's no next mapping to work it out from
    last_node_length = mappings.empty() ? 0 : vg.get_node(mappings.back().position().node_id())->sequence().size();

    // Create the actual reference sequence we will use
    sequence = seq_stream.str();

#ifdef debug
    // Announce progress.
    #pragma omp critical (cerr)
    std::cerr << "Traced " << path_base << " bp path." << std::endl;

    if (sequence.size() < 100) {
        #pragma omp critical (cerr)
        std::cerr << "Sequence: " << sequence << std::endl;
    }
#endif

    // Follow the path (again) and place all its Mappings
}
// Unit test: two Points with different coordinates compare unequal via
// operator!=. NOTE(review): the argument order (test name first, then "Point")
// suggests a UnitTest++-style TEST macro; googletest would expect
// TEST(Suite, Name) — confirm against the framework in use.
TEST(inequality, Point) { CHECK(Point(1, 2) != Point(3, 4)); }
// Unit test: two Points built from the same coordinates compare equal
// (CHECK_EQUAL exercises Point's equality comparison).
TEST(equality, Point) { CHECK_EQUAL(Point(1, 2), Point(1, 2)); }
int main_validate(int argc, char** argv) { if (argc <= 2) { help_validate(argv); return 1; } bool check_nodes = false; bool check_edges = false; bool check_orphans = false; bool check_paths = false; int c; optind = 2; // force optind past command positional argument while (true) { static struct option long_options[] = { {"help", no_argument, 0, 'h'}, {"nodes", no_argument, 0, 'n'}, {"edges", no_argument, 0, 'e'}, {"paths", no_argument, 0, 'o'}, {"orphans", no_argument, 0, 'p'}, {0, 0, 0, 0} }; int option_index = 0; c = getopt_long (argc, argv, "hneop", long_options, &option_index); // Detect the end of the options. if (c == -1) break; switch (c) { case 'n': check_nodes = true; break; case 'e': check_edges = true; break; case 'o': check_orphans = true; break; case 'p': check_paths = true; break; case 'h': case '?': help_validate(argv); exit(1); break; default: abort (); } } VG* graph; get_input_file(optind, argc, argv, [&](istream& in) { graph = new VG(in); }); // if we chose a specific subset, do just them if (check_nodes || check_edges || check_orphans || check_paths) { if (graph->is_valid(check_nodes, check_edges, check_orphans, check_paths)) { return 0; } else { return 1; } // otherwise do everything } else if (graph->is_valid()) { return 0; } else { return 1; } }
int main_mod(int argc, char** argv) { if (argc == 2) { help_mod(argv); return 1; } string path_name; bool remove_orphans = false; string aln_file; string loci_file; bool called_genotypes_only = false; bool label_paths = false; bool compact_ids = false; bool prune_complex = false; int path_length = 0; int edge_max = 0; int chop_to = 0; bool add_start_and_end_markers = false; bool prune_subgraphs = false; bool kill_labels = false; bool simplify_graph = false; bool unchop = false; bool normalize_graph = false; bool remove_non_path = false; bool remove_path = false; bool compact_ranks = false; bool drop_paths = false; set<string> paths_to_retain; bool retain_complement = false; vector<int64_t> root_nodes; int32_t context_steps; bool remove_null; bool strong_connect = false; uint32_t unfold_to = 0; bool break_cycles = false; uint32_t dagify_steps = 0; uint32_t dagify_to = 0; uint32_t dagify_component_length_max = 0; bool orient_forward = false; int64_t destroy_node_id = 0; bool bluntify = false; int until_normal_iter = 0; string translation_file; bool flip_doubly_reversed_edges = false; bool cactus = false; string vcf_filename; string loci_filename; int max_degree = 0; int c; optind = 2; // force optind past command positional argument while (true) { static struct option long_options[] = { {"help", no_argument, 0, 'h'}, {"include-aln", required_argument, 0, 'i'}, {"include-loci", required_argument, 0, 'q'}, {"include-gt", required_argument, 0, 'Q'}, {"compact-ids", no_argument, 0, 'c'}, {"compact-ranks", no_argument, 0, 'C'}, {"drop-paths", no_argument, 0, 'D'}, {"keep-path", required_argument, 0, 'k'}, {"remove-orphans", no_argument, 0, 'o'}, {"prune-complex", no_argument, 0, 'p'}, {"prune-subgraphs", no_argument, 0, 'S'}, {"length", required_argument, 0, 'l'}, {"edge-max", required_argument, 0, 'e'}, {"chop", required_argument, 0, 'X'}, {"kill-labels", no_argument, 0, 'K'}, {"markers", no_argument, 0, 'm'}, {"threads", no_argument, 0, 't'}, {"label-paths", 
no_argument, 0, 'P'}, {"simplify", no_argument, 0, 's'}, {"unchop", no_argument, 0, 'u'}, {"normalize", no_argument, 0, 'n'}, {"until-normal", required_argument, 0, 'U'}, {"remove-non-path", no_argument, 0, 'N'}, {"remove-path", no_argument, 0, 'A'}, {"orient-forward", no_argument, 0, 'O'}, {"unfold", required_argument, 0, 'f'}, {"retain-path", required_argument, 0, 'r'}, {"retain-complement", no_argument, 0, 'I'}, {"subgraph", required_argument, 0, 'g'}, {"context", required_argument, 0, 'x'}, {"remove-null", no_argument, 0, 'R'}, {"strong-connect", no_argument, 0, 'T'}, {"dagify-steps", required_argument, 0, 'd'}, {"dagify-to", required_argument, 0, 'w'}, {"dagify-len-max", required_argument, 0, 'L'}, {"bluntify", no_argument, 0, 'B'}, {"break-cycles", no_argument, 0, 'b'}, {"orient-forward", no_argument, 0, 'O'}, {"destroy-node", required_argument, 0, 'y'}, {"translation", required_argument, 0, 'Z'}, {"unreverse-edges", required_argument, 0, 'E'}, {"cactus", no_argument, 0, 'a'}, {"sample-vcf", required_argument, 0, 'v'}, {"sample-graph", required_argument, 0, 'G'}, {"max-degree", required_argument, 0, 'M'}, {0, 0, 0, 0} }; int option_index = 0; c = getopt_long (argc, argv, "hk:oi:q:Q:cpl:e:mt:SX:KPsunzNAf:CDr:Ig:x:RTU:Bbd:Ow:L:y:Z:Eav:G:M:", long_options, &option_index); // Detect the end of the options. if (c == -1) break; switch (c) { case 'i': cerr << "[vg mod] warning: vg mod -i is deprecated and will soon be removed. please switch to vg augment" << endl; aln_file = optarg; break; case 'q': cerr << "[vg mod] warning: vg mod -q is deprecated and will soon be removed. please switch to vg augment -l" << endl; loci_file = optarg; break; case 'Q': cerr << "[vg mod] warning: vg mod -l is deprecated and will soon be removed. please switch to vg augment -L" << endl; loci_file = optarg; called_genotypes_only = true; break; case 'Z': cerr << "[vg mod] warning: vg mod -Z is deprecated and will soon be removed. 
please switch to vg augment -Z" << endl; translation_file = optarg; break; case 'c': compact_ids = true; break; case 'C': compact_ranks = true; break; case 'k': path_name = optarg; break; case 'r': paths_to_retain.insert(optarg); break; case 'I': retain_complement = true; break; case 'o': remove_orphans = true; break; case 'p': prune_complex = true; break; case 'S': prune_subgraphs = true; break; case 'l': path_length = parse<int>(optarg); break; case 'X': chop_to = parse<int>(optarg); break; case 'u': unchop = true; break; case 'E': flip_doubly_reversed_edges = true; break; case 'K': kill_labels = true; break; case 'e': edge_max = parse<int>(optarg); break; case 'm': add_start_and_end_markers = true; break; case 't': omp_set_num_threads(parse<int>(optarg)); break; case 'f': unfold_to = parse<int>(optarg); break; case 'O': orient_forward = true; break; case 'P': cerr << "[vg mod] warning: vg mod -P is deprecated and will soon be removed. please switch to vg augment -B" << endl; label_paths = true; break; case 'D': drop_paths = true; break; case 's': simplify_graph = true; break; case 'n': normalize_graph = true; break; case 'N': remove_non_path = true; break; case 'A': remove_path = true; break; case 'T': strong_connect = true; break; case 'U': until_normal_iter = parse<int>(optarg); break; case 'd': dagify_steps = parse<int>(optarg); break; case 'w': dagify_to = parse<int>(optarg); break; case 'L': dagify_component_length_max = parse<int>(optarg); break; case 'B': bluntify = true; break; case 'b': break_cycles = true; break; case 'g': root_nodes.push_back(parse<int>(optarg)); break; case 'x': context_steps = parse<int>(optarg); break; case 'R': remove_null = true; break; case 'y': destroy_node_id = parse<int>(optarg); break; case 'a': cactus = true; break; case 'v': vcf_filename = optarg; break; case 'G': loci_filename = optarg; break; case 'M': max_degree = parse<int>(optarg); break; case 'h': case '?': help_mod(argv); exit(1); break; default: abort (); } } VG* 
graph; get_input_file(optind, argc, argv, [&](istream& in) { graph = new VG(in); }); if (retain_complement) { // Compute the actual paths to retain set<string> complement; graph->paths.for_each_name([&](const string& name) { if (!paths_to_retain.count(name)) { // Complement the set the user specified by putting in all the // paths they didn't mention. complement.insert(name); } }); // Retain the complement of what we were asking for. paths_to_retain = complement; } if (!vcf_filename.empty()) { // We need to throw out the parts of the graph that are on alt paths, // but not on alt paths for alts used by the first sample in the VCF. // This is called with the entire path name string to detect alt // paths. const function<bool(const string&)>& is_alt = Paths::is_alt; // This holds the VCF file we read the variants from. It needs to be the // same one used to construct the graph. vcflib::VariantCallFile variant_file; variant_file.open(vcf_filename); if (!variant_file.is_open()) { cerr << "error:[vg mod] could not open" << vcf_filename << endl; return 1; } // Now go through and prune down the varaints. // How many phases are there? size_t num_samples = variant_file.sampleNames.size(); // TODO: we can only handle single-sample VCFs assert(num_samples == 1); // This will hold the IDs of all nodes visited by alt paths that aren't used. set<vg::id_t> alt_path_ids; graph->paths.for_each_name([&](const string& alt_path_name) { // For every path name in the graph if(is_alt(alt_path_name)) { // If it's an alt path for(auto& mapping : graph->paths.get_path(alt_path_name)) { // Mark all nodes that are part of it as on alt paths alt_path_ids.insert(mapping.node_id()); } } }); // We also have a function to handle each variant as it comes in. auto handle_variant = [&](vcflib::Variant& variant) { // So we have a variant if(variant.alleles.size() < 2) { // Skip non-variable variants. return; } // Grab its id, or make one by hashing stuff if it doesn't // have an ID. 
string var_name = make_variant_id(variant); if(!graph->paths.has_path("_alt_" + var_name + "_0")) { // There isn't a reference alt path for this variant. Someone messed up. cerr << variant << endl; throw runtime_error("Reference alt for " + var_name + " not in graph!"); } // For now always work on sample 0. TODO: let the user specify a // name and find it. int sample_number = 0; // What sample is it? string& sample_name = variant_file.sampleNames[sample_number]; // Parse it out and see if it's phased. string genotype = variant.getGenotype(sample_name); // Tokenize into allele numbers // The token iterator can't hold the regex regex allele_separator("[|/]"); for (sregex_token_iterator it(genotype.begin(), genotype.end(), allele_separator, -1); it != sregex_token_iterator(); ++it) { // For every token separated by / or | int allele_number; if(it->str() == ".") { // Unknown; pretend it's ref for the purposes of making a // sample graph. allele_number = 0; } else { // Parse the allele number allele_number = stoi(it->str()); } // Make the name for its alt path string alt_path_name = "_alt_" + var_name + "_" + to_string(allele_number); for(auto& mapping : graph->paths.get_path(alt_path_name)) { // Un-mark all nodes that are on this alt path, since it is used by the sample. alt_path_ids.erase(mapping.node_id()); } } }; // Allocate a place to store actual variants vcflib::Variant var(variant_file); while (variant_file.is_open() && variant_file.getNextVariant(var)) { // this ... maybe we should remove it as for when we have calls against N bool isDNA = allATGC(var.ref); for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) { if (!allATGC(*a)) isDNA = false; } // only work with DNA sequences if (!isDNA) { continue; } // Handle the variant handle_variant(var); } for(auto& node_id : alt_path_ids) { // And delete all the nodes that were used by alt paths that weren't // in the genotype of the first sample. 
for(auto& path_name : graph->paths.of_node(node_id)) { // For every path that touches the node we're destroying, // destroy the path. We can't leave it because it won't be the // same path without this node. graph->paths.remove_path(path_name); #ifdef debug cerr << "Node " << node_id << " was on path " << path_name << endl; #endif } // Actually get rid of the node once its paths are gone. graph->destroy_node(node_id); } } if (!loci_filename.empty()) { // Open the file ifstream loci_file(loci_filename); assert(loci_file.is_open()); // What nodes and edges are called as present by the loci? set<Node*> called_nodes; set<Edge*> called_edges; function<void(Locus&)> lambda = [&](Locus& locus) { // For each locus if (locus.genotype_size() == 0) { // No call made here. Just remove all the nodes/edges. TODO: // should we keep them all if we don't know if they're there or // not? Or should the caller call ref with some low confidence? return; } const Genotype& gt = locus.genotype(0); for (size_t j = 0; j < gt.allele_size(); j++) { // For every allele called as present int allele_number = gt.allele(j); const Path& allele = locus.allele(allele_number); for (size_t i = 0; i < allele.mapping_size(); i++) { // For every Mapping in the allele const Mapping& m = allele.mapping(i); // Remember to keep this node called_nodes.insert(graph->get_node(m.position().node_id())); if (i + 1 < allele.mapping_size()) { // Look at the next mapping, which exists const Mapping& m2 = allele.mapping(i + 1); // Find the edge from the last Mapping's node to this one and mark it as used called_edges.insert(graph->get_edge(NodeSide(m.position().node_id(), !m.position().is_reverse()), NodeSide(m2.position().node_id(), m2.position().is_reverse()))); } } } }; vg::io::for_each(loci_file, lambda); // Collect all the unused nodes and edges (so we don't try to delete // while iterating...) 
set<Node*> unused_nodes; set<Edge*> unused_edges; graph->for_each_node([&](Node* n) { if (!called_nodes.count(n)) { unused_nodes.insert(n); } }); graph->for_each_edge([&](Edge* e) { if (!called_edges.count(e)) { unused_edges.insert(e); } }); // Destroy all the extra edges (in case they use extra nodes) for (auto* e : unused_edges) { graph->destroy_edge(e); } for (auto* n : unused_nodes) { graph->destroy_node(n); } } if (bluntify) { graph->bluntify(); } if (!path_name.empty()) { graph->keep_path(path_name); } if (!paths_to_retain.empty() || retain_complement) { graph->paths.keep_paths(paths_to_retain); } if (drop_paths) { graph->paths.clear(); } if (remove_orphans) { graph->remove_orphan_edges(); } if (unchop) { graph->unchop(); } if (simplify_graph) { graph->simplify_siblings(); } if (normalize_graph) { graph->normalize(); } if (until_normal_iter) { graph->normalize(until_normal_iter); } if (strong_connect) { graph->keep_multinode_strongly_connected_components(); } if (remove_non_path) { graph->remove_non_path(); } if (remove_path) { graph->remove_path(); } if (orient_forward) { algorithms::orient_nodes_forward(graph); } if (flip_doubly_reversed_edges) { graph->flip_doubly_reversed_edges(); } if (dagify_steps) { unordered_map<int64_t, pair<int64_t, bool> > node_translation; *graph = graph->dagify(dagify_steps, node_translation, 0, dagify_component_length_max); } if (dagify_to) { unordered_map<int64_t, pair<int64_t, bool> > node_translation; // use the walk as our maximum number of steps; it's the worst case *graph = graph->dagify(dagify_to, node_translation, dagify_to, dagify_component_length_max); } if (unfold_to) { unordered_map<int64_t, pair<int64_t, bool> > node_translation; *graph = graph->unfold(unfold_to, node_translation); } if (remove_null) { graph->remove_null_nodes_forwarding_edges(); } if (break_cycles) { graph->break_cycles(); } // to subset the graph if (!root_nodes.empty()) { VG g; for (auto root : root_nodes) { 
graph->nonoverlapping_node_context_without_paths(graph->get_node(root), g); graph->expand_context(g, max(context_steps, 1)); g.remove_orphan_edges(); } *graph = g; } if (!aln_file.empty()) { // read in the alignments and save their paths, concatenating them in order where they have the same name map<string, Path> paths_map; function<void(Alignment&)> lambda = [&graph, &paths_map](Alignment& aln) { Path path = simplify(aln.path()); path.set_name(aln.name()); auto f = paths_map.find(path.name()); if (f != paths_map.end()) { paths_map[path.name()] = concat_paths(f->second, path); } else { paths_map[path.name()] = path; } }; if (aln_file == "-") { vg::io::for_each(std::cin, lambda); } else { ifstream in; in.open(aln_file.c_str()); vg::io::for_each(in, lambda); } vector<Path> paths; for (auto& p : paths_map) { paths.push_back(p.second); } paths_map.clear(); if (!label_paths) { // execute the edits auto translation = graph->edit(paths, true); if (!translation_file.empty()) { ofstream out(translation_file); vg::io::write_buffered(out, translation, 0); out.close(); } } else { // just add the path labels to the graph graph->paths.extend(paths); } } if (!loci_file.empty()) { // read in the alignments and save their paths vector<Path> paths; function<void(Locus&)> lambda = [&graph, &paths, &called_genotypes_only](Locus& locus) { // if we are only doing called genotypes, record so we can filter alleles set<int> alleles_in_genotype; if (called_genotypes_only) { for (int i = 0; i < locus.genotype_size(); ++i) { for (int j = 0; j < locus.genotype(i).allele_size(); ++j) { alleles_in_genotype.insert(locus.genotype(i).allele(j)); } } } for (int i = 0; i < locus.allele_size(); ++i) { // skip alleles not in the genotype if using only called genotypes if (!alleles_in_genotype.empty()) { if (!alleles_in_genotype.count(i)) continue; } Path path = simplify(locus.allele(i)); stringstream name; name << locus.name() << ":" << i; path.set_name(name.str()); paths.push_back(path); } }; if 
(loci_file == "-") { vg::io::for_each(std::cin, lambda); } else { ifstream in; in.open(loci_file.c_str()); vg::io::for_each(in, lambda); } // execute the edits and produce the translation if requested. // Make sure to break at node ends, but don't add any paths because they're just loci alleles and not real paths. auto translation = graph->edit(paths, false, false, true); if (!translation_file.empty()) { ofstream out(translation_file); vg::io::write_buffered(out, translation, 0); out.close(); } } // and optionally compact ids if (compact_ids) { graph->sort(); graph->compact_ids(); } if (compact_ranks) { graph->paths.compact_ranks(); } if (prune_complex) { if (!(path_length > 0 && edge_max > 0)) { cerr << "[vg mod]: when pruning complex regions you must specify a --path-length and --edge-max" << endl; return 1; } graph->prune_complex_with_head_tail(path_length, edge_max); } if (max_degree) { algorithms::remove_high_degree_nodes(*graph, max_degree); } if (prune_subgraphs) { graph->prune_short_subgraphs(path_length); } if (chop_to) { graph->dice_nodes(chop_to); graph->paths.compact_ranks(); } if (kill_labels) { graph->for_each_node([](Node* n) { n->clear_sequence(); }); } if (add_start_and_end_markers) { if (!(path_length > 0)) { cerr << "[vg mod]: when adding start and end markers you must provide a --path-length" << endl; return 1; } // TODO: replace this with the SourceSinkOverlay somehow? Node* head_node = NULL; Node* tail_node = NULL; vg::id_t head_id = 0, tail_id = 0; graph->add_start_end_markers(path_length, '#', '$', head_node, tail_node, head_id, tail_id); } if (destroy_node_id > 0) { graph->destroy_node(destroy_node_id); } if (cactus) { // ensure we're sorted graph->sort(); *graph = cactusify(*graph); // no paths survive, make sure they are erased graph->paths = Paths(); } graph->serialize_to_ostream(std::cout); delete graph; return 0; }
// Construct a site finder over the given graph, optionally guided by a hint
// path name. Holds the graph by reference, so the caller must keep it alive,
// and note the constructor mutates it (sorts it) as a side effect.
CactusSiteFinder::CactusSiteFinder(VG& graph, const string& hint_path_name): graph(graph), hint_path_name(hint_path_name) {
    // Make sure the graph is sorted.
    // cactus needs the nodes to be sorted in order to find a source and sink.
    graph.sort();
}
// Render each object held in the container, visiting them in iteration order.
void renderer::draw_objects(VG& v) {
    for (auto& object : v) {
        draw(object);
    }
}
// Entry point for the "vg augment" subcommand: reads a graph plus a GAM (or
// loci file), augments the graph with the observed variation, and writes the
// augmented graph (plus optional translation / GAM / pileup / support output).
// Returns 0 on success, 1 on usage or I/O error.
int main_augment(int argc, char** argv) {

    // augmentation mode ("direct" edits the graph from alignments;
    // "pileup" goes through a Pileups intermediate)
    string augmentation_mode = "direct";

    // load pileups from here
    string pileup_file_name;

    // minimum support to consider adding a variant to the graph
    int min_aug_support = PileupAugmenter::Default_min_aug_support;

    // Should we expect a subgraph and ignore pileups for missing nodes/edges?
    bool expect_subgraph = false;

    // Write the translations (as protobuf) to this path
    string translation_file_name;

    // Include a path in the graph for each GAM
    bool include_paths = false;

    // Just label the paths with the GAM (no topology changes)
    bool label_paths = false;

    // Merge alleles from this loci file instead of GAM
    string loci_filename;

    // Merge only alleles from called genotypes in the loci file
    bool called_genotypes_only = false;

    // Write the supports (as protobuf) to this path
    string support_file_name;

    // Load in GAM alignments to map over to the augmented graph from here
    string gam_in_file_name;

    // Write the GAM alignments (from gam_in_file_name) projected on the
    // augmented graph here
    string gam_out_file_name;

    // Print some progress messages to screen
    bool show_progress = false;

    // Print verbose messages
    bool verbose = false;

    // Number of threads to use (will default to all if not specified)
    int thread_count = 0;

    // Bases with quality less than 10 will not be added to the pileup
    int min_quality = 10;

    // Bases with more than this many mismatches within the window_size not added
    int max_mismatches = 1;

    // Window size for above (0 effectively turns this check off)
    int window_size = 0;

    // Hack to prevent protobuf messages from getting too big by limiting
    // depth at any given position to max_depth
    int max_depth = 1000;

    // Combine MAPQ and PHRED base qualities to determine quality at each
    // position. If false, only PHRED base quality will be used.
    bool use_mapq = true;

    // Long-option table; each entry maps to the single-char code handled in
    // the parser callback below.
    static const struct option long_options[] = {
        // General Options
        {"augmentation-mode", required_argument, 0, 'a'},
        {"translation", required_argument, 0, 'Z'},
        {"alignment-out", required_argument, 0, 'A'},
        {"include-paths", no_argument, 0, 'i'},
        {"label-paths", no_argument, 0, 'B'},
        {"help", no_argument, 0, 'h'},
        {"progress", required_argument, 0, 'p'},
        {"verbose", no_argument, 0, 'v'},
        {"threads", required_argument, 0, 't'},
        // Loci Options
        {"include-loci", required_argument, 0, 'l'},
        {"include-gt", required_argument, 0, 'L'},
        // Pileup Options
        {"pileup", required_argument, 0, 'P'},
        {"support", required_argument, 0, 'S'},
        {"min-quality", required_argument, 0, 'q'},
        {"max-mismatches", required_argument, 0, 'm'},
        {"window-size", required_argument, 0, 'w'},
        {"ignore-mapq", no_argument, 0, 'M'},
        {"min-aug-support", required_argument, 0, 'g'},
        {"subgraph", no_argument, 0, 'U'},
        {0, 0, 0, 0}
    };
    static const char* short_options = "a:Z:A:iBhpvt:l:L:P:S:q:m:w:Mg:U";
    optind = 2; // force optind past command positional arguments

    // This is our command-line parser
    ConfigurableParser parser(short_options, long_options, [&](int c) {
        // Parse all the options we have defined here.
        switch (c) {
        // General Options
        case 'a':
            augmentation_mode = optarg;
            break;
        case 'Z':
            translation_file_name = optarg;
            break;
        case 'A':
            gam_out_file_name = optarg;
            break;
        case 'i':
            include_paths = true;
            break;
        case 'B':
            label_paths = true;
            break;
        case 'h':
        case '?':
            /* getopt_long already printed an error message. */
            help_augment(argv, parser);
            exit(1);
            break;
        case 'p':
            show_progress = true;
            break;
        case 'v':
            verbose = true;
            break;
        case 't':
            thread_count = parse<int>(optarg);
            break;
        // Loci Options
        case 'l':
            loci_filename = optarg;
            break;
        case 'L':
            // -L implies -l plus restricting to called genotypes
            loci_filename = optarg;
            called_genotypes_only = true;
            break;
        // Pileup Options
        case 'P':
            pileup_file_name = optarg;
            break;
        case 'S':
            support_file_name = optarg;
            break;
        case 'q':
            min_quality = parse<int>(optarg);
            break;
        case 'm':
            max_mismatches = parse<int>(optarg);
            break;
        case 'w':
            window_size = parse<int>(optarg);
            break;
        case 'M':
            use_mapq = false;
            break;
        case 'g':
            min_aug_support = parse<int>(optarg);
            break;
        case 'U':
            expect_subgraph = true;
            break;
        default:
            abort ();
        }
    });

    // Parse the command line options, updating optind.
    parser.parse(argc, argv);

    if (thread_count != 0) {
        // Use a non-default number of threads
        omp_set_num_threads(thread_count);
    }
    thread_count = get_thread_count();

    // Parse the two positional arguments (graph, then optional GAM)
    if (optind + 1 > argc) {
        cerr << "[vg augment] error: too few arguments" << endl;
        help_augment(argv, parser);
        return 1;
    }

    string graph_file_name = get_input_file_name(optind, argc, argv);
    if (optind < argc) {
        gam_in_file_name = get_input_file_name(optind, argc, argv);
    }

    // Sanity-check option combinations before doing any heavy work.
    if (gam_in_file_name.empty() && loci_filename.empty()) {
        cerr << "[vg augment] error: gam file argument required" << endl;
        return 1;
    }
    if (gam_in_file_name == "-" && graph_file_name == "-") {
        cerr << "[vg augment] error: graph and gam can't both be from stdin." << endl;
        return 1;
    }
    if (gam_in_file_name == "-" && !gam_out_file_name.empty()) {
        // -A needs to re-read the GAM, which is impossible from a stream.
        cerr << "[vg augment] error: cannot stream input gam when using -A option (as it requires 2 passes)" << endl;
        return 1;
    }
    if (augmentation_mode != "pileup" && augmentation_mode != "direct") {
        cerr << "[vg augment] error: pileup and direct are currently the only supported augmentation modes (-a)" << endl;
        return 1;
    }
    if (augmentation_mode != "direct" and !gam_out_file_name.empty()) {
        cerr << "[vg augment] error: GAM output only works with \"direct\" augmentation mode" << endl;
        return 1;
    }
    if (augmentation_mode != "pileup" and (!support_file_name.empty() || !pileup_file_name.empty())) {
        cerr << "[vg augment] error: Pileup (-P) and Support (-S) output only work with \"pileup\" augmentation mode" << endl;
        return 1;
    }
    if (label_paths && (!gam_out_file_name.empty() || !translation_file_name.empty())) {
        cerr << "[vg augment] error: Translation (-Z) and GAM (-A) output do not work with \"label-only\" (-B) mode" << endl;
        return 1;
    }

    // read the graph
    if (show_progress) {
        cerr << "Reading input graph" << endl;
    }
    // NOTE(review): graph is left uninitialized if the lambda is never
    // invoked — presumably get_input_file always calls it or aborts; confirm.
    VG* graph;
    get_input_file(graph_file_name, [&](istream& in) {
        graph = new VG(in);
    });

    Pileups* pileups = nullptr;
    if (!pileup_file_name.empty() || augmentation_mode == "pileup") {
        // We will need the computed pileups
        // compute the pileups from the graph and gam
        pileups = compute_pileups(graph, gam_in_file_name, thread_count, min_quality, max_mismatches,
                                  window_size, max_depth, use_mapq, show_progress);
    }
    if (!pileup_file_name.empty()) {
        // We want to write out pileups.
        if (show_progress) {
            cerr << "Writing pileups" << endl;
        }
        ofstream pileup_file(pileup_file_name);
        if (!pileup_file) {
            cerr << "[vg augment] error: unable to open output pileup file: " << pileup_file_name << endl;
            exit(1);
        }
        pileups->write(pileup_file);
    }

    if (augmentation_mode == "direct" && !gam_in_file_name.empty()) {
        // Augment with the reads
        if (!support_file_name.empty()) {
            cerr << "[vg augment] error: support calculation in direct augmentation mode is unimplemented" << endl;
            exit(1);
        }

        // We don't need any pileups
        if (pileups != nullptr) {
            delete pileups;
            pileups = nullptr;
        }

        // Load all the reads
        vector<Alignment> reads;
        // And pull out their paths
        vector<Path> read_paths;

        if (include_paths) {
            // verbatim from vg mod -i: merge same-named alignments into
            // single paths keyed by name.
            map<string, Path> paths_map;
            function<void(Alignment&)> lambda = [&](Alignment& aln) {
                Path path = simplify(aln.path());
                path.set_name(aln.name());
                auto f = paths_map.find(path.name());
                if (f != paths_map.end()) {
                    paths_map[path.name()] = concat_paths(f->second, path);
                } else {
                    paths_map[path.name()] = path;
                }
                if (!gam_out_file_name.empty()) {
                    reads.push_back(aln);
                }
            };
            if (gam_in_file_name == "-") {
                stream::for_each(std::cin, lambda);
            } else {
                ifstream in;
                in.open(gam_in_file_name.c_str());
                stream::for_each(in, lambda);
            }
            for (auto& p : paths_map) {
                read_paths.push_back(p.second);
            }
            paths_map.clear();
        } else {
            get_input_file(gam_in_file_name, [&](istream& alignment_stream) {
                stream::for_each<Alignment>(alignment_stream, [&](Alignment& alignment) {
                    // Trim the softclips off of every read
                    // Work out where to cut
                    int cut_start = softclip_start(alignment);
                    int cut_end = softclip_end(alignment);
                    // Cut the sequence and quality
                    alignment.set_sequence(alignment.sequence().substr(cut_start, alignment.sequence().size() - cut_start - cut_end));
                    if (alignment.quality().size() != 0) {
                        alignment.set_quality(alignment.quality().substr(cut_start, alignment.quality().size() - cut_start - cut_end));
                    }
                    // Trim the path
                    *alignment.mutable_path() = trim_hanging_ends(alignment.path());
                    // Save every read
                    if (!gam_out_file_name.empty()) {
                        reads.push_back(alignment);
                    }
                    // And the path for the read, separately
                    // TODO: Make edit use callbacks or something so it doesn't need a vector of paths necessarily
                    read_paths.push_back(alignment.path());
                });
            });
        }

        // Augment the graph, rewriting the paths.
        vector<Translation> translation;
        if (!label_paths) {
            translation = graph->edit(read_paths, include_paths, !gam_out_file_name.empty(), false);
        } else {
            // just add the path labels to the graph
            graph->paths.extend(read_paths);
        }

        // Write the augmented graph
        if (show_progress) {
            cerr << "Writing augmented graph" << endl;
        }
        graph->serialize_to_ostream(cout);

        if (!translation_file_name.empty()) {
            // Write the translations
            if (show_progress) {
                cerr << "Writing translation table" << endl;
            }
            ofstream translation_file(translation_file_name);
            if (!translation_file) {
                cerr << "[vg augment]: Error opening translation file: " << translation_file_name << endl;
                return 1;
            }
            stream::write_buffered(translation_file, translation, 0);
            translation_file.close();
        }

        // Only write a GAM if every read's corrected path survived (sizes
        // match), so reads and paths still line up index-for-index.
        if (!gam_out_file_name.empty() && reads.size() == read_paths.size()) {
            // Write out the modified GAM
            ofstream gam_out_file(gam_out_file_name);
            if (!gam_out_file) {
                cerr << "[vg augment]: Error opening output GAM file: " << gam_out_file_name << endl;
                return 1;
            }
            // We use this buffer and do a buffered write
            vector<Alignment> gam_buffer;
            for (size_t i = 0; i < reads.size(); i++) {
                // Say we are going to write out the alignment
                gam_buffer.push_back(reads[i]);
                // Set its path to the corrected embedded path
                *gam_buffer.back().mutable_path() = read_paths[i];
                // Write it back out
                stream::write_buffered(gam_out_file, gam_buffer, 100);
            }
            // Flush the buffer
            stream::write_buffered(gam_out_file, gam_buffer, 0);
        }

    } else if (augmentation_mode == "pileup") {
        // We want to augment with pileups
        // The PileupAugmenter object will take care of all augmentation
        PileupAugmenter augmenter(graph, PileupAugmenter::Default_default_quality, min_aug_support);

        // compute the augmented graph from the pileup
        // Note: we can save a fair bit of memory by clearing pileups, and re-reading off of
        // pileup_file_name
        augment_with_pileups(augmenter, *pileups, expect_subgraph, show_progress);
        delete pileups;
        pileups = nullptr;

        // write the augmented graph
        if (show_progress) {
            cerr << "Writing augmented graph" << endl;
        }
        augmenter.write_augmented_graph(cout, false);

        // write the augmented gam (re-reads the input GAM and maps each
        // alignment's path onto the augmented graph)
        if (!gam_out_file_name.empty()) {
            ofstream gam_out_file(gam_out_file_name);
            if (!gam_out_file) {
                cerr << "[vg augment]: Error opening output GAM file: " << gam_out_file_name << endl;
                return 1;
            }
            get_input_file(gam_in_file_name, [&](istream& alignment_stream) {
                vector<Alignment> gam_buffer;
                function<void(Alignment&)> lambda = [&gam_out_file, &gam_buffer, &augmenter](Alignment& alignment) {
                    // Replace the alignment's path with its projection onto
                    // the augmented graph.
                    list<mapping_t> aug_path;
                    augmenter.map_path(alignment.path(), aug_path, true);
                    alignment.mutable_path()->clear_mapping();
                    for (auto& aug_mapping : aug_path) {
                        *alignment.mutable_path()->add_mapping() = aug_mapping.to_mapping();
                    }
                    gam_buffer.push_back(alignment);
                    stream::write_buffered(gam_out_file, gam_buffer, 100);
                };
                stream::for_each(alignment_stream, lambda);
                // Flush anything still buffered.
                stream::write_buffered(gam_out_file, gam_buffer, 0);
            });
        }

        // write the translation
        if (!translation_file_name.empty()) {
            // write the translations
            if (show_progress) {
                cerr << "Writing translation table" << endl;
            }
            ofstream translation_file(translation_file_name);
            if (!translation_file) {
                cerr << "[vg augment] error: error opening translation file: " << translation_file_name << endl;
                return 1;
            }
            augmenter._augmented_graph.write_translations(translation_file);
            translation_file.close();
        }

        // write the supports
        if (!support_file_name.empty()) {
            // write the supports
            if (show_progress) {
                cerr << "Writing supports" << endl;
            }
            ofstream support_file(support_file_name);
            if (!support_file) {
                cerr << "[vg augment] error: error opening supports file: " << support_file_name << endl;
                return 1;
            }
            augmenter._augmented_graph.write_supports(support_file);
            support_file.close();
        }

    } else if (!loci_filename.empty()) {
        // Open the file
        ifstream loci_file(loci_filename);
        assert(loci_file.is_open());

        // What nodes and edges are called as present by the loci?
        set<Node*> called_nodes;
        set<Edge*> called_edges;

        function<void(Locus&)> lambda = [&](Locus& locus) {
            // For each locus
            if (locus.genotype_size() == 0) {
                // No call made here. Just remove all the nodes/edges. TODO:
                // should we keep them all if we don't know if they're there or
                // not? Or should the caller call ref with some low confidence?
                return;
            }
            const Genotype& gt = locus.genotype(0);
            for (size_t j = 0; j < gt.allele_size(); j++) {
                // For every allele called as present
                int allele_number = gt.allele(j);
                const Path& allele = locus.allele(allele_number);
                for (size_t i = 0; i < allele.mapping_size(); i++) {
                    // For every Mapping in the allele
                    const Mapping& m = allele.mapping(i);
                    // Remember to keep this node
                    called_nodes.insert(graph->get_node(m.position().node_id()));
                    if (i + 1 < allele.mapping_size()) {
                        // Look at the next mapping, which exists
                        const Mapping& m2 = allele.mapping(i + 1);
                        // Find the edge from the last Mapping's node to this one and mark it as used
                        called_edges.insert(graph->get_edge(NodeSide(m.position().node_id(), !m.position().is_reverse()),
                                                            NodeSide(m2.position().node_id(), m2.position().is_reverse())));
                    }
                }
            }
        };
        stream::for_each(loci_file, lambda);

        // Collect all the unused nodes and edges (so we don't try to delete
        // while iterating...)
        set<Node*> unused_nodes;
        set<Edge*> unused_edges;

        graph->for_each_node([&](Node* n) {
            if (!called_nodes.count(n)) {
                unused_nodes.insert(n);
            }
        });
        graph->for_each_edge([&](Edge* e) {
            if (!called_edges.count(e)) {
                unused_edges.insert(e);
            }
        });

        // Destroy all the extra edges (in case they use extra nodes)
        for (auto* e : unused_edges) {
            graph->destroy_edge(e);
        }
        for (auto* n : unused_nodes) {
            graph->destroy_node(n);
        }
    }

    // Final cleanup of heap-allocated state.
    if (pileups != nullptr) {
        delete pileups;
        pileups = nullptr;
    }
    delete graph;

    return 0;
}