void Caller::create_snp_path(int64_t snp_node, bool secondary_snp) { // for now we don't write secdonary snp, so we have 1 path per *site* // and counting paths will give us somethign comparable to snp count // from bcftools if (!secondary_snp) { stringstream name; name << "SNP_" << snp_node; Mapping mapping; Position* pos = mapping.mutable_position(); // make path that covers node forward with no edits. not super // useful but will use to count snps... pos->set_node_id(snp_node); pos->set_offset(0); mapping.mutable_position()->set_is_reverse(false); // note: create_path doesn't seem to work.. too rushed to look into //list<Mapping>& mappings = _call_graph.paths.create_path(name.str()); list<Mapping> mappings; mappings.push_back(mapping); _call_graph.paths._paths.insert(make_pair(name.str(), mappings)); } }
// generates a perfect alignment from the graph Alignment Sampler::alignment(size_t length) { string seq; Alignment aln; Path* path = aln.mutable_path(); pos_t pos = position(); char c = pos_char(pos); // we do something wildly inefficient but conceptually clean // for each position in the mapping we add a mapping // at the end we will simplify the alignment, merging redundant mappings do { // add in the char for the current position seq += c; Mapping* mapping = path->add_mapping(); *mapping->mutable_position() = make_position(pos); Edit* edit = mapping->add_edit(); edit->set_from_length(1); edit->set_to_length(1); // decide the next position auto nextc = next_pos_chars(pos); // no new positions mean we are done; we've reached the end of the graph if (nextc.empty()) break; // what positions do we go to next? vector<pos_t> nextp; for (auto& n : nextc) nextp.push_back(n.first); // pick one at random uniform_int_distribution<int> next_dist(0, nextc.size()-1); // update our position pos = nextp.at(next_dist(rng)); // update our char c = nextc[pos]; } while (seq.size() < length); // save our sequence in the alignment aln.set_sequence(seq); aln = simplify(aln); { // name the alignment string data; aln.SerializeToString(&data); int n; #pragma omp critical(nonce) n = nonce++; data += std::to_string(n); const string hash = sha1head(data, 16); aln.set_name(hash); } // and simplify it aln.set_identity(identity(aln.path())); return aln; }
int main_find(int argc, char** argv) { if (argc == 2) { help_find(argv); return 1; } string db_name; string sequence; int kmer_size=0; int kmer_stride = 1; vector<string> kmers; vector<vg::id_t> node_ids; string node_list_file; int context_size=0; bool use_length = false; bool count_kmers = false; bool kmer_table = false; vector<string> targets; string path_name; bool position_in = false; bool rank_in = false; string range; string gcsa_in; string xg_name; bool get_mems = false; int mem_reseed_length = 0; bool use_fast_reseed = true; bool get_alignments = false; bool get_mappings = false; string node_id_range; string aln_on_id_range; vg::id_t start_id = 0; vg::id_t end_id = 0; bool pairwise_distance = false; string haplotype_alignments; string gam_file; int max_mem_length = 0; int min_mem_length = 1; string to_graph_file; bool extract_threads = false; vector<string> extract_patterns; vg::id_t approx_id = 0; int c; optind = 2; // force optind past command positional argument while (true) { static struct option long_options[] = { //{"verbose", no_argument, &verbose_flag, 1}, {"db-name", required_argument, 0, 'd'}, {"xg-name", required_argument, 0, 'x'}, {"gcsa", required_argument, 0, 'g'}, {"node", required_argument, 0, 'n'}, {"node-list", required_argument, 0, 'N'}, {"edges-end", required_argument, 0, 'e'}, {"edges-start", required_argument, 0, 's'}, {"kmer", required_argument, 0, 'k'}, {"table", no_argument, 0, 'T'}, {"sequence", required_argument, 0, 'S'}, {"mems", required_argument, 0, 'M'}, {"reseed-length", required_argument, 0, 'B'}, {"fast-reseed", no_argument, 0, 'f'}, {"kmer-stride", required_argument, 0, 'j'}, {"kmer-size", required_argument, 0, 'z'}, {"context", required_argument, 0, 'c'}, {"use-length", no_argument, 0, 'L'}, {"kmer-count", no_argument, 0, 'C'}, {"path", required_argument, 0, 'p'}, {"position-in", required_argument, 0, 'P'}, {"rank-in", required_argument, 0, 'R'}, {"node-range", required_argument, 0, 'r'}, {"alignments", no_argument, 0, 
'a'}, {"mappings", no_argument, 0, 'm'}, {"alns-in", required_argument, 0, 'i'}, {"alns-on", required_argument, 0, 'o'}, {"distance", no_argument, 0, 'D'}, {"haplotypes", required_argument, 0, 'H'}, {"gam", required_argument, 0, 'G'}, {"to-graph", required_argument, 0, 'A'}, {"max-mem", required_argument, 0, 'Y'}, {"min-mem", required_argument, 0, 'Z'}, {"extract-threads", no_argument, 0, 't'}, {"threads-named", required_argument, 0, 'q'}, {"approx-pos", required_argument, 0, 'X'}, {0, 0, 0, 0} }; int option_index = 0; c = getopt_long (argc, argv, "d:x:n:e:s:o:k:hc:LS:z:j:CTp:P:r:amg:M:R:B:fi:DH:G:N:A:Y:Z:tq:X:", long_options, &option_index); // Detect the end of the options. if (c == -1) break; switch (c) { case 'd': db_name = optarg; break; case 'x': xg_name = optarg; break; case 'g': gcsa_in = optarg; break; case 'k': kmers.push_back(optarg); break; case 'S': sequence = optarg; break; case 'M': sequence = optarg; get_mems = true; break; case 'B': mem_reseed_length = atoi(optarg); break; case 'f': use_fast_reseed = true; break; case 'Y': max_mem_length = atoi(optarg); break; case 'Z': min_mem_length = atoi(optarg); break; case 'j': kmer_stride = atoi(optarg); break; case 'z': kmer_size = atoi(optarg); break; case 'C': count_kmers = true; break; case 'p': targets.push_back(optarg); break; case 'P': path_name = optarg; position_in = true; break; case 'R': path_name = optarg; rank_in = true; break; case 'c': context_size = atoi(optarg); break; case 'L': use_length = true; break; case 'n': node_ids.push_back(atoi(optarg)); break; case 'N': node_list_file = optarg; break; case 'e': end_id = atoi(optarg); break; case 's': start_id = atoi(optarg); break; case 'T': kmer_table = true; break; case 'r': range = optarg; break; case 'a': get_alignments = true; break; case 'i': node_id_range = optarg; break; case 'm': get_mappings = true; break; case 'o': aln_on_id_range = optarg; break; case 'D': pairwise_distance = true; break; case 'H': haplotype_alignments = optarg; break; 
case 't': extract_threads = true; break; case 'q': extract_threads = true; extract_patterns.push_back(optarg); break; case 'X': approx_id = atoi(optarg); break; case 'G': gam_file = optarg; break; case 'A': to_graph_file = optarg; break; case 'h': case '?': help_find(argv); exit(1); break; default: abort (); } } if (optind < argc) { cerr << "[vg find] find does not accept positional arguments" << endl; return 1; } if (db_name.empty() && gcsa_in.empty() && xg_name.empty()) { cerr << "[vg find] find requires -d, -g, or -x to know where to find its database" << endl; return 1; } if (context_size > 0 && use_length == true && xg_name.empty()) { cerr << "[vg find] error, -L not supported without -x" << endl; exit(1); } if (xg_name.empty() && mem_reseed_length) { cerr << "error:[vg find] SMEM reseeding requires an XG index. Provide XG index with -x." << endl; exit(1); } // process input node list if (!node_list_file.empty()) { ifstream nli; nli.open(node_list_file); if (!nli.good()){ cerr << "[vg find] error, unable to open the node list input file." 
<< endl; exit(1); } string line; while (getline(nli, line)){ for (auto& idstr : split_delims(line, " \t")) { node_ids.push_back(atol(idstr.c_str())); } } nli.close(); } // open index Index* vindex = nullptr; if (db_name.empty()) { assert(!gcsa_in.empty() || !xg_name.empty()); } else { vindex = new Index; vindex->open_read_only(db_name); } xg::XG xindex; if (!xg_name.empty()) { ifstream in(xg_name.c_str()); xindex.load(in); } if (get_alignments) { assert(!db_name.empty()); vector<Alignment> output_buf; auto lambda = [&output_buf](const Alignment& aln) { output_buf.push_back(aln); stream::write_buffered(cout, output_buf, 100); }; vindex->for_each_alignment(lambda); stream::write_buffered(cout, output_buf, 0); } if (!node_id_range.empty()) { assert(!db_name.empty()); vector<string> parts = split_delims(node_id_range, ":"); if (parts.size() == 1) { convert(parts.front(), start_id); end_id = start_id; } else { convert(parts.front(), start_id); convert(parts.back(), end_id); } vector<Alignment> output_buf; auto lambda = [&output_buf](const Alignment& aln) { output_buf.push_back(aln); stream::write_buffered(cout, output_buf, 100); }; vindex->for_alignment_in_range(start_id, end_id, lambda); stream::write_buffered(cout, output_buf, 0); } if (!aln_on_id_range.empty()) { assert(!db_name.empty()); vector<string> parts = split_delims(aln_on_id_range, ":"); if (parts.size() == 1) { convert(parts.front(), start_id); end_id = start_id; } else { convert(parts.front(), start_id); convert(parts.back(), end_id); } vector<vg::id_t> ids; for (auto i = start_id; i <= end_id; ++i) { ids.push_back(i); } vector<Alignment> output_buf; auto lambda = [&output_buf](const Alignment& aln) { output_buf.push_back(aln); stream::write_buffered(cout, output_buf, 100); }; vindex->for_alignment_to_nodes(ids, lambda); stream::write_buffered(cout, output_buf, 0); } if (!to_graph_file.empty()) { assert(vindex != nullptr); ifstream tgi(to_graph_file); VG graph(tgi); vector<vg::id_t> ids; 
graph.for_each_node([&](Node* n) { ids.push_back(n->id()); }); vector<Alignment> output_buf; auto lambda = [&output_buf](const Alignment& aln) { output_buf.push_back(aln); stream::write_buffered(cout, output_buf, 100); }; vindex->for_alignment_to_nodes(ids, lambda); stream::write_buffered(cout, output_buf, 0); } if (!xg_name.empty()) { if (!node_ids.empty() && path_name.empty() && !pairwise_distance) { // get the context of the node vector<Graph> graphs; set<vg::id_t> ids; for (auto node_id : node_ids) ids.insert(node_id); for (auto node_id : node_ids) { Graph g; xindex.neighborhood(node_id, context_size, g, !use_length); if (context_size == 0) { for (auto& edge : xindex.edges_of(node_id)) { // if both ends of the edge are in our targets, keep them if (ids.count(edge.to()) && ids.count(edge.from())) { *g.add_edge() = edge; } } } graphs.push_back(g); } VG result_graph; for (auto& graph : graphs) { // Allow duplicate nodes and edges (from e.g. multiple -n options); silently collapse them. result_graph.extend(graph); } result_graph.remove_orphan_edges(); // Order the mappings by rank. TODO: how do we handle breaks between // different sections of a path with a single name? result_graph.paths.sort_by_mapping_rank(); // return it result_graph.serialize_to_ostream(cout); } else if (end_id != 0) { for (auto& e : xindex.edges_on_end(end_id)) { cout << (e.from_start() ? -1 : 1) * e.from() << "\t" << (e.to_end() ? -1 : 1) * e.to() << endl; } } else if (start_id != 0) { for (auto& e : xindex.edges_on_start(start_id)) { cout << (e.from_start() ? -1 : 1) * e.from() << "\t" << (e.to_end() ? -1 : 1) * e.to() << endl; } } if (!node_ids.empty() && !path_name.empty() && !pairwise_distance && (position_in || rank_in)) { // Go get the positions of these nodes in this path if (xindex.path_rank(path_name) == 0) { // This path doesn't exist, and we'll get a segfault or worse if // we go look for positions in it. 
cerr << "[vg find] error, path \"" << path_name << "\" not found in index" << endl; exit(1); } // Note: this isn't at all consistent with -P option with rocksdb, which couts a range // and then mapping, but need this info right now for scripts/chunked_call for (auto node_id : node_ids) { cout << node_id; for (auto r : (position_in ? xindex.position_in_path(node_id, path_name) : xindex.node_ranks_in_path(node_id, path_name))) { cout << "\t" << r; } cout << endl; } } if (pairwise_distance) { if (node_ids.size() != 2) { cerr << "[vg find] error, exactly 2 nodes (-n) required with -D" << endl; exit(1); } cout << xindex.min_approx_path_distance(node_ids[0], node_ids[1]) << endl; return 0; } if (approx_id != 0) { cout << xindex.node_start(approx_id) << endl; return 0; } if (!targets.empty()) { Graph graph; for (auto& target : targets) { // Grab each target region string name; int64_t start, end; xg::parse_region(target, name, start, end); if(xindex.path_rank(name) == 0) { // Passing a nonexistent path to get_path_range produces Undefined Behavior cerr << "[vg find] error, path " << name << " not found in index" << endl; exit(1); } // no coordinates given, we do whole thing (0,-1) if (start < 0 && end < 0) { start = 0; } xindex.get_path_range(name, start, end, graph); } if (context_size > 0) { xindex.expand_context(graph, context_size, true, !use_length); } VG vgg; vgg.extend(graph); // removes dupes // Order the mappings by rank. TODO: how do we handle breaks between // different sections of a path with a single name? 
vgg.paths.sort_by_mapping_rank(); vgg.serialize_to_ostream(cout); } if (!range.empty()) { Graph graph; int64_t id_start=0, id_end=0; vector<string> parts = split_delims(range, ":"); if (parts.size() == 1) { cerr << "[vg find] error, format of range must be \"N:M\" where start id is N and end id is M, got " << range << endl; exit(1); } convert(parts.front(), id_start); convert(parts.back(), id_end); if (!use_length) { xindex.get_id_range(id_start, id_end, graph); } else { // treat id_end as length instead. xindex.get_id_range_by_length(id_start, id_end, graph, true); } if (context_size > 0) { xindex.expand_context(graph, context_size, true, !use_length); } VG vgg; vgg.extend(graph); // removes dupes vgg.remove_orphan_edges(); vgg.serialize_to_ostream(cout); } if(!haplotype_alignments.empty()) { // What should we do with each alignment? function<void(Alignment&)> lambda = [&xindex](Alignment& aln) { // Count the amtches to the path. The path might be empty, in // which case it will yield the biggest size_t you can have. size_t matches = xindex.count_matches(aln.path()); // We do this single-threaded, at least for now, so we don't // need to worry about coordinating output, and we can just // spit out the counts as bare numbers. 
cout << matches << endl; }; if (haplotype_alignments == "-") { stream::for_each(std::cin, lambda); } else { ifstream in; in.open(haplotype_alignments.c_str()); if(!in.is_open()) { cerr << "[vg find] error: could not open alignments file " << haplotype_alignments << endl; exit(1); } stream::for_each(in, lambda); } } if (extract_threads) { size_t thread_number = 0; bool extract_reverse = false; map<string, list<xg::XG::thread_t> > threads; if (extract_patterns.empty()) { threads = xindex.extract_threads(extract_reverse); } else { for (auto& pattern : extract_patterns) { for (auto& t : xindex.extract_threads_matching(pattern, extract_reverse)) { threads[t.first] = t.second; } } } for(auto t : threads) { // Convert to a Path auto& thread = *t.second.begin(); auto& thread_name = t.first; Path path; for(xg::XG::ThreadMapping& m : thread) { // Convert all the mappings Mapping mapping; mapping.mutable_position()->set_node_id(m.node_id); mapping.mutable_position()->set_is_reverse(m.is_reverse); *(path.add_mapping()) = mapping; } // Get each thread's name path.set_name(thread_name); // Give each thread a name //path.set_name("_thread_" + to_string(thread_number++)); // We need a Graph for serialization purposes. We do one chunk per // thread in case the threads are long. Graph g; *(g.add_path()) = path; // Dump the graph with its mappings. 
TODO: can we restrict these to vector<Graph> gb = { g }; stream::write_buffered(cout, gb, 0); } } if (!gam_file.empty()) { set<vg::id_t> nodes; function<void(Alignment&)> lambda = [&nodes](Alignment& aln) { // accumulate nodes matched by the path auto& path = aln.path(); for (int i = 0; i < path.mapping_size(); ++i) { nodes.insert(path.mapping(i).position().node_id()); } }; if (gam_file == "-") { stream::for_each(std::cin, lambda); } else { ifstream in; in.open(gam_file.c_str()); if(!in.is_open()) { cerr << "[vg find] error: could not open alignments file " << gam_file << endl; exit(1); } stream::for_each(in, lambda); } // now we have the nodes to get Graph graph; for (auto& node : nodes) { *graph.add_node() = xindex.node(node); } xindex.expand_context(graph, max(1, context_size), true); // get connected edges VG vgg; vgg.extend(graph); vgg.serialize_to_ostream(cout); } } else if (!db_name.empty()) { if (!node_ids.empty() && path_name.empty()) { // get the context of the node vector<VG> graphs; for (auto node_id : node_ids) { VG g; vindex->get_context(node_id, g); if (context_size > 0) { vindex->expand_context(g, context_size); } graphs.push_back(g); } VG result_graph; for (auto& graph : graphs) { // Allow duplicate nodes and edges (from e.g. multiple -n options); silently collapse them. result_graph.extend(graph); } result_graph.remove_orphan_edges(); // return it result_graph.serialize_to_ostream(cout); } else if (end_id != 0) { vector<Edge> edges; vindex->get_edges_on_end(end_id, edges); for (vector<Edge>::iterator e = edges.begin(); e != edges.end(); ++e) { cout << (e->from_start() ? -1 : 1) * e->from() << "\t" << (e->to_end() ? -1 : 1) * e->to() << endl; } } else if (start_id != 0) { vector<Edge> edges; vindex->get_edges_on_start(start_id, edges); for (vector<Edge>::iterator e = edges.begin(); e != edges.end(); ++e) { cout << (e->from_start() ? -1 : 1) * e->from() << "\t" << (e->to_end() ? 
-1 : 1) * e->to() << endl; } } if (!node_ids.empty() && !path_name.empty()) { int64_t path_id = vindex->get_path_id(path_name); for (auto node_id : node_ids) { list<pair<int64_t, bool>> path_prev, path_next; int64_t prev_pos=0, next_pos=0; bool prev_backward, next_backward; if (vindex->get_node_path_relative_position(node_id, false, path_id, path_prev, prev_pos, prev_backward, path_next, next_pos, next_backward)) { // Negate IDs for backward nodes cout << node_id << "\t" << path_prev.front().first * (path_prev.front().second ? -1 : 1) << "\t" << prev_pos << "\t" << path_next.back().first * (path_next.back().second ? -1 : 1) << "\t" << next_pos << "\t"; Mapping m = vindex->path_relative_mapping(node_id, false, path_id, path_prev, prev_pos, prev_backward, path_next, next_pos, next_backward); cout << pb2json(m) << endl; } } } if (!targets.empty()) { VG graph; for (auto& target : targets) { string name; int64_t start, end; xg::parse_region(target, name, start, end); // end coordinate is exclusive for get_path() if (end >= 0) { ++end; } vindex->get_path(graph, name, start, end); } if (context_size > 0) { vindex->expand_context(graph, context_size); } graph.remove_orphan_edges(); graph.serialize_to_ostream(cout); } if (!range.empty()) { VG graph; int64_t id_start=0, id_end=0; vector<string> parts = split_delims(range, ":"); if (parts.size() == 1) { cerr << "[vg find] error, format of range must be \"N:M\" where start id is N and end id is M, got " << range << endl; exit(1); } convert(parts.front(), id_start); convert(parts.back(), id_end); vindex->get_range(id_start, id_end, graph); if (context_size > 0) { vindex->expand_context(graph, context_size); } graph.remove_orphan_edges(); graph.serialize_to_ostream(cout); } } // todo cleanup if/else logic to allow only one function if (!sequence.empty()) { if (gcsa_in.empty()) { if (get_mems) { cerr << "error:[vg find] a GCSA index must be passed to get MEMs" << endl; return 1; } set<int> kmer_sizes = 
vindex->stored_kmer_sizes(); if (kmer_sizes.empty()) { cerr << "error:[vg find] index does not include kmers, add with vg index -k" << endl; return 1; } if (kmer_size == 0) { kmer_size = *kmer_sizes.begin(); } for (int i = 0; i <= sequence.size()-kmer_size; i+=kmer_stride) { kmers.push_back(sequence.substr(i,kmer_size)); } } else { // let's use the GCSA index // Configure GCSA2 verbosity so it doesn't spit out loads of extra info gcsa::Verbosity::set(gcsa::Verbosity::SILENT); // Configure its temp directory to the system temp directory gcsa::TempFile::setDirectory(find_temp_dir()); // Open it ifstream in_gcsa(gcsa_in.c_str()); gcsa::GCSA gcsa_index; gcsa_index.load(in_gcsa); gcsa::LCPArray lcp_index; // default LCP is the gcsa base name +.lcp string lcp_in = gcsa_in + ".lcp"; ifstream in_lcp(lcp_in.c_str()); lcp_index.load(in_lcp); //range_type find(const char* pattern, size_type length) const; //void locate(size_type path, std::vector<node_type>& results, bool append = false, bool sort = true) const; //locate(i, results); if (!get_mems) { auto paths = gcsa_index.find(sequence.c_str(), sequence.length()); //cerr << paths.first << " - " << paths.second << endl; for (gcsa::size_type i = paths.first; i <= paths.second; ++i) { std::vector<gcsa::node_type> ids; gcsa_index.locate(i, ids); for (auto id : ids) { cout << gcsa::Node::decode(id) << endl; } } } else { // for mems we need to load up the gcsa and lcp structures into the mapper Mapper mapper(&xindex, &gcsa_index, &lcp_index); mapper.fast_reseed = use_fast_reseed; // get the mems double lcp_max, fraction_filtered; auto mems = mapper.find_mems_deep(sequence.begin(), sequence.end(), lcp_max, fraction_filtered, max_mem_length, min_mem_length, mem_reseed_length); // dump them to stdout cout << mems_to_json(mems) << endl; } } } if (!kmers.empty()) { if (count_kmers) { for (auto& kmer : kmers) { cout << kmer << "\t" << vindex->approx_size_of_kmer_matches(kmer) << endl; } } else if (kmer_table) { for (auto& kmer : kmers) 
{ map<string, vector<pair<int64_t, int32_t> > > positions; vindex->get_kmer_positions(kmer, positions); for (auto& k : positions) { for (auto& p : k.second) { cout << k.first << "\t" << p.first << "\t" << p.second << endl; } } } } else { vector<VG> graphs; for (auto& kmer : kmers) { VG g; vindex->get_kmer_subgraph(kmer, g); if (context_size > 0) { vindex->expand_context(g, context_size); } graphs.push_back(g); } VG result_graph; for (auto& graph : graphs) { // Allow duplicate nodes and edges (from multiple kmers); silently collapse them. result_graph.extend(graph); } result_graph.remove_orphan_edges(); result_graph.serialize_to_ostream(cout); } } if (vindex) delete vindex; return 0; }
int main_xg(int argc, char** argv) { if (argc == 2) { help_xg(argv); return 1; } string vg_in; string vg_out; string out_name; string in_name; int64_t node_id; bool edges_from = false; bool edges_to = false; bool edges_of = false; bool edges_on_start = false; bool edges_on_end = false; bool node_sequence = false; string pos_for_char; string pos_for_substr; int context_steps = 0; bool node_context = false; string target; bool print_graph = false; bool text_output = false; bool validate_graph = false; bool extract_threads = false; bool store_threads = false; bool is_sorted_dag = false; string report_name; string b_array_name; int c; optind = 2; // force optind past "xg" positional argument while (true) { static struct option long_options[] = { {"help", no_argument, 0, 'h'}, {"vg", required_argument, 0, 'v'}, {"out", required_argument, 0, 'o'}, {"in", required_argument, 0, 'i'}, {"extract-vg", required_argument, 0, 'X'}, {"node", required_argument, 0, 'n'}, {"char", required_argument, 0, 'P'}, {"substr", required_argument, 0, 'F'}, //{"range", required_argument, 0, 'r'}, {"context", required_argument, 0, 'c'}, {"edges-from", required_argument, 0, 'f'}, {"edges-to", required_argument, 0, 't'}, {"edges-of", required_argument, 0, 'O'}, {"edges-on-start", required_argument, 0, 'S'}, {"edges-on-end", required_argument, 0, 'E'}, {"node-seq", required_argument, 0, 's'}, {"path", required_argument, 0, 'p'}, {"extract-threads", no_argument, 0, 'x'}, {"store-threads", no_argument, 0, 'r'}, {"is-sorted-dag", no_argument, 0, 'd'}, {"report", required_argument, 0, 'R'}, {"debug", no_argument, 0, 'D'}, {"text-output", no_argument, 0, 'T'}, {"validate", no_argument, 0, 'V'}, {"dump-bs", required_argument, 0, 'b'}, {0, 0, 0, 0} }; int option_index = 0; c = getopt_long (argc, argv, "hv:o:i:X:f:t:s:c:n:p:DxrdTO:S:E:VR:P:F:b:", long_options, &option_index); // Detect the end of the options. 
if (c == -1) break; switch (c) { case 'v': vg_in = optarg; break; case 'V': validate_graph = true; break; case 'o': out_name = optarg; break; case 'D': print_graph = true; break; case 'T': text_output = true; break; case 'x': extract_threads = true; break; case 'r': store_threads = true; break; case 'd': is_sorted_dag = true; break; case 'i': in_name = optarg; break; case 'X': vg_out = optarg; break; case 'n': node_id = parse<int64_t>(optarg); node_context = true; break; case 'c': context_steps = parse<int>(optarg); break; case 'f': node_id = parse<int64_t>(optarg); edges_from = true; break; case 't': node_id = parse<int64_t>(optarg); edges_to = true; break; case 'O': node_id = parse<int64_t>(optarg); edges_of = true; break; case 'S': node_id = parse<int64_t>(optarg); edges_on_start = true; break; case 'E': node_id = parse<int64_t>(optarg); edges_on_end = true; break; case 's': node_id = parse<int64_t>(optarg); node_sequence = true; break; case 'p': target = optarg; break; case 'P': pos_for_char = optarg; break; case 'F': pos_for_substr = optarg; break; case 'R': report_name = optarg; break; case 'b': b_array_name = optarg; break; case 'h': case '?': help_xg(argv); exit(1); break; default: abort (); } } unique_ptr<XG> graph; //string file_name = argv[optind]; if (in_name.empty()) assert(!vg_in.empty()); if (vg_in == "-") { // Read VG from stdin graph = unique_ptr<XG>(new XG()); graph->from_stream(std::cin, validate_graph, print_graph, store_threads, is_sorted_dag); } else if (vg_in.size()) { // Read VG from a file ifstream in; in.open(vg_in.c_str()); graph = unique_ptr<XG>(new XG()); graph->from_stream(in, validate_graph, print_graph, store_threads, is_sorted_dag); } if (in_name.size()) { get_input_file(in_name, [&](istream& in) { // Load from an XG file or - (stdin) graph = stream::VPKG::load_one<XG>(in); }); } // Prepare structure tree for serialization unique_ptr<sdsl::structure_tree_node> structure; if (!report_name.empty()) { // We need to make a report, so we 
need the structure. Make a real tree // node. The unique_ptr handles deleting. structure = unique_ptr<sdsl::structure_tree_node>(new sdsl::structure_tree_node("name", "type")); } if(!vg_out.empty()) { if (graph.get() == nullptr) { cerr << "error [vg xg] no xg graph exists to convert; Try: vg xg -i graph.xg -X graph.vg" << endl; return 1; } VG converted; // Convert the xg graph to vg format convert_handle_graph(graph.get(), &converted); // TODO: The converter doesn't copy circular paths yet. // When it does, we can remove all this path copying code. // Make a raw Proto Graph to hold Path objects Graph path_graph; // Since paths are not copied, copy the paths. for (size_t rank = 1; rank <= graph->max_path_rank(); rank++) { // Extract each path into the path graph *path_graph.add_path() = graph->path(graph->path_name(rank)); } // Merge in all the paths converted.extend(path_graph); if (vg_out == "-") { converted.serialize_to_ostream(std::cout); } else { converted.serialize_to_file(vg_out); } } if (!out_name.empty()) { // Open a destination file if it is a file we want to write to ofstream out_file; if (out_name != "-") { out_file.open(out_name); } // Work out where to save to ostream& out = (out_name == "-") ? std::cout : out_file; // Encapsulate output in VPKG stream::VPKG::with_save_stream(out, "XG", [&](ostream& tagged) { // Serialize to the file while recording space usage to the structure. 
graph->serialize(tagged, structure.get(), "xg"); }); out.flush(); } if (!report_name.empty()) { // Save the report ofstream out; out.open(report_name.c_str()); sdsl::write_structure_tree<HTML_FORMAT>(structure.get(), out, 0); } // queries if (node_sequence) { cout << node_id << ": " << graph->node_sequence(node_id) << endl; } if (!pos_for_char.empty()) { // extract the position from the string int64_t id; bool is_rev; size_t off; extract_pos(pos_for_char, id, is_rev, off); // then pick it up from the graph cout << graph->pos_char(id, is_rev, off) << endl; } if (!pos_for_substr.empty()) { int64_t id; bool is_rev; size_t off; size_t len; extract_pos_substr(pos_for_substr, id, is_rev, off, len); cout << graph->pos_substr(id, is_rev, off, len) << endl; } if (edges_from) { vector<Edge> edges = graph->edges_from(node_id); for (auto& edge : edges) { cout << edge.from() << (edge.from_start()?"-":"+") << " -> " << edge.to() << (edge.to_end()?"-":"+") << endl; } } if (edges_to) { vector<Edge> edges = graph->edges_to(node_id); for (auto& edge : edges) { cout << edge.from() << (edge.from_start()?"-":"+") << " -> " << edge.to() << (edge.to_end()?"-":"+") << endl; } } if (edges_of) { vector<Edge> edges = graph->edges_of(node_id); for (auto& edge : edges) { cout << edge.from() << (edge.from_start()?"-":"+") << " -> " << edge.to() << (edge.to_end()?"-":"+") << endl; } } if (edges_on_start) { vector<Edge> edges = graph->edges_on_start(node_id); for (auto& edge : edges) { cout << edge.from() << (edge.from_start()?"-":"+") << " -> " << edge.to() << (edge.to_end()?"-":"+") << endl; } } if (edges_on_end) { vector<Edge> edges = graph->edges_on_end(node_id); for (auto& edge : edges) { cout << edge.from() << (edge.from_start()?"-":"+") << " -> " << edge.to() << (edge.to_end()?"-":"+") << endl; } } if (node_context) { Graph g; graph->neighborhood(node_id, context_steps, g); if (text_output) { to_text(cout, g); } else { vector<Graph> gb = { g }; stream::write_buffered(cout, gb, 0); } } if 
(!target.empty()) { string name; int64_t start, end; Graph g; parse_region(target, name, start, end); graph->get_path_range(name, start, end, g); graph->expand_context(g, context_steps); if (text_output) { to_text(cout, g); } else { vector<Graph> gb = { g }; stream::write_buffered(cout, gb, 0); } } if (extract_threads) { list<XG::thread_t> threads; for (auto& p : graph->extract_threads(false)) { for (auto& t : p.second) { threads.push_back(t); } } for (auto& p : graph->extract_threads(true)) { for (auto& t : p.second) { threads.push_back(t); } } size_t thread_number = 0; for(XG::thread_t& thread : threads) { // Convert to a Path Path path; for(XG::ThreadMapping& m : thread) { // Convert all the mappings Mapping mapping; mapping.mutable_position()->set_node_id(m.node_id); mapping.mutable_position()->set_is_reverse(m.is_reverse); *(path.add_mapping()) = mapping; } // Give each thread a name path.set_name("_thread_" + to_string(thread_number++)); // We need a Graph for serialization purposes. We do one chunk per // thread in case the threads are long. Graph g; *(g.add_path()) = path; // Dump the graph with its mappings. TODO: can we restrict these to // mappings to nodes we have already pulled out? Or pull out the // whole compressed graph? if (text_output) { to_text(cout, g); } else { vector<Graph> gb = { g }; stream::write_buffered(cout, gb, 0); } } } if (!b_array_name.empty()) { // Dump B array ofstream out; out.open(b_array_name.c_str()); graph->bs_dump(out); } return 0; }
void Aligner::align_internal(Alignment& alignment, vector<Alignment>* multi_alignments, Graph& g, int64_t pinned_node_id, bool pin_left, int32_t max_alt_alns, bool print_score_matrices) { // check input integrity if (pin_left && !pinned_node_id) { cerr << "error:[Aligner] cannot choose pinned end in non-pinned alignment" << endl; exit(EXIT_FAILURE); } if (multi_alignments && !pinned_node_id) { cerr << "error:[Aligner] multiple traceback is not valid in local alignment, only pinned and global" << endl; exit(EXIT_FAILURE); } if (!(multi_alignments) && max_alt_alns != 1) { cerr << "error:[Aligner] cannot specify maximum number of alignments in single alignment" << endl; exit(EXIT_FAILURE); } // alignment pinning algorithm is based on pinning in bottom right corner, if pinning in top // left we need to reverse all the sequences first and translate the alignment back later // create reversed objects if necessary Graph reversed_graph; string reversed_sequence; if (pin_left) { reversed_sequence.resize(alignment.sequence().length()); reverse_copy(alignment.sequence().begin(), alignment.sequence().end(), reversed_sequence.begin()); reverse_graph(g, reversed_graph); } // choose forward or reversed objects Graph* align_graph; string* align_sequence; if (pin_left) { align_graph = &reversed_graph; align_sequence = &reversed_sequence; } else { align_graph = &g; align_sequence = alignment.mutable_sequence(); } // convert into gssw graph and get the counterpart to pinned node (if pinning) gssw_node* pinned_node = nullptr; gssw_graph* graph = create_gssw_graph(*align_graph, pinned_node_id, &pinned_node); if (pinned_node_id & !pinned_node) { cerr << "error:[Aligner] pinned node for pinned alignment is not in graph" << endl; exit(EXIT_FAILURE); } // perform dynamic programming gssw_graph_fill(graph, (*align_sequence).c_str(), nt_table, score_matrix, gap_open, gap_extension, 15, 2); // traceback either from pinned position or optimal local alignment if (pinned_node) { // trace back 
pinned alignment gssw_graph_mapping** gms = gssw_graph_trace_back_pinned_multi (graph, pinned_node, max_alt_alns, (*align_sequence).c_str(), (*align_sequence).size(), nt_table, score_matrix, gap_open, gap_extension); if (pin_left) { // translate graph and mappings into original node space unreverse_graph(reversed_graph); for (int32_t i = 0; i < max_alt_alns; i++) { unreverse_graph_mapping(gms[i]); } } // convert optimal alignment and store it in the input Alignment object (in the multi alignment, // this will have been set to the first in the vector) if (gms[0]->score > 0) { // have a mapping, can just convert normally gssw_mapping_to_alignment(graph, gms[0], alignment, print_score_matrices); } else { // gssw will not identify mappings with 0 score, infer location based on pinning Mapping* mapping = alignment.mutable_path()->add_mapping(); mapping->set_rank(1); // locate at the end of the node Position* position = mapping->mutable_position(); position->set_node_id(pinned_node_id); position->set_offset(pin_left ? 
0 : pinned_node->len); // soft clip Edit* edit = mapping->add_edit(); edit->set_to_length(alignment.sequence().length()); edit->set_sequence(alignment.sequence()); } if (multi_alignments) { // determine how many non-null alignments were returned int32_t num_non_null = max_alt_alns; for (int32_t i = 1; i < max_alt_alns; i++) { if (gms[i]->score <= 0) { num_non_null = i; break; } } // reserve to avoid illegal access errors that occur when the vector reallocates multi_alignments->reserve(num_non_null); // copy the primary alignment multi_alignments->emplace_back(alignment); // convert the alternate alignments and store them at the back of the vector (this will not // execute if we are doing single alignment) for (int32_t i = 1; i < num_non_null; i++) { gssw_graph_mapping* gm = gms[i]; // make new alignment object multi_alignments->emplace_back(); Alignment& next_alignment = multi_alignments->back(); // copy over sequence information from the primary alignment next_alignment.set_sequence(alignment.sequence()); next_alignment.set_quality(alignment.quality()); // get path of the alternate alignment gssw_mapping_to_alignment(graph, gm, next_alignment, print_score_matrices); } } for (int32_t i = 0; i < max_alt_alns; i++) { gssw_graph_mapping_destroy(gms[i]); } free(gms); } else { // trace back local alignment gssw_graph_mapping* gm = gssw_graph_trace_back (graph, (*align_sequence).c_str(), (*align_sequence).size(), nt_table, score_matrix, gap_open, gap_extension); gssw_mapping_to_alignment(graph, gm, alignment, print_score_matrices); gssw_graph_mapping_destroy(gm); } //gssw_graph_print_score_matrices(graph, sequence.c_str(), sequence.size(), stderr); gssw_graph_destroy(graph); }
// Converts a gssw traceback (gssw_graph_mapping) into the protobuf Path of the
// given Alignment: one Mapping per graph node visited, with Edits for matches,
// mismatches (emitted as single-base "SNP" edits), deletions, insertions, and
// soft clips. Also records the gssw score and the identity of the final path.
// If print_score_matrices is set, dumps gssw's DP matrices to stderr first.
void Aligner::gssw_mapping_to_alignment(gssw_graph* graph,
                                        gssw_graph_mapping* gm,
                                        Alignment& alignment,
                                        bool print_score_matrices) {

    alignment.clear_path();
    alignment.set_score(gm->score);
    alignment.set_query_position(0);
    Path* path = alignment.mutable_path();
    //alignment.set_cigar(graph_cigar(gm));

    gssw_graph_cigar* gc = &gm->cigar;
    gssw_node_cigar* nc = gc->elements;
    // to_pos walks the read sequence; from_pos walks the current node's sequence,
    // starting at gm->position (the alignment's offset into the first node)
    int to_pos = 0;
    int from_pos = gm->position;
    //cerr << "gm->position " << gm->position << endl;
    string& to_seq = *alignment.mutable_sequence();
    //cerr << "-------------" << endl;

    if (print_score_matrices) {
        gssw_graph_print_score_matrices(graph, to_seq.c_str(), to_seq.size(), stderr);
        //cerr << alignment.DebugString() << endl;
    }

    // one cigar entry per node in the traceback
    for (int i = 0; i < gc->length; ++i, ++nc) {
        if (i > 0) from_pos = 0; // reset for each node after the first
        // check that the current alignment has a non-zero length
        gssw_cigar* c = nc->cigar;
        int l = c->length;
        if (l == 0) continue;
        gssw_cigar_element* e = c->elements;

        // gssw stores the vg Node in the gssw_node's user-data slot
        Node* from_node = (Node*) nc->node->data;
        string& from_seq = *from_node->mutable_sequence();
        Mapping* mapping = path->add_mapping();
        mapping->mutable_position()->set_node_id(nc->node->id);
        mapping->mutable_position()->set_offset(from_pos);
        mapping->set_rank(path->mapping_size());

        //cerr << from_node->id() << ":" << endl;

        for (int j=0; j < l; ++j, ++e) {
            Edit* edit;
            int32_t length = e->length;
            //cerr << e->length << e->type << endl;
            switch (e->type) {
            case 'M': case 'X': case 'N': {
                // gssw's alignment-column ops don't distinguish match from
                // mismatch, so compare node and read sequences base by base:
                // do the sequences match?
                // emit a stream of "SNPs" and matches
                int h = from_pos;
                int last_start = from_pos;  // start of the current run of exact matches
                int k = to_pos;
                for ( ; h < from_pos + length; ++h, ++k) {
                    //cerr << h << ":" << k << " " << from_seq[h] << " " << to_seq[k] << endl;
                    if (from_seq[h] != to_seq[k]) {
                        // emit the last "match" region
                        if (h-last_start > 0) {
                            edit = mapping->add_edit();
                            edit->set_from_length(h-last_start);
                            edit->set_to_length(h-last_start);
                        }
                        // set up the SNP: a 1bp edit carrying the read base
                        edit = mapping->add_edit();
                        edit->set_from_length(1);
                        edit->set_to_length(1);
                        edit->set_sequence(to_seq.substr(k,1));
                        last_start = h+1;
                    }
                }
                // handles the match at the end or the case of no SNP
                if (h-last_start > 0) {
                    edit = mapping->add_edit();
                    edit->set_from_length(h-last_start);
                    edit->set_to_length(h-last_start);
                }
                to_pos += length;
                from_pos += length;
            } break;
            case 'D':
                // deletion from the read: consumes node sequence only
                edit = mapping->add_edit();
                edit->set_from_length(length);
                edit->set_to_length(0);
                from_pos += length;
                break;
            case 'I':
                // insertion in the read: consumes read sequence only
                edit = mapping->add_edit();
                edit->set_from_length(0);
                edit->set_to_length(length);
                edit->set_sequence(to_seq.substr(to_pos, length));
                to_pos += length;
                break;
            case 'S':
                // note that soft clips and insertions are semantically equivalent
                // and can only be differentiated by their position in the read
                // with soft clips coming at the start or end
                edit = mapping->add_edit();
                edit->set_from_length(0);
                edit->set_to_length(length);
                edit->set_sequence(to_seq.substr(to_pos, length));
                to_pos += length;
                break;
            default:
                cerr << "error:[Aligner::gssw_mapping_to_alignment] "
                     << "unsupported cigar op type " << e->type << endl;
                exit(1);
                break;
            }
        }
        //cerr << "path to_length " << path_to_length(*path) << endl;
    }

    // set identity
    alignment.set_identity(identity(alignment.path()));
}