Esempio n. 1
0
void Caller::create_snp_path(int64_t snp_node, bool secondary_snp) {

    // for now we don't write secdonary snp, so we have 1 path per *site*
    // and counting paths will give us somethign comparable to snp count
    // from bcftools
    if (!secondary_snp) {
        stringstream name;
        name << "SNP_" << snp_node;

        Mapping mapping;
        Position* pos = mapping.mutable_position();
        // make path that covers node forward with no edits.  not super
        // useful but will use to count snps... 
        pos->set_node_id(snp_node);
        pos->set_offset(0);
        mapping.mutable_position()->set_is_reverse(false);
        
        // note: create_path doesn't seem to work.. too rushed to look into
        //list<Mapping>& mappings = _call_graph.paths.create_path(name.str());

        list<Mapping> mappings;
        mappings.push_back(mapping);
        _call_graph.paths._paths.insert(make_pair(name.str(), mappings));
    }
}
Esempio n. 2
0
// Generates a perfect (edit-free) alignment by walking the graph.
//
// Starting from position(), one base is emitted per step and the walk moves to
// a successor chosen uniformly at random, until `length` bases have been
// collected or a position without successors is reached. The starting base is
// always emitted, so the result holds at least one base.
Alignment Sampler::alignment(size_t length) {
    Alignment aln;
    Path* walk = aln.mutable_path();
    string bases;
    pos_t cursor = position();
    char base = pos_char(cursor);
    // Deliberately simple rather than efficient: every single base gets its
    // own mapping, and the redundant mappings are merged by simplify() once
    // the walk is finished.
    while (true) {
        // record the base under the cursor as a 1bp match
        bases.push_back(base);
        Mapping* step = walk->add_mapping();
        *step->mutable_position() = make_position(cursor);
        Edit* match = step->add_edit();
        match->set_from_length(1);
        match->set_to_length(1);
        // where could we go from here?
        auto successors = next_pos_chars(cursor);
        // nowhere: we have reached the end of the graph
        if (successors.empty()) {
            break;
        }
        // collect the candidate positions so one can be picked by index
        vector<pos_t> candidates;
        for (auto& successor : successors) {
            candidates.push_back(successor.first);
        }
        // choose one of them uniformly at random
        uniform_int_distribution<int> pick(0, successors.size()-1);
        cursor = candidates.at(pick(rng));
        // and fetch the base that goes with it
        base = successors[cursor];
        // stop as soon as enough sequence has been collected
        if (!(bases.size() < length)) {
            break;
        }
    }
    // store the sequence, then merge the per-base mappings
    aln.set_sequence(bases);
    aln = simplify(aln);
    { // name the alignment after a hash of its serialized form plus a nonce
        string serialized;
        aln.SerializeToString(&serialized);
        int n;
        // the shared nonce counter is bumped inside a named critical section
#pragma omp critical(nonce)
        n = nonce++;
        serialized += std::to_string(n);
        const string digest = sha1head(serialized, 16);
        aln.set_name(digest);
    }
    // finally record the identity computed from the simplified path
    aln.set_identity(identity(aln.path()));
    return aln;
}
Esempio n. 3
0
/**
 * Entry point of the `find` subcommand.
 *
 * Parses the command line, opens whichever indexes were named (-d database,
 * -x xg index, -g GCSA index) and runs every query that was requested,
 * writing the results to standard output.
 *
 * @param argc  argument count, including the program name and the subcommand
 * @param argv  argument vector; option parsing starts at argv[2]
 * @return 0 on success, 1 when the usage is printed or the arguments are
 *         rejected. Several error paths terminate the process with exit(1)
 *         instead of returning.
 */
int main_find(int argc, char** argv) {

    // Only the subcommand was given: show the usage.
    if (argc == 2) {
        help_find(argv);
        return 1;
    }

    string db_name;
    string sequence;
    int kmer_size=0;
    int kmer_stride = 1;
    vector<string> kmers;
    vector<vg::id_t> node_ids;
    string node_list_file;
    int context_size=0;
    bool use_length = false;
    bool count_kmers = false;
    bool kmer_table = false;
    vector<string> targets;
    string path_name;
    bool position_in = false;
    bool rank_in = false;
    string range;
    string gcsa_in;
    string xg_name;
    bool get_mems = false;
    int mem_reseed_length = 0;
    bool use_fast_reseed = true;
    bool get_alignments = false;
    bool get_mappings = false;
    string node_id_range;
    string aln_on_id_range;
    vg::id_t start_id = 0;
    vg::id_t end_id = 0;
    bool pairwise_distance = false;
    string haplotype_alignments;
    string gam_file;
    int max_mem_length = 0;
    int min_mem_length = 1;
    string to_graph_file;
    bool extract_threads = false;
    vector<string> extract_patterns;
    vg::id_t approx_id = 0;

    int c;
    optind = 2; // force optind past command positional argument
    while (true) {
        static struct option long_options[] =
            {
                //{"verbose", no_argument,       &verbose_flag, 1},
                {"db-name", required_argument, 0, 'd'},
                {"xg-name", required_argument, 0, 'x'},
                {"gcsa", required_argument, 0, 'g'},
                {"node", required_argument, 0, 'n'},
                {"node-list", required_argument, 0, 'N'},
                {"edges-end", required_argument, 0, 'e'},
                {"edges-start", required_argument, 0, 's'},
                {"kmer", required_argument, 0, 'k'},
                {"table", no_argument, 0, 'T'},
                {"sequence", required_argument, 0, 'S'},
                {"mems", required_argument, 0, 'M'},
                {"reseed-length", required_argument, 0, 'B'},
                {"fast-reseed", no_argument, 0, 'f'},
                {"kmer-stride", required_argument, 0, 'j'},
                {"kmer-size", required_argument, 0, 'z'},
                {"context", required_argument, 0, 'c'},
                {"use-length", no_argument, 0, 'L'},
                {"kmer-count", no_argument, 0, 'C'},
                {"path", required_argument, 0, 'p'},
                {"position-in", required_argument, 0, 'P'},
                {"rank-in", required_argument, 0, 'R'},
                {"node-range", required_argument, 0, 'r'},
                {"alignments", no_argument, 0, 'a'},
                {"mappings", no_argument, 0, 'm'},
                {"alns-in", required_argument, 0, 'i'},
                {"alns-on", required_argument, 0, 'o'},
                {"distance", no_argument, 0, 'D'},
                {"haplotypes", required_argument, 0, 'H'},
                {"gam", required_argument, 0, 'G'},
                {"to-graph", required_argument, 0, 'A'},
                {"max-mem", required_argument, 0, 'Y'},
                {"min-mem", required_argument, 0, 'Z'},
                {"extract-threads", no_argument, 0, 't'},
                {"threads-named", required_argument, 0, 'q'},
                {"approx-pos", required_argument, 0, 'X'},
                {0, 0, 0, 0}
            };

        int option_index = 0;
        c = getopt_long (argc, argv, "d:x:n:e:s:o:k:hc:LS:z:j:CTp:P:r:amg:M:R:B:fi:DH:G:N:A:Y:Z:tq:X:",
                         long_options, &option_index);

        // Detect the end of the options.
        if (c == -1)
            break;

        switch (c)
        {
        case 'd':
            db_name = optarg;
            break;

        case 'x':
            xg_name = optarg;
            break;

        case 'g':
            gcsa_in = optarg;
            break;

        case 'k':
            kmers.push_back(optarg);
            break;

        case 'S':
            sequence = optarg;
            break;

        case 'M':
            // -M supplies the query sequence and switches to MEM finding
            sequence = optarg;
            get_mems = true;
            break;
            
        case 'B':
            mem_reseed_length = atoi(optarg);
            break;
            
        case 'f':
            // NOTE(review): use_fast_reseed already defaults to true, so this
            // flag currently has no effect.
            use_fast_reseed = true;
            break;

        case 'Y':
            max_mem_length = atoi(optarg);
            break;
            
        case 'Z':
            min_mem_length = atoi(optarg);
            break;
            
        case 'j':
            kmer_stride = atoi(optarg);
            break;

        case 'z':
            kmer_size = atoi(optarg);
            break;

        case 'C':
            count_kmers = true;
            break;

        case 'p':
            targets.push_back(optarg);
            break;

        case 'P':
            path_name = optarg;
            position_in = true;
            break;

        case 'R':
            path_name = optarg;
            rank_in = true;
            break;

        case 'c':
            context_size = atoi(optarg);
            break;

        case 'L':
            use_length = true;
            break;

        case 'n':
            // node ids are 64-bit, so parse them with a 64-bit conversion
            node_ids.push_back(atoll(optarg));
            break;

        case 'N':
            node_list_file = optarg;
            break;

        case 'e':
            end_id = atoll(optarg);
            break;

        case 's':
            start_id = atoll(optarg);
            break;

        case 'T':
            kmer_table = true;
            break;

        case 'r':
            range = optarg;
            break;

        case 'a':
            get_alignments = true;
            break;

        case 'i':
            node_id_range = optarg;
            break;

        case 'm':
            get_mappings = true;
            break;

        case 'o':
            aln_on_id_range = optarg;
            break;

        case 'D':
            pairwise_distance = true;
            break;

        case 'H':
            haplotype_alignments = optarg;
            break;

        case 't':
            extract_threads = true;
            break;

        case 'q':
            // naming a thread pattern implies thread extraction
            extract_threads = true;
            extract_patterns.push_back(optarg);
            break;

        case 'X':
            approx_id = atoll(optarg);
            break;

        case 'G':
            gam_file = optarg;
            break;

        case 'A':
            to_graph_file = optarg;
            break;

        case 'h':
        case '?':
            help_find(argv);
            exit(1);
            break;

        default:
            abort ();
        }
    }
    if (optind < argc) {
        cerr << "[vg find] find does not accept positional arguments" << endl;
        return 1;
    }

    if (db_name.empty() && gcsa_in.empty() && xg_name.empty()) {
        cerr << "[vg find] find requires -d, -g, or -x to know where to find its database" << endl;
        return 1;
    }

    if (context_size > 0 && use_length == true && xg_name.empty()) {
        cerr << "[vg find] error, -L not supported without -x" << endl;
        exit(1);
    }
    
    if (xg_name.empty() && mem_reseed_length) {
        cerr << "error:[vg find] SMEM reseeding requires an XG index. Provide XG index with -x." << endl;
        exit(1);
    }

    // A stride of zero (or less) would never advance the kmer window, and a
    // negative kmer size makes no sense; reject both before any index is opened.
    if (kmer_stride <= 0) {
        cerr << "error:[vg find] kmer stride (-j) must be positive" << endl;
        return 1;
    }
    if (kmer_size < 0) {
        cerr << "error:[vg find] kmer size (-z) must not be negative" << endl;
        return 1;
    }
    
    // process input node list
    if (!node_list_file.empty()) {
        ifstream nli;
        nli.open(node_list_file);
        if (!nli.good()){
            cerr << "[vg find] error, unable to open the node list input file." << endl;
            exit(1);
        }
        string line;
        while (getline(nli, line)){
            // ids on a line are separated by spaces or tabs
            for (auto& idstr : split_delims(line, " \t")) {
                node_ids.push_back(atoll(idstr.c_str()));
            }
        }
        nli.close();
    }

    // open index
    Index* vindex = nullptr;
    if (db_name.empty()) {
        assert(!gcsa_in.empty() || !xg_name.empty());
    } else {
        vindex = new Index;
        vindex->open_read_only(db_name);
    }

    // Queries served by the database index call this first, so that a missing
    // -d produces a clear message instead of a null pointer dereference (the
    // previous asserts vanish when NDEBUG is defined).
    auto require_db = [&vindex](const char* operation) {
        if (vindex == nullptr) {
            cerr << "[vg find] error, " << operation << " requires a database index (-d)" << endl;
            exit(1);
        }
    };

    xg::XG xindex;
    if (!xg_name.empty()) {
        ifstream in(xg_name.c_str());
        xindex.load(in);
    }

    // -a: dump every alignment stored in the database
    if (get_alignments) {
        require_db("listing alignments (-a)");
        vector<Alignment> output_buf;
        auto lambda = [&output_buf](const Alignment& aln) {
            output_buf.push_back(aln);
            stream::write_buffered(cout, output_buf, 100);
        };
        vindex->for_each_alignment(lambda);
        // flush whatever is left in the buffer
        stream::write_buffered(cout, output_buf, 0);
    }

    // -i N[:M]: alignments in a node id range
    if (!node_id_range.empty()) {
        require_db("finding alignments in a node range (-i)");
        vector<string> parts = split_delims(node_id_range, ":");
        if (parts.size() == 1) {
            // a single id stands for the range N:N
            convert(parts.front(), start_id);
            end_id = start_id;
        } else {
            convert(parts.front(), start_id);
            convert(parts.back(), end_id);
        }
        vector<Alignment> output_buf;
        auto lambda = [&output_buf](const Alignment& aln) {
            output_buf.push_back(aln);
            stream::write_buffered(cout, output_buf, 100);
        };
        vindex->for_alignment_in_range(start_id, end_id, lambda);
        stream::write_buffered(cout, output_buf, 0);
    }

    // -o N[:M]: alignments touching the nodes of an id range
    if (!aln_on_id_range.empty()) {
        require_db("finding alignments on a node range (-o)");
        vector<string> parts = split_delims(aln_on_id_range, ":");
        if (parts.size() == 1) {
            convert(parts.front(), start_id);
            end_id = start_id;
        } else {
            convert(parts.front(), start_id);
            convert(parts.back(), end_id);
        }
        vector<vg::id_t> ids;
        for (auto i = start_id; i <= end_id; ++i) {
            ids.push_back(i);
        }
        vector<Alignment> output_buf;
        auto lambda = [&output_buf](const Alignment& aln) {
            output_buf.push_back(aln);
            stream::write_buffered(cout, output_buf, 100);
        };
        vindex->for_alignment_to_nodes(ids, lambda);
        stream::write_buffered(cout, output_buf, 0);
    }

    // -A FILE: alignments touching any node of the graph stored in FILE
    if (!to_graph_file.empty()) {
        require_db("finding alignments to a graph (-A)");
        ifstream tgi(to_graph_file);
        VG graph(tgi);
        vector<vg::id_t> ids;
        graph.for_each_node([&](Node* n) { ids.push_back(n->id()); });
        vector<Alignment> output_buf;
        auto lambda = [&output_buf](const Alignment& aln) {
            output_buf.push_back(aln);
            stream::write_buffered(cout, output_buf, 100);
        };
        vindex->for_alignment_to_nodes(ids, lambda);
        stream::write_buffered(cout, output_buf, 0);
    }

    // Graph queries: the xg index takes precedence over the database when
    // both were given.
    if (!xg_name.empty()) {
        if (!node_ids.empty() && path_name.empty() && !pairwise_distance) {
            // get the context of the node
            vector<Graph> graphs;
            set<vg::id_t> ids;
            for (auto node_id : node_ids) ids.insert(node_id);
            for (auto node_id : node_ids) {
                Graph g;
                xindex.neighborhood(node_id, context_size, g, !use_length);
                if (context_size == 0) {
                    for (auto& edge : xindex.edges_of(node_id)) {
                        // if both ends of the edge are in our targets, keep them
                        if (ids.count(edge.to()) && ids.count(edge.from())) {
                            *g.add_edge() = edge;
                        }
                    }
                }
                graphs.push_back(g);
            }
            VG result_graph;
            for (auto& graph : graphs) {
                // Allow duplicate nodes and edges (from e.g. multiple -n options); silently collapse them.
                result_graph.extend(graph);
            }
            result_graph.remove_orphan_edges();
            
            // Order the mappings by rank. TODO: how do we handle breaks between
            // different sections of a path with a single name?
            result_graph.paths.sort_by_mapping_rank();
            
            // return it
            result_graph.serialize_to_ostream(cout);
        } else if (end_id != 0) {
            // edges are printed as signed ids: negative means the reverse side
            for (auto& e : xindex.edges_on_end(end_id)) {
                cout << (e.from_start() ? -1 : 1) * e.from() << "\t" <<  (e.to_end() ? -1 : 1) * e.to() << endl;
            }
        } else if (start_id != 0) {
            for (auto& e : xindex.edges_on_start(start_id)) {
                cout << (e.from_start() ? -1 : 1) * e.from() << "\t" <<  (e.to_end() ? -1 : 1) * e.to() << endl;
            }
        }
        if (!node_ids.empty() && !path_name.empty() && !pairwise_distance && (position_in || rank_in)) {
            // Go get the positions of these nodes in this path
            
            if (xindex.path_rank(path_name) == 0) {
                // This path doesn't exist, and we'll get a segfault or worse if
                // we go look for positions in it.
                cerr << "[vg find] error, path \"" << path_name << "\" not found in index" << endl;
                exit(1);
            }
            
            // Note: this isn't at all consistent with -P option with rocksdb, which couts a range
            // and then mapping, but need this info right now for scripts/chunked_call
            for (auto node_id : node_ids) {
                cout << node_id;
                for (auto r : (position_in ? xindex.position_in_path(node_id, path_name)
                               : xindex.node_ranks_in_path(node_id, path_name))) {
                    cout << "\t" << r;
                }
                cout << endl;
            }
        }
        if (pairwise_distance) {
            if (node_ids.size() != 2) {
                cerr << "[vg find] error, exactly 2 nodes (-n) required with -D" << endl;
                exit(1);
            }
            cout << xindex.min_approx_path_distance(node_ids[0], node_ids[1]) << endl;
            return 0;
        }
        if (approx_id != 0) {
            cout << xindex.node_start(approx_id) << endl;
            return 0;
        }
        if (!targets.empty()) {
            Graph graph;
            for (auto& target : targets) {
                // Grab each target region
                string name;
                int64_t start, end;
                xg::parse_region(target, name, start, end);
                if(xindex.path_rank(name) == 0) {
                    // Passing a nonexistent path to get_path_range produces Undefined Behavior
                    cerr << "[vg find] error, path " << name << " not found in index" << endl;
                    exit(1);
                }
                // no coordinates given, we do whole thing (0,-1)
                if (start < 0 && end < 0) {
                    start = 0;
                }
                xindex.get_path_range(name, start, end, graph);
            }
            if (context_size > 0) {
                xindex.expand_context(graph, context_size, true, !use_length);
            }
            VG vgg; vgg.extend(graph); // removes dupes
            
            // Order the mappings by rank. TODO: how do we handle breaks between
            // different sections of a path with a single name?
            vgg.paths.sort_by_mapping_rank();
            
            vgg.serialize_to_ostream(cout);
        }
        if (!range.empty()) {
            Graph graph;
            int64_t id_start=0, id_end=0;
            vector<string> parts = split_delims(range, ":");
            if (parts.size() == 1) {
                cerr << "[vg find] error, format of range must be \"N:M\" where start id is N and end id is M, got " << range << endl;
                exit(1);
            }
            convert(parts.front(), id_start);
            convert(parts.back(), id_end);
            if (!use_length) {
                xindex.get_id_range(id_start, id_end, graph);
            } else {
                // treat id_end as length instead.
                xindex.get_id_range_by_length(id_start, id_end, graph, true);
            }
            if (context_size > 0) {
                xindex.expand_context(graph, context_size, true, !use_length);
            }
            VG vgg; vgg.extend(graph); // removes dupes
            vgg.remove_orphan_edges();
            vgg.serialize_to_ostream(cout);
        }
        if(!haplotype_alignments.empty()) {
            // What should we do with each alignment?
            function<void(Alignment&)> lambda = [&xindex](Alignment& aln) {
                // Count the matches to the path. The path might be empty, in
                // which case it will yield the biggest size_t you can have.
                size_t matches = xindex.count_matches(aln.path());

                // We do this single-threaded, at least for now, so we don't
                // need to worry about coordinating output, and we can just
                // spit out the counts as bare numbers.
                cout << matches << endl;
            };
            if (haplotype_alignments == "-") {
                // "-" means: read the alignments from standard input
                stream::for_each(std::cin, lambda);
            } else {
                ifstream in;
                in.open(haplotype_alignments.c_str());
                if(!in.is_open()) {
                    cerr << "[vg find] error: could not open alignments file " << haplotype_alignments << endl;
                    exit(1);
                }
                stream::for_each(in, lambda);
            }

        }
        if (extract_threads) {
            size_t thread_number = 0;
            bool extract_reverse = false;
            map<string, list<xg::XG::thread_t> > threads;
            if (extract_patterns.empty()) {
                threads = xindex.extract_threads(extract_reverse);
            } else {
                for (auto& pattern : extract_patterns) {
                    for (auto& t : xindex.extract_threads_matching(pattern, extract_reverse)) {
                        threads[t.first] = t.second;
                    }
                }
            }
            // Iterate by reference: each entry holds a whole list of threads,
            // which there is no reason to copy.
            for(auto& t : threads) {
                if (t.second.empty()) {
                    // Nothing is stored under this name, so there is no
                    // thread to convert.
                    continue;
                }
                // Convert to a Path. Only the first thread stored under each
                // name is emitted.
                auto& thread = *t.second.begin();
                auto& thread_name = t.first;
                Path path;
                for(xg::XG::ThreadMapping& m : thread) {
                    // Convert all the mappings
                    Mapping mapping;
                    mapping.mutable_position()->set_node_id(m.node_id);
                    mapping.mutable_position()->set_is_reverse(m.is_reverse);
                    
                    *(path.add_mapping()) = mapping;
                }

                // Get each thread's name
                path.set_name(thread_name);
                // Give each thread a name
                //path.set_name("_thread_" + to_string(thread_number++));

                // We need a Graph for serialization purposes. We do one chunk per
                // thread in case the threads are long.
                Graph g;
                *(g.add_path()) = path;

                // Dump the graph, which holds nothing but this path.
                vector<Graph> gb = { g };
                stream::write_buffered(cout, gb, 0);
            }
        }
        if (!gam_file.empty()) {
            set<vg::id_t> nodes;
            function<void(Alignment&)> lambda = [&nodes](Alignment& aln) {
                // accumulate nodes matched by the path
                auto& path = aln.path();
                for (int i = 0; i < path.mapping_size(); ++i) {
                    nodes.insert(path.mapping(i).position().node_id());
                }
            };
            if (gam_file == "-") {
                stream::for_each(std::cin, lambda);
            } else {
                ifstream in;
                in.open(gam_file.c_str());
                if(!in.is_open()) {
                    cerr << "[vg find] error: could not open alignments file " << gam_file << endl;
                    exit(1);
                }
                stream::for_each(in, lambda);
            }
            // now we have the nodes to get
            Graph graph;
            for (auto& node : nodes) {
                *graph.add_node() = xindex.node(node);
            }
            xindex.expand_context(graph, max(1, context_size), true); // get connected edges
            VG vgg; vgg.extend(graph);
            vgg.serialize_to_ostream(cout);
        }
    } else if (!db_name.empty()) {
        if (!node_ids.empty() && path_name.empty()) {
            // get the context of the node
            vector<VG> graphs;
            for (auto node_id : node_ids) {
                VG g;
                vindex->get_context(node_id, g);
                if (context_size > 0) {
                    vindex->expand_context(g, context_size);
                }
                graphs.push_back(g);
            }
            VG result_graph;
            for (auto& graph : graphs) {
                // Allow duplicate nodes and edges (from e.g. multiple -n options); silently collapse them.
                result_graph.extend(graph);
            }
            result_graph.remove_orphan_edges();
            // return it
            result_graph.serialize_to_ostream(cout);
        } else if (end_id != 0) {
            vector<Edge> edges;
            vindex->get_edges_on_end(end_id, edges);
            for (vector<Edge>::iterator e = edges.begin(); e != edges.end(); ++e) {
                cout << (e->from_start() ? -1 : 1) * e->from() << "\t" <<  (e->to_end() ? -1 : 1) * e->to() << endl;
            }
        } else if (start_id != 0) {
            vector<Edge> edges;
            vindex->get_edges_on_start(start_id, edges);
            for (vector<Edge>::iterator e = edges.begin(); e != edges.end(); ++e) {
                cout << (e->from_start() ? -1 : 1) * e->from() << "\t" <<  (e->to_end() ? -1 : 1) * e->to() << endl;
            }
        }
        if (!node_ids.empty() && !path_name.empty()) {
            int64_t path_id = vindex->get_path_id(path_name);
            for (auto node_id : node_ids) {
                list<pair<int64_t, bool>> path_prev, path_next;
                int64_t prev_pos=0, next_pos=0;
                bool prev_backward, next_backward;
                if (vindex->get_node_path_relative_position(node_id, false, path_id,
                            path_prev, prev_pos, prev_backward,
                            path_next, next_pos, next_backward)) {

                    // Negate IDs for backward nodes
                    cout << node_id << "\t" << path_prev.front().first * (path_prev.front().second ? -1 : 1) << "\t" << prev_pos
                        << "\t" << path_next.back().first * (path_next.back().second ? -1 : 1) << "\t" << next_pos << "\t";

                    Mapping m = vindex->path_relative_mapping(node_id, false, path_id,
                            path_prev, prev_pos, prev_backward,
                            path_next, next_pos, next_backward);
                    cout << pb2json(m) << endl;
                }
            }
        }
        if (!targets.empty()) {
            VG graph;
            for (auto& target : targets) {
                string name;
                int64_t start, end;
                xg::parse_region(target, name, start, end);
                // end coordinate is exclusive for get_path()
                if (end >= 0) {
                    ++end;
                }
                vindex->get_path(graph, name, start, end);
            }
            if (context_size > 0) {
                vindex->expand_context(graph, context_size);
            }
            graph.remove_orphan_edges();
            graph.serialize_to_ostream(cout);
        }
        if (!range.empty()) {
            VG graph;
            int64_t id_start=0, id_end=0;
            vector<string> parts = split_delims(range, ":");
            if (parts.size() == 1) {
                cerr << "[vg find] error, format of range must be \"N:M\" where start id is N and end id is M, got " << range << endl;
                exit(1);
            }
            convert(parts.front(), id_start);
            convert(parts.back(), id_end);
            vindex->get_range(id_start, id_end, graph);
            if (context_size > 0) {
                vindex->expand_context(graph, context_size);
            }
            graph.remove_orphan_edges();
            graph.serialize_to_ostream(cout);
        }
    }

    // todo cleanup if/else logic to allow only one function

    if (!sequence.empty()) {
        if (gcsa_in.empty()) {
            if (get_mems) {
                cerr << "error:[vg find] a GCSA index must be passed to get MEMs" << endl;
                return 1;
            }
            // Without GCSA the sequence is cut into kmers which are then
            // looked up in the database, so the database has to be there.
            require_db("looking up a sequence without a GCSA index (-S)");
            set<int> kmer_sizes = vindex->stored_kmer_sizes();
            if (kmer_sizes.empty()) {
                cerr << "error:[vg find] index does not include kmers, add with vg index -k" << endl;
                return 1;
            }
            if (kmer_size == 0) {
                // default to the smallest kmer size stored in the index
                kmer_size = *kmer_sizes.begin();
            }
            // Work in size_t and compare i + k against the length: the former
            // `sequence.size()-kmer_size` wrapped around to a huge value when
            // the sequence was shorter than the kmer size.
            const size_t kmer_length = kmer_size;
            if (sequence.size() < kmer_length) {
                cerr << "warning:[vg find] sequence is shorter than the kmer size (" << kmer_size
                     << "), no kmers to look up" << endl;
            }
            for (size_t i = 0; i + kmer_length <= sequence.size(); i += kmer_stride) {
                kmers.push_back(sequence.substr(i, kmer_length));
            }
        } else {
            // let's use the GCSA index

            // Configure GCSA2 verbosity so it doesn't spit out loads of extra info
            gcsa::Verbosity::set(gcsa::Verbosity::SILENT);
            
            // Configure its temp directory to the system temp directory
            gcsa::TempFile::setDirectory(find_temp_dir());

            // Open it
            ifstream in_gcsa(gcsa_in.c_str());
            gcsa::GCSA gcsa_index;
            gcsa_index.load(in_gcsa);
            gcsa::LCPArray lcp_index;
            // default LCP is the gcsa base name +.lcp
            string lcp_in = gcsa_in + ".lcp";
            ifstream in_lcp(lcp_in.c_str());
            lcp_index.load(in_lcp);
            //range_type find(const char* pattern, size_type length) const;
            //void locate(size_type path, std::vector<node_type>& results, bool append = false, bool sort = true) const;
            //locate(i, results);
            if (!get_mems) {
                // exact match: print every position the sequence occurs at
                auto paths = gcsa_index.find(sequence.c_str(), sequence.length());
                //cerr << paths.first << " - " << paths.second << endl;
                for (gcsa::size_type i = paths.first; i <= paths.second; ++i) {
                    std::vector<gcsa::node_type> ids;
                    gcsa_index.locate(i, ids);
                    for (auto id : ids) {
                        cout << gcsa::Node::decode(id) << endl;
                    }
                }
            } else {
                // for mems we need to load up the gcsa and lcp structures into the mapper
                Mapper mapper(&xindex, &gcsa_index, &lcp_index);
                mapper.fast_reseed = use_fast_reseed;
                // get the mems
                double lcp_max, fraction_filtered;
                auto mems = mapper.find_mems_deep(sequence.begin(), sequence.end(), lcp_max, fraction_filtered, max_mem_length, min_mem_length, mem_reseed_length);

                // dump them to stdout
                cout << mems_to_json(mems) << endl;

            }
        }
    }

    if (!kmers.empty()) {
        // every kmer query below is answered by the database index
        require_db("kmer lookup (-k)");
        if (count_kmers) {
            for (auto& kmer : kmers) {
                cout << kmer << "\t" << vindex->approx_size_of_kmer_matches(kmer) << endl;
            }
        } else if (kmer_table) {
            for (auto& kmer : kmers) {
                map<string, vector<pair<int64_t, int32_t> > > positions;
                vindex->get_kmer_positions(kmer, positions);
                for (auto& k : positions) {
                    for (auto& p : k.second) {
                        cout << k.first << "\t" << p.first << "\t" << p.second << endl;
                    }
                }
            }
        } else {
            vector<VG> graphs;
            for (auto& kmer : kmers) {
                VG g;
                vindex->get_kmer_subgraph(kmer, g);
                if (context_size > 0) {
                    vindex->expand_context(g, context_size);
                }
                graphs.push_back(g);
            }

            VG result_graph;
            for (auto& graph : graphs) {
                // Allow duplicate nodes and edges (from multiple kmers); silently collapse them.
                result_graph.extend(graph);
            }
            result_graph.remove_orphan_edges();
            result_graph.serialize_to_ostream(cout);
        }
    }

    if (vindex) delete vindex;

    return 0;

}
Esempio n. 4
0
int main_xg(int argc, char** argv) {

    if (argc == 2) {
        help_xg(argv);
        return 1;
    }

    string vg_in;
    string vg_out;
    string out_name;
    string in_name;
    int64_t node_id;
    bool edges_from = false;
    bool edges_to = false;
    bool edges_of = false;
    bool edges_on_start = false;
    bool edges_on_end = false;
    bool node_sequence = false;
    string pos_for_char;
    string pos_for_substr;
    int context_steps = 0;
    bool node_context = false;
    string target;
    bool print_graph = false;
    bool text_output = false;
    bool validate_graph = false;
    bool extract_threads = false;
    bool store_threads = false;
    bool is_sorted_dag = false;
    string report_name;
    string b_array_name;
    
    int c;
    optind = 2; // force optind past "xg" positional argument
    while (true) {
        static struct option long_options[] =
            {
                {"help", no_argument, 0, 'h'},
                {"vg", required_argument, 0, 'v'},
                {"out", required_argument, 0, 'o'},
                {"in", required_argument, 0, 'i'},
                {"extract-vg", required_argument, 0, 'X'},
                {"node", required_argument, 0, 'n'},
                {"char", required_argument, 0, 'P'},
                {"substr", required_argument, 0, 'F'},
                //{"range", required_argument, 0, 'r'},
                {"context", required_argument, 0, 'c'},
                {"edges-from", required_argument, 0, 'f'},
                {"edges-to", required_argument, 0, 't'},
                {"edges-of", required_argument, 0, 'O'},
                {"edges-on-start", required_argument, 0, 'S'},
                {"edges-on-end", required_argument, 0, 'E'},
                {"node-seq", required_argument, 0, 's'},
                {"path", required_argument, 0, 'p'},
                {"extract-threads", no_argument, 0, 'x'},
                {"store-threads", no_argument, 0, 'r'},
                {"is-sorted-dag", no_argument, 0, 'd'},
                {"report", required_argument, 0, 'R'},
                {"debug", no_argument, 0, 'D'},
                {"text-output", no_argument, 0, 'T'},
                {"validate", no_argument, 0, 'V'},
                {"dump-bs", required_argument, 0, 'b'},
                {0, 0, 0, 0}
            };

        int option_index = 0;
        c = getopt_long (argc, argv, "hv:o:i:X:f:t:s:c:n:p:DxrdTO:S:E:VR:P:F:b:",
                         long_options, &option_index);

        // Detect the end of the options.
        if (c == -1)
            break;

        switch (c)
        {

        case 'v':
            vg_in = optarg;
            break;

        case 'V':
            validate_graph = true;
            break;

        case 'o':
            out_name = optarg;
            break;

        case 'D':
            print_graph = true;
            break;

        case 'T':
            text_output = true;
            break;
            
        case 'x':
            extract_threads = true;
            break;
            
        case 'r':
            store_threads = true;
            break;
            
        case 'd':
            is_sorted_dag = true;
            break;

        case 'i':
            in_name = optarg;
            break;

        case 'X':
            vg_out = optarg;
            break;

        case 'n':
            node_id = parse<int64_t>(optarg);
            node_context = true;
            break;

        case 'c':
            context_steps = parse<int>(optarg);
            break;

        case 'f':
            node_id = parse<int64_t>(optarg);
            edges_from = true;
            break;
            
        case 't':
            node_id = parse<int64_t>(optarg);
            edges_to = true;
            break;

        case 'O':
            node_id = parse<int64_t>(optarg);
            edges_of = true;
            break;

        case 'S':
            node_id = parse<int64_t>(optarg);
            edges_on_start = true;
            break;

        case 'E':
            node_id = parse<int64_t>(optarg);
            edges_on_end = true;
            break;

        case 's':
            node_id = parse<int64_t>(optarg);
            node_sequence = true;
            break;

        case 'p':
            target = optarg;
            break;

        case 'P':
            pos_for_char = optarg;
            break;
            
        case 'F':
            pos_for_substr = optarg;
            break;
            
        case 'R':
            report_name = optarg;
            break;
            
        case 'b':
            b_array_name = optarg;
            break;
            
        case 'h':
        case '?':
            help_xg(argv);
            exit(1);
            break;

        default:
            abort ();
        }
    }

    unique_ptr<XG> graph;
    //string file_name = argv[optind];
    if (in_name.empty()) assert(!vg_in.empty());
    if (vg_in == "-") {
        // Read VG from stdin
        graph = unique_ptr<XG>(new XG());
        graph->from_stream(std::cin, validate_graph, print_graph, store_threads, is_sorted_dag);
    } else if (vg_in.size()) {
        // Read VG from a file
        ifstream in;
        in.open(vg_in.c_str());
        graph = unique_ptr<XG>(new XG());
        graph->from_stream(in, validate_graph, print_graph, store_threads, is_sorted_dag);
    }

    if (in_name.size()) {
        get_input_file(in_name, [&](istream& in) {
            // Load from an XG file or - (stdin)
            graph = stream::VPKG::load_one<XG>(in);
        });
    }

    // Prepare structure tree for serialization
    unique_ptr<sdsl::structure_tree_node> structure;
    
    if (!report_name.empty()) {
        // We need to make a report, so we need the structure. Make a real tree
        // node. The unique_ptr handles deleting.
        structure = unique_ptr<sdsl::structure_tree_node>(new sdsl::structure_tree_node("name", "type"));
    }

    if(!vg_out.empty()) {
        if (graph.get() == nullptr) {
             cerr << "error [vg xg] no xg graph exists to convert; Try: vg xg -i graph.xg -X graph.vg" << endl;
             return 1;
        }
        
        VG converted;
        // Convert the xg graph to vg format
        convert_handle_graph(graph.get(), &converted);
        
        // TODO: The converter doesn't copy circular paths yet.
        // When it does, we can remove all this path copying code.

        // Make a raw Proto Graph to hold Path objects
        Graph path_graph;

        // Since paths are not copied, copy the paths.
        for (size_t rank = 1; rank <= graph->max_path_rank(); rank++) {
            // Extract each path into the path graph
            *path_graph.add_path() = graph->path(graph->path_name(rank));
        }

        // Merge in all the paths
        converted.extend(path_graph);
        
        if (vg_out == "-") {
            converted.serialize_to_ostream(std::cout);
        } else {
            converted.serialize_to_file(vg_out);
        }
    }

    if (!out_name.empty()) {
        // Open a destination file if it is a file we want to write to
        ofstream out_file;
        if (out_name != "-") {
            out_file.open(out_name);
        }
        // Work out where to save to
        ostream& out = (out_name == "-") ? std::cout : out_file;
        
        // Encapsulate output in VPKG
        stream::VPKG::with_save_stream(out, "XG", [&](ostream& tagged) {
            // Serialize to the file while recording space usage to the structure.
            graph->serialize(tagged, structure.get(), "xg");
        });
        
        out.flush();
    }

    if (!report_name.empty()) {
        // Save the report
        ofstream out;
        out.open(report_name.c_str());
        sdsl::write_structure_tree<HTML_FORMAT>(structure.get(), out, 0);
    }

    // queries
    if (node_sequence) {
        cout << node_id << ": " << graph->node_sequence(node_id) << endl;
    }
    if (!pos_for_char.empty()) {
        // extract the position from the string
        int64_t id;
        bool is_rev;
        size_t off;
        extract_pos(pos_for_char, id, is_rev, off);
        // then pick it up from the graph
        cout << graph->pos_char(id, is_rev, off) << endl;
    }
    if (!pos_for_substr.empty()) {
        int64_t id;
        bool is_rev;
        size_t off;
        size_t len;
        extract_pos_substr(pos_for_substr, id, is_rev, off, len);
        cout << graph->pos_substr(id, is_rev, off, len) << endl;
    }
    
    if (edges_from) {
        vector<Edge> edges = graph->edges_from(node_id);
        for (auto& edge : edges) {
            cout << edge.from() << (edge.from_start()?"-":"+")
                 << " -> " << edge.to() << (edge.to_end()?"-":"+") << endl;
        }
    }
    if (edges_to) {
        vector<Edge> edges = graph->edges_to(node_id);
        for (auto& edge : edges) {
            cout << edge.from() << (edge.from_start()?"-":"+")
                 << " -> " << edge.to() << (edge.to_end()?"-":"+") << endl;
        }
    }
    if (edges_of) {
        vector<Edge> edges = graph->edges_of(node_id);
        for (auto& edge : edges) {
            cout << edge.from() << (edge.from_start()?"-":"+")
                 << " -> " << edge.to() << (edge.to_end()?"-":"+") << endl;
        }
    }
    if (edges_on_start) {
        vector<Edge> edges = graph->edges_on_start(node_id);
        for (auto& edge : edges) {
            cout << edge.from() << (edge.from_start()?"-":"+")
                 << " -> " << edge.to() << (edge.to_end()?"-":"+") << endl;
        }
    }
    if (edges_on_end) {
        vector<Edge> edges = graph->edges_on_end(node_id);
        for (auto& edge : edges) {
            cout << edge.from() << (edge.from_start()?"-":"+")
                 << " -> " << edge.to() << (edge.to_end()?"-":"+") << endl;
        }
    }

    if (node_context) {
        Graph g;
        graph->neighborhood(node_id, context_steps, g);
        if (text_output) {
            to_text(cout, g);
        } else {
            vector<Graph> gb = { g };
            stream::write_buffered(cout, gb, 0);
        }
    }

    if (!target.empty()) {
        string name;
        int64_t start, end;
        Graph g;
        parse_region(target, name, start, end);
        graph->get_path_range(name, start, end, g);
        graph->expand_context(g, context_steps);
        if (text_output) {
            to_text(cout, g);
        } else {
            vector<Graph> gb = { g };
            stream::write_buffered(cout, gb, 0);
        }
    }
    
    if (extract_threads) {
        list<XG::thread_t> threads;
        for (auto& p : graph->extract_threads(false)) {
            for (auto& t : p.second) {
                threads.push_back(t);
            }
        }
        for (auto& p : graph->extract_threads(true)) {
            for (auto& t : p.second) {
                threads.push_back(t);
            }
        }

        size_t thread_number = 0;
        for(XG::thread_t& thread : threads) {
            // Convert to a Path
            Path path;
            for(XG::ThreadMapping& m : thread) {
                // Convert all the mappings
                Mapping mapping;
                mapping.mutable_position()->set_node_id(m.node_id);
                mapping.mutable_position()->set_is_reverse(m.is_reverse);
                
                *(path.add_mapping()) = mapping;
            }
        
        
            // Give each thread a name
            path.set_name("_thread_" + to_string(thread_number++));
            
            // We need a Graph for serialization purposes. We do one chunk per
            // thread in case the threads are long.
            Graph g;
            
            *(g.add_path()) = path;
            
            // Dump the graph with its mappings. TODO: can we restrict these to
            // mappings to nodes we have already pulled out? Or pull out the
            // whole compressed graph?
            if (text_output) {
                to_text(cout, g);
            } else {
                vector<Graph> gb = { g };
                stream::write_buffered(cout, gb, 0);
            }
            
        }
    }

    if (!b_array_name.empty()) {
        // Dump B array
        ofstream out;
        out.open(b_array_name.c_str());
        graph->bs_dump(out);
    }

    return 0;
}
Esempio n. 5
0
/// Align the sequence stored in `alignment` against graph `g` with gssw and
/// write the resulting path back into `alignment`.
///
/// @param alignment            holds the read sequence on entry; receives the
///                             primary alignment's path on return
/// @param multi_alignments     if non-null, receives a copy of the primary
///                             alignment followed by the alternate alignments
///                             with positive score; only valid when pinning
/// @param g                    graph to align against
/// @param pinned_node_id       id of the node to pin the alignment to; 0 means
///                             an unpinned (local) alignment
/// @param pin_left             pin at the top-left instead of the bottom-right;
///                             requires a non-zero pinned_node_id
/// @param max_alt_alns         number of tracebacks requested from gssw; must
///                             be 1 when multi_alignments is null
/// @param print_score_matrices forwarded to gssw_mapping_to_alignment
///
/// Invalid argument combinations, or a pinned node that is absent from the
/// graph, print an error to stderr and terminate with EXIT_FAILURE.
void Aligner::align_internal(Alignment& alignment, vector<Alignment>* multi_alignments, Graph& g,
                             int64_t pinned_node_id, bool pin_left, int32_t max_alt_alns, bool print_score_matrices) {

    // check input integrity
    if (pin_left && !pinned_node_id) {
        cerr << "error:[Aligner] cannot choose pinned end in non-pinned alignment" << endl;
        exit(EXIT_FAILURE);
    }
    if (multi_alignments && !pinned_node_id) {
        cerr << "error:[Aligner] multiple traceback is not valid in local alignment, only pinned and global" << endl;
        exit(EXIT_FAILURE);
    }
    if (!(multi_alignments) && max_alt_alns != 1) {
        cerr << "error:[Aligner] cannot specify maximum number of alignments in single alignment" << endl;
        exit(EXIT_FAILURE);
    }


    // alignment pinning algorithm is based on pinning in bottom right corner, if pinning in top
    // left we need to reverse all the sequences first and translate the alignment back later

    // create reversed objects if necessary
    Graph reversed_graph;
    string reversed_sequence;
    if (pin_left) {
        reversed_sequence.resize(alignment.sequence().length());

        reverse_copy(alignment.sequence().begin(), alignment.sequence().end(), reversed_sequence.begin());
        reverse_graph(g, reversed_graph);
    }

    // choose forward or reversed objects
    Graph* align_graph;
    string* align_sequence;
    if (pin_left) {
        align_graph = &reversed_graph;
        align_sequence = &reversed_sequence;
    }
    else {
        align_graph = &g;
        align_sequence = alignment.mutable_sequence();
    }

    // convert into gssw graph and get the counterpart to pinned node (if pinning)
    gssw_node* pinned_node = nullptr;
    gssw_graph* graph = create_gssw_graph(*align_graph, pinned_node_id, &pinned_node);

    // A pinned alignment was requested but the node was not found. This must be
    // a logical AND: a bitwise AND against the 0/1 result of !pinned_node would
    // only test the lowest bit of the id and miss every even node id.
    if (pinned_node_id && !pinned_node) {
        cerr << "error:[Aligner] pinned node for pinned alignment is not in graph" << endl;
        exit(EXIT_FAILURE);
    }

    // perform dynamic programming
    // NOTE(review): the trailing 15 and 2 are passed through to gssw unchanged;
    // their meaning is defined by gssw_graph_fill -- confirm against gssw.
    gssw_graph_fill(graph, (*align_sequence).c_str(),
                    nt_table, score_matrix,
                    gap_open, gap_extension, 15, 2);

    // traceback either from pinned position or optimal local alignment
    if (pinned_node) {
        // trace back pinned alignment
        gssw_graph_mapping** gms = gssw_graph_trace_back_pinned_multi (graph,
                                   pinned_node,
                                   max_alt_alns,
                                   (*align_sequence).c_str(),
                                   (*align_sequence).size(),
                                   nt_table,
                                   score_matrix,
                                   gap_open,
                                   gap_extension);

        if (pin_left) {
            // translate graph and mappings into original node space
            unreverse_graph(reversed_graph);
            for (int32_t i = 0; i < max_alt_alns; i++) {
                unreverse_graph_mapping(gms[i]);
            }
        }

        // convert optimal alignment and store it in the input Alignment object (in the multi alignment,
        // this will have been set to the first in the vector)
        if (gms[0]->score > 0) {
            // have a mapping, can just convert normally
            gssw_mapping_to_alignment(graph, gms[0], alignment, print_score_matrices);
        }
        else {
            // gssw will not identify mappings with 0 score, infer location based on pinning

            Mapping* mapping = alignment.mutable_path()->add_mapping();
            mapping->set_rank(1);

            // locate at the end of the node
            Position* position = mapping->mutable_position();
            position->set_node_id(pinned_node_id);
            position->set_offset(pin_left ? 0 : pinned_node->len);

            // soft clip
            Edit* edit = mapping->add_edit();
            edit->set_to_length(alignment.sequence().length());
            edit->set_sequence(alignment.sequence());
        }


        if (multi_alignments) {
            // determine how many non-null alignments were returned
            int32_t num_non_null = max_alt_alns;
            for (int32_t i = 1; i < max_alt_alns; i++) {
                if (gms[i]->score <= 0) {
                    num_non_null = i;
                    break;
                }
            }

            // reserve to avoid illegal access errors that occur when the vector reallocates
            multi_alignments->reserve(num_non_null);

            // copy the primary alignment
            multi_alignments->emplace_back(alignment);

            // convert the alternate alignments and store them at the back of the vector (this will not
            // execute if we are doing single alignment)
            for (int32_t i = 1; i < num_non_null; i++) {
                gssw_graph_mapping* gm = gms[i];

                // make new alignment object
                multi_alignments->emplace_back();
                Alignment& next_alignment = multi_alignments->back();

                // copy over sequence information from the primary alignment
                next_alignment.set_sequence(alignment.sequence());
                next_alignment.set_quality(alignment.quality());

                // get path of the alternate alignment
                gssw_mapping_to_alignment(graph, gm, next_alignment, print_score_matrices);

            }
        }

        // release every mapping gssw handed back, then the array itself
        for (int32_t i = 0; i < max_alt_alns; i++) {
            gssw_graph_mapping_destroy(gms[i]);
        }
        free(gms);
    }
    else {
        // trace back local alignment
        gssw_graph_mapping* gm = gssw_graph_trace_back (graph,
                                 (*align_sequence).c_str(),
                                 (*align_sequence).size(),
                                 nt_table,
                                 score_matrix,
                                 gap_open,
                                 gap_extension);

        gssw_mapping_to_alignment(graph, gm, alignment, print_score_matrices);
        gssw_graph_mapping_destroy(gm);
    }

    //gssw_graph_print_score_matrices(graph, sequence.c_str(), sequence.size(), stderr);

    gssw_graph_destroy(graph);

}
Esempio n. 6
0
/// Translate a gssw traceback into the Path of an Alignment.
///
/// The alignment's existing path is discarded; its score is taken from the
/// gssw mapping, its query position is set to 0, and one Mapping is appended
/// per node cigar that has at least one element. Identity is recomputed from
/// the finished path.
///
/// @param graph                gssw graph, used only for score-matrix printing
/// @param gm                   gssw traceback to convert
/// @param alignment            alignment to fill; its sequence is the read
/// @param print_score_matrices when true, dump the score matrices to stderr
void Aligner::gssw_mapping_to_alignment(gssw_graph* graph,
                                        gssw_graph_mapping* gm,
                                        Alignment& alignment,
                                        bool print_score_matrices) {
    alignment.clear_path();
    alignment.set_score(gm->score);
    alignment.set_query_position(0);
    Path* aln_path = alignment.mutable_path();

    string& read_seq = *alignment.mutable_sequence();

    if (print_score_matrices) {
        gssw_graph_print_score_matrices(graph, read_seq.c_str(), read_seq.size(), stderr);
    }

    gssw_graph_cigar* full_cigar = &gm->cigar;

    // running offsets into the read and into the current node
    int read_offset = 0;
    int node_offset = gm->position;

    for (int node_idx = 0; node_idx < full_cigar->length; ++node_idx) {
        gssw_node_cigar* node_cigar = full_cigar->elements + node_idx;

        // only the first node starts mid-node; all later ones start at 0
        if (node_idx != 0) {
            node_offset = 0;
        }

        // nothing to record for a node whose cigar is empty
        gssw_cigar* cigar = node_cigar->cigar;
        const int num_elements = cigar->length;
        if (num_elements == 0) {
            continue;
        }

        Node* ref_node = (Node*) node_cigar->node->data;
        string& ref_seq = *ref_node->mutable_sequence();

        Mapping* mapping = aln_path->add_mapping();
        Position* start = mapping->mutable_position();
        start->set_node_id(node_cigar->node->id);
        start->set_offset(node_offset);
        mapping->set_rank(aln_path->mapping_size());

        // append a pure-match edit of the given span, skipping empty spans
        auto emit_match = [&](int span) {
            if (span > 0) {
                Edit* match = mapping->add_edit();
                match->set_from_length(span);
                match->set_to_length(span);
            }
        };

        for (int elem_idx = 0; elem_idx < num_elements; ++elem_idx) {
            gssw_cigar_element* element = cigar->elements + elem_idx;
            const int32_t run_length = element->length;

            switch (element->type) {
            case 'M':
            case 'X':
            case 'N': {
                // walk the aligned columns, splitting the run into stretches
                // of matches separated by single-base substitutions
                const int run_end = node_offset + run_length;
                int match_begin = node_offset;
                int read_i = read_offset;
                for (int ref_i = node_offset; ref_i < run_end; ++ref_i, ++read_i) {
                    if (ref_seq[ref_i] == read_seq[read_i]) {
                        continue;
                    }
                    // flush the matches seen so far, then record the mismatch
                    emit_match(ref_i - match_begin);
                    Edit* snp = mapping->add_edit();
                    snp->set_from_length(1);
                    snp->set_to_length(1);
                    snp->set_sequence(read_seq.substr(read_i, 1));
                    match_begin = ref_i + 1;
                }
                // trailing matches (or the whole run when nothing mismatched)
                emit_match(run_end - match_begin);
                read_offset += run_length;
                node_offset += run_length;
                break;
            }
            case 'D': {
                // deletion: consumes node sequence only
                Edit* deletion = mapping->add_edit();
                deletion->set_from_length(run_length);
                deletion->set_to_length(0);
                node_offset += run_length;
                break;
            }
            case 'I':
            case 'S': {
                // insertions and soft clips are recorded the same way: read
                // bases with no counterpart in the node. They are told apart
                // only by where they fall in the read (soft clips at the ends).
                Edit* insertion = mapping->add_edit();
                insertion->set_from_length(0);
                insertion->set_to_length(run_length);
                insertion->set_sequence(read_seq.substr(read_offset, run_length));
                read_offset += run_length;
                break;
            }
            default:
                cerr << "error:[Aligner::gssw_mapping_to_alignment] "
                     << "unsupported cigar op type " << element->type << endl;
                exit(1);
                break;
            }
        }
    }

    // set identity
    alignment.set_identity(identity(alignment.path()));
}