Esempio n. 1
0
/**
 * Example program: builds a graph from one hard-coded sequence and walks the
 * simple path that starts at the first node of that sequence.
 *
 * The command-line arguments are not used.
 *
 * @param argc number of command-line arguments (unused)
 * @param argv command-line arguments (unused)
 * @return EXIT_SUCCESS when the walk completed, EXIT_FAILURE when an
 *         Exception was caught.
 */
int main(int argc, char* argv[])
{
    try
    {
        size_t kmerSize = 11;
        const char* seq = "AGGCGCTAGGGTAGAGGATGATGA";

        std::cout
            << "The initial sequence is '"
            << seq
            << "', kmer size is "
            << kmerSize
            << std::endl;

        // We create the graph from a given sequence, and for a given kmer size.
        // The value is matched against a "%d" placeholder through a variadic
        // call, so it must be passed as an int: passing a size_t there is a
        // type mismatch on platforms where size_t is wider than int.
        Graph graph = Graph::create(
            new BankStrings( seq, NULL),
            "-kmer-size %d -abundance-min 1 -verbose 0", static_cast<int>(kmerSize)
        );

        // Get the first node
        Node node = graph.buildNode(seq);

        // Create a node iterator that iterates over all the simple nodes from
        // the first node. Remember that a simple node has inDeg = outDeg = 1
        Graph::Iterator<Node> path =
            graph.simplePath<Node> (node, DIR_OUTCOMING);

        // Iterate over the simple path:
        for (path.start(); !path.isDone(); path.next())
        {
            std::cout
                << "    ["
                << path.rank()
                << "] current item is "
                << graph.toString(path.item())
                << std::endl;
        }

        // The rank reached once the iteration is done is reported as the
        // length of the path.
        std::cout
            << "The simple path was "
            << path.rank()
            << " long"
            << std::endl;
    }
    catch(Exception& e)
    {
        std::cerr << "EXCEPTION: " << e.getMessage() << std::endl;
        // Report the failure to the caller instead of pretending success.
        return EXIT_FAILURE;
    }

    return EXIT_SUCCESS;
}
Esempio n. 2
0
/**
 * Benchmark functor comparing two graph flavors: the plain Graph and the
 * GraphTemplate instantiated with NodeFast/EdgeFast for the given span.
 *
 * Both graphs are built from the same input, then a series of per-node
 * operations (minimizer computation, MPHF index, hashing, neighbors(),
 * isBranching()) is timed over all nodes. Each reported time has a previously
 * measured "baseline" loop time subtracted from it. The neighbors() and
 * isBranching() measurements are repeated after precomputeAdjacency().
 *
 * All results are printed to cout; nothing is returned.
 *
 * @tparam span  template parameter forwarded to NodeFast, EdgeFast,
 *               GraphDataVariantFast and Kmer.
 * @param params benchmark parameters; this function reads params.k (kmer
 *               size), params.seq (optional sequence string) and params.args
 *               (argument string given to the graph factory).
 */
template<size_t span> struct debruijn_mphf_bench {  void operator ()  (Parameter params)
{
    typedef NodeFast<span> NodeFastT;
    typedef GraphTemplate<NodeFastT,EdgeFast<span>,GraphDataVariantFast<span>> GraphFast;

    size_t kmerSize = params.k;
    
    Graph graph; 
    GraphFast graphFast;
  
    // With no sequence given, both graphs are created from the argument string
    // alone; otherwise they are created from a BankStrings wrapping params.seq.
    if (params.seq == "") 
    {
        graph = Graph::create (params.args.c_str());
        graphFast = GraphFast::create (params.args.c_str());
    }
    else
    {
        graph = Graph::create (new BankStrings (params.seq.c_str(), 0), params.args.c_str());
        graphFast = GraphFast::create (new BankStrings (params.seq.c_str(), 0), params.args.c_str());

    }

    cout << "graph built, benchmarking.." << endl;

    
    int miniSize = 8;
    // NOTE(review): NB_REPETITIONS is not used anywhere in this function.
    int NB_REPETITIONS = 2000000;

    // Every diff_wtime() result below is divided by this value before being
    // printed as seconds (presumably diff_wtime returns nanoseconds - confirm).
    double unit = 1000000000;
    cout.setf(ios_base::fixed);
    cout.precision(3);

    // One node iterator per graph flavor; they are reused for every timed loop.
    Graph::Iterator<Node> nodes = graph.iterator();
    typename GraphFast::template Iterator<NodeFastT> nodesFast = graphFast.iterator();
    nodes.first ();

    /** We get the first node. */
    Node node = nodes.item();

    // NOTE(review): ModelDirect and KmerType are declared but not used in this function.
    typedef typename Kmer<span>::Type  Type;
    typedef typename Kmer<span>::ModelCanonical  ModelCanonical;
    typedef typename Kmer<span>::ModelDirect     ModelDirect;
    typedef typename Kmer<span>::template ModelMinimizer <ModelCanonical>   ModelMini;
    typedef typename ModelMini::Kmer                        KmerType;

    ModelMini  modelMini (kmerSize, miniSize);
    ModelCanonical  modelCanonical (kmerSize);

     // For some reason, if this code is merely *compiled*, it makes later MPHF queries 3x slower.
     // Really? Yes: try replacing "if (confuse_mphf)" by "if (confuse_mphf && 0)" and re-run, you will see.
    {
        // The flag is always false and the guarded body contains only
        // commented-out code, so nothing is executed here.
        bool confuse_mphf = false;
        if (confuse_mphf)
        {
            //Type b; b.setVal(0); 
            //modelCanonical.emphf_hasher(modelCanonical.adaptor(b)); 
            //typedef std::pair<u_int8_t const*, u_int8_t const*> byte_range_t;

            //int c = 0; 
            //byte_range_t brange( reinterpret_cast <u_int8_t const*> (&c), reinterpret_cast <u_int8_t const*> (&c) + 2 );
            //byte_range_t brange( (u_int8_t const*) 1,(u_int8_t const*)33);
            //auto hashes = modelCanonical.empfh_hasher(brange);
        }


        // The loop bound is 0, so this body never runs; it is compiled only.
        for (int i = 0; i < 0; i++)
        {
            auto start_tt=chrono::system_clock::now();
            for (nodes.first(); !nodes.isDone(); nodes.next())
                modelCanonical.EMPHFhash(nodes.item().kmer.get<Type>());
            auto end_tt=chrono::system_clock::now();
            cout << "time to do " << nodes.size() << " computing EMPHFhash of kmers on all nodes (" << kmerSize << "-mers) : " << (diff_wtime(start_tt, end_tt) / unit) << " seconds" << endl;
        }
        // It's slow and I don't understand why; see above for the "confuse mphf" part.
        //return; //FIXME
    }


    /** We get the value of the first node (just an example, it's not used later). */
    Type kmer = node.kmer.get<Type>();
    
    // Timestamps reused for every measurement below.
    auto start_t=chrono::system_clock::now();
    auto end_t=chrono::system_clock::now();
			
   cout << "----\non all nodes of the graph\n-----\n";

    /* Disable node state, because we don't want to pay the overhead of checking whether a node is deleted or not in contain(). */
   std::cout<< "PAY ATTENTION: this neighbor() benchmark, in the Bloom flavor, is without performing a MPHF query for each found node" << std::endl; 

   graph.disableNodeState(); 
   graphFast.disableNodeState(); 

   /* Compute baseline times (= overheads we're not interested in).
      Each baseline is subtracted from the matching measurement further down. */

    // Baseline: bare iteration over the Graph nodes.
    start_t=chrono::system_clock::now();
    for (nodes.first(); !nodes.isDone(); nodes.next())
    {}
    end_t=chrono::system_clock::now();
    auto baseline_graph_time = diff_wtime(start_t, end_t) / unit;
    cout << "baseline overhead for graph nodes enumeration (" << nodes.size() << " nodes) : " << baseline_graph_time << " seconds" << endl;

    // Baseline: bare iteration over the GraphFast nodes.
    // NOTE(review): this and every later NodeFast message prints nodes.size()
    // (the Graph iterator), not nodesFast.size() - presumably both graphs hold
    // the same number of nodes; confirm.
    start_t=chrono::system_clock::now();
    for (nodesFast.first(); !nodesFast.isDone(); nodesFast.next())
    {}
    end_t=chrono::system_clock::now();
    auto baseline_graphfast_time = diff_wtime(start_t, end_t) / unit;
    cout << "baseline overhead for graph NodeFast enumeration (" << nodes.size() << " nodes) : " << baseline_graphfast_time << " seconds" << endl;


    // Baseline: iteration plus the "dummy" minimizer call.
    start_t=chrono::system_clock::now();
    for (nodes.first(); !nodes.isDone(); nodes.next())
        modelMini.getMinimizerValueDummy(nodes.item().kmer.get<Type>());
    end_t=chrono::system_clock::now();
    auto baseline_minim_time = diff_wtime(start_t, end_t) / unit;
    cout << "baseline overhead for graph nodes enumeration and minimizer computation setup (" << nodes.size() << " nodes) : " << baseline_minim_time << " seconds" << endl;

    // Baseline: iteration plus extraction of the kmer value from each node.
    start_t=chrono::system_clock::now();
    for (nodes.first(); !nodes.isDone(); nodes.next())
        nodes.item().getKmer<Type>();
    end_t=chrono::system_clock::now();
    auto baseline_hash_time = diff_wtime(start_t, end_t) / unit;
    cout << "baseline overhead for graph nodes enumeration and hash computation setup (" << nodes.size() << " nodes) : " << baseline_hash_time << " seconds" << endl;

    // Baseline: iteration plus access to the kmer member of each NodeFast.
    start_t=chrono::system_clock::now();
    for (nodesFast.first(); !nodesFast.isDone(); nodesFast.next())
       nodesFast.item().kmer;
    end_t=chrono::system_clock::now();
    auto baseline_hashfast_time = diff_wtime(start_t, end_t) / unit;
    cout << "baseline overhead for graph NodeFast enumeration and hash computation setup (" << nodes.size() << " nodes) : " << baseline_hashfast_time << " seconds" << endl;


    // Baseline: iteration plus the "dummy" MPHF index call on Graph.
    start_t=chrono::system_clock::now();
    for (nodes.first(); !nodes.isDone(); nodes.next())
        graph.nodeMPHFIndexDummy(nodes.item());
    end_t=chrono::system_clock::now();
    auto baseline_mphf_time = diff_wtime(start_t, end_t) / unit;
    cout << "baseline overhead for graph nodes enumeration and mphf query setup (" << nodes.size() << " nodes) : " << baseline_mphf_time << " seconds" << endl;

    // Baseline: iteration plus the "dummy" MPHF index call on GraphFast.
    start_t=chrono::system_clock::now();
    for (nodesFast.first(); !nodesFast.isDone(); nodesFast.next())
        graphFast.nodeMPHFIndexDummy(nodesFast.item());
    end_t=chrono::system_clock::now();
    auto baseline_mphffast_time = diff_wtime(start_t, end_t) / unit;
    cout << "baseline overhead for graph NodeFast enumeration and mphf query setup (" << nodes.size() << " nodes) : " << baseline_mphffast_time << " seconds" << endl;


    /* Do the actual benchmark. */


    // Minimizer computation, reported net of baseline_minim_time.
    start_t=chrono::system_clock::now();
    for (nodes.first(); !nodes.isDone(); nodes.next())
        modelMini.getMinimizerValue(nodes.item().kmer.get<Type>(), true);
    end_t=chrono::system_clock::now();

    cout << "time to do " << nodes.size() << " computations of minimizers (fast method) of length " << miniSize << " on all nodes (" << kmerSize << "-mers) : " << (diff_wtime(start_t, end_t) / unit) - baseline_minim_time << " seconds" << endl;

    // MPHF index on Graph, reported net of baseline_mphf_time.
    start_t=chrono::system_clock::now();
    for (nodes.first(); !nodes.isDone(); nodes.next())
        graph.nodeMPHFIndex(nodes.item());
    end_t=chrono::system_clock::now();

    cout << "time to do " << nodes.size() << " computations of MPHF index on all nodes (" << kmerSize << "-mers) : " << (diff_wtime(start_t, end_t) / unit) - baseline_mphf_time << " seconds" << endl;

    // MPHF index on GraphFast, reported net of baseline_mphffast_time.
    start_t=chrono::system_clock::now();
    for (nodesFast.first(); !nodesFast.isDone(); nodesFast.next())
        graphFast.nodeMPHFIndex(nodesFast.item());
    end_t=chrono::system_clock::now();

    cout << "time to do " << nodes.size() << " computations of MPHF index on all NodeFast (" << kmerSize << "-mers) : " << (diff_wtime(start_t, end_t) / unit) - baseline_mphffast_time << " seconds" << endl;


    // getHash on Graph nodes, reported net of baseline_hash_time.
    start_t=chrono::system_clock::now();
    for (nodes.first(); !nodes.isDone(); nodes.next())
        modelCanonical.getHash(nodes.item().kmer.get<Type>());
    end_t=chrono::system_clock::now();

    cout << "time to do " << nodes.size() << " computing hash1 of kmers on all nodes (" << kmerSize << "-mers) : " << (diff_wtime(start_t, end_t) / unit) - baseline_hash_time << " seconds" << endl;

    // getHash2 on Graph nodes, reported net of baseline_hash_time.
    start_t=chrono::system_clock::now();
    for (nodes.first(); !nodes.isDone(); nodes.next())
        modelCanonical.getHash2(nodes.item().kmer.get<Type>());
    end_t=chrono::system_clock::now();

    cout << "time to do " << nodes.size() << " computing hash2 of kmers on all nodes (" << kmerSize << "-mers) : " << (diff_wtime(start_t, end_t) / unit) - baseline_hash_time << " seconds" << endl;

    // getHash2 on NodeFast kmers, reported net of baseline_hashfast_time.
    start_t=chrono::system_clock::now();
    for (nodesFast.first(); !nodesFast.isDone(); nodesFast.next())
        modelCanonical.getHash2(nodesFast.item().kmer);
    end_t=chrono::system_clock::now();

    cout << "time to do " << nodes.size() << " computing hash2 of kmers on all NodeFast (" << kmerSize << "-mers) : " << (diff_wtime(start_t, end_t) / unit) - baseline_hashfast_time << " seconds" << endl;


    // EMPHFhash on Graph nodes, reported net of baseline_hash_time.
    start_t=chrono::system_clock::now();
    for (nodes.first(); !nodes.isDone(); nodes.next())
        modelCanonical.EMPHFhash(nodes.item().kmer.get<Type>());
    end_t=chrono::system_clock::now();

    cout << "time to do " << nodes.size() << " computing EMPHFhash of kmers on all nodes (" << kmerSize << "-mers) : " << (diff_wtime(start_t, end_t) / unit) - baseline_hash_time << " seconds" << endl;
    // It's slow and I don't understand why; see above for the "confuse mphf" part.


    // neighborsDummy on Graph, reported net of baseline_graph_time.
    start_t=chrono::system_clock::now();
    for (nodes.first(); !nodes.isDone(); nodes.next())
        graph.neighborsDummy(nodes.item());
    end_t=chrono::system_clock::now();

    cout << "time to do " << nodes.size() << " dummy neighbors() query on all nodes (" << kmerSize << "-mers) : " << (diff_wtime(start_t, end_t) / unit) - baseline_graph_time << " seconds" << endl;




    // neighbors() on Graph before adjacency precomputation.
    start_t=chrono::system_clock::now();
    for (nodes.first(); !nodes.isDone(); nodes.next())
        graph.neighbors(nodes.item());
    end_t=chrono::system_clock::now();

    cout << "time to do " << nodes.size() << " neighbors() query on all nodes (" << kmerSize << "-mers) : " << (diff_wtime(start_t, end_t) / unit) - baseline_graph_time << " seconds" << endl;

    // neighbors() on GraphFast before adjacency precomputation.
    start_t=chrono::system_clock::now();
    for (nodesFast.first(); !nodesFast.isDone(); nodesFast.next())
        graphFast.neighbors(nodesFast.item());
    end_t=chrono::system_clock::now();

    cout << "time to do " << nodes.size() << " neighbors() query on all NodeFast (" << kmerSize << "-mers) : " << (diff_wtime(start_t, end_t) / unit) - baseline_graphfast_time << " seconds" << endl;


/* isBranching, before adjacency precomputation */
    start_t=chrono::system_clock::now();
    for (nodes.first(); !nodes.isDone(); nodes.next())
        graph.isBranching(nodes.item());
    end_t=chrono::system_clock::now();

    cout << "time to do " << nodes.size() << " isBranching() query on all nodes (" << kmerSize << "-mers) : " << (diff_wtime(start_t, end_t) / unit) - baseline_graph_time << " seconds" << endl;

    start_t=chrono::system_clock::now();
    for (nodesFast.first(); !nodesFast.isDone(); nodesFast.next())
        graphFast.isBranching(nodesFast.item());
    end_t=chrono::system_clock::now();

    cout << "time to do " << nodes.size() << " isBranching() query on all NodeFast (" << kmerSize << "-mers) : " << (diff_wtime(start_t, end_t) / unit) - baseline_graphfast_time << " seconds" << endl;



    
    /* Now, compute adjacency! The neighbors() and isBranching() measurements
       are then repeated on both graphs. */



    graph.precomputeAdjacency();
    graphFast.precomputeAdjacency();

    cout << "adjacency precomputed" << endl;

    // neighbors() on Graph after adjacency precomputation.
    start_t=chrono::system_clock::now();
    for (nodes.first(); !nodes.isDone(); nodes.next())
        graph.neighbors(nodes.item());
    end_t=chrono::system_clock::now();

    cout << "time to do " << nodes.size() << " neighbors() query on all nodes (" << kmerSize << "-mers) using adjacency : " << (diff_wtime(start_t, end_t) / unit) - baseline_graph_time << " seconds" << endl;

    // neighbors() on GraphFast after adjacency precomputation.
    start_t=chrono::system_clock::now();
    for (nodesFast.first(); !nodesFast.isDone(); nodesFast.next())
        graphFast.neighbors(nodesFast.item());
    end_t=chrono::system_clock::now();
    cout << "time to do " << nodes.size() << " fast neighbors() query on all NodeFast (" << kmerSize << "-mers) using adjacency : " << (diff_wtime(start_t, end_t) / unit) - baseline_graphfast_time << " seconds" << endl;
    
    /* isBranching, after adjacency precomputation */

    start_t=chrono::system_clock::now();
    for (nodes.first(); !nodes.isDone(); nodes.next())
        graph.isBranching(nodes.item());
    end_t=chrono::system_clock::now();

    cout << "time to do " << nodes.size() << " isBranching() query on all nodes (" << kmerSize << "-mers) using adjacency : " << (diff_wtime(start_t, end_t) / unit) - baseline_graph_time << " seconds" << endl;

    start_t=chrono::system_clock::now();
    for (nodesFast.first(); !nodesFast.isDone(); nodesFast.next())
        graphFast.isBranching(nodesFast.item());
    end_t=chrono::system_clock::now();
    cout << "time to do " << nodes.size() << " fast isBranching() query on all NodeFast (" << kmerSize << "-mers) using adjacency : " << (diff_wtime(start_t, end_t) / unit) - baseline_graphfast_time << " seconds" << endl;
    


    /** Graph removal is intentionally left disabled. */
    //graph.remove ();
    //graphFast.remove (); // no actually, I want to keep the .h5 file


}