Esempio n. 1
0
template<size_t span> struct debruijn_minim_bench {  void operator ()  (Parameter params)
{
    size_t kmerSize = params.k;
    const Graph& graph = params.graph;

    int miniSize = 8;
    int NB_REPETITIONS = 2000000;

    double unit = 1000000000;
    cout.setf(ios_base::fixed);
    cout.precision(3);

    GraphIterator<Node> nodes = graph.iterator ();

    cout << "graph has " << nodes.size() << " nodes" << endl;

    if (nodes.size() == 0)
        exit(1);

    nodes.first ();

    /** We get the first node. */
    Node node = nodes.item();

    typedef typename Kmer<span>::Type  Type;
    typedef typename Kmer<span>::ModelCanonical  ModelCanonical;
    typedef typename Kmer<span>::ModelDirect     ModelDirect;
    typedef typename Kmer<span>::template ModelMinimizer <ModelCanonical>   ModelMini;
    typedef typename ModelMini::Kmer                        KmerType;

    ModelMini  modelMini (kmerSize, miniSize);
    
    /** We get the value of the current minimizer. */
    
    Type kmer = node.kmer.get<Type>();
    
    auto start_t=chrono::system_clock::now();
    for (unsigned int i = 0 ; i < NB_REPETITIONS ; i++)
        modelMini.getMinimizerValue(kmer, false);
    auto end_t=chrono::system_clock::now();
			
    cout << NB_REPETITIONS << " minimizers of length " << miniSize << " on a " << kmerSize << "-mer : " << diff_wtime(start_t, end_t) / unit << " seconds" << endl;
	
    cout << "---- now on all nodes of the graph -----\n";
    int times = max((int)(NB_REPETITIONS / nodes.size()), 1);


    /* compute a baseline */

    start_t=chrono::system_clock::now();
     for (int i=0; i < times; i++)
        for (nodes.first(); !nodes.isDone(); nodes.next())
            modelMini.getMinimizerValueDummy(nodes.item().kmer.get<Type>());
    end_t=chrono::system_clock::now();
    auto baseline_time = diff_wtime(start_t, end_t) / unit;
    cout << "baseline overhead (" << nodes.size() << " nodes, " << times << " times) : " << baseline_time << " seconds" << endl;

    /* existing code */

    start_t=chrono::system_clock::now();
    for (int i=0; i < times; i++)
        for (nodes.first(); !nodes.isDone(); nodes.next())
            modelMini.getMinimizerValue(nodes.item().kmer.get<Type>(), false);
    end_t=chrono::system_clock::now();
    cout << nodes.size() << " minimizers of length " << miniSize << " on all nodes (" << kmerSize << "-mers), " << to_string(times) << " times, with existing code : " << (diff_wtime(start_t, end_t) / unit) - baseline_time << " seconds" << endl;

    start_t=chrono::system_clock::now();
    for (int i=0; i < times; i++)
        for (nodes.first(); !nodes.isDone(); nodes.next())
            modelMini.getMinimizerValue(nodes.item().kmer.get<Type>(), true);
    end_t=chrono::system_clock::now();
    cout << nodes.size() << " minimizers of length " << miniSize << " on all nodes (" << kmerSize << "-mers), " << to_string(times) << " times, with new method    : " << (diff_wtime(start_t, end_t) / unit) - baseline_time << " seconds" << endl;
    //cout << modelMini._invalidMinimizersCounter << "/" << modelMini._minimizersCounter << " normal/fast minimizer computations" << endl,

   /* checking agreement between old and new method*/
    
    cout << "checking agreement... ";
    for (nodes.first(); !nodes.isDone(); nodes.next())
    {
        if (modelMini.getMinimizerValue(nodes.item().kmer.get<Type>(), false) != modelMini.getMinimizerValue(nodes.item().kmer.get<Type>(), true))
        {
            cout << "FAIL! problem with kmer " << graph.toString(nodes.item()) << " : (old) " << modelMini.getMinimizerString(nodes.item().kmer.get<Type>(), false) << " vs (new) " << modelMini.getMinimizerString(nodes.item().kmer.get<Type>(), true) << endl;
            cout << "debug: integer representation of new minimizer: " << modelMini.getMinimizerValue(nodes.item().kmer.get<Type>(), true) << endl;

            exit(1);
        }

        if (modelMini.getMinimizerPosition(nodes.item().kmer.get<Type>(), false) != modelMini.getMinimizerPosition(nodes.item().kmer.get<Type>(), true))
        {
            cout << "FAIL! problem with minimizer positions of kmer " << graph.toString(nodes.item()) << " : (old position) " << modelMini.getMinimizerPosition(nodes.item().kmer.get<Type>(), false) << " vs (new position) " << modelMini.getMinimizerPosition(nodes.item().kmer.get<Type>(), true) << endl;
            cout << " minimizer: " << modelMini.getMinimizerString(nodes.item().kmer.get<Type>()) << endl;

            exit(1);
        }

    }
    cout << "all good." << endl;
}
Esempio n. 2
0
/**
 * Benchmark of minimizer computation, k-mer hashing, MPHF index queries and
 * neighbors() / isBranching() queries over all nodes of a de Bruijn graph, run
 * side by side on the regular Graph and on the NodeFast-based GraphFast.
 *
 * Both graphs are created from the same input: from params.args alone when
 * params.seq is empty, otherwise from a BankStrings built on params.seq.
 * Baseline loops (plain enumeration, or enumeration plus a "Dummy" call) are
 * timed first and subtracted from the matching measurements. The neighbors()
 * and isBranching() measurements are run twice: before and after
 * precomputeAdjacency().
 *
 * All results are printed on cout.
 *
 * \param params  provides the k-mer size (params.k), an optional sequence
 *                (params.seq) and the graph creation options (params.args).
 */
template<size_t span> struct debruijn_mphf_bench {  void operator ()  (Parameter params)
{
    typedef NodeFast<span> NodeFastT;
    typedef GraphTemplate<NodeFastT,EdgeFast<span>,GraphDataVariantFast<span>> GraphFast;

    size_t kmerSize = params.k;

    Graph graph;
    GraphFast graphFast;

    if (params.seq == "")
    {
        graph = Graph::create (params.args.c_str());
        graphFast = GraphFast::create (params.args.c_str());
    }
    else
    {
        // NOTE(review): ownership of the BankStrings objects is presumably taken
        // over by create() - confirm against the Graph API.
        graph = Graph::create (new BankStrings (params.seq.c_str(), 0), params.args.c_str());
        graphFast = GraphFast::create (new BankStrings (params.seq.c_str(), 0), params.args.c_str());
    }

    cout << "graph built, benchmarking.." << endl;

    int miniSize = 8;  // minimizer length handed to the minimizer model

    // Divisor applied to diff_wtime() results before they are reported as seconds
    // (diff_wtime is declared elsewhere; presumably it yields nanoseconds).
    double unit = 1000000000;
    cout.setf(ios_base::fixed);
    cout.precision(3);

    Graph::Iterator<Node> nodes = graph.iterator();
    typename GraphFast::template Iterator<NodeFastT> nodesFast = graphFast.iterator();

    typedef typename Kmer<span>::Type  Type;
    typedef typename Kmer<span>::ModelCanonical  ModelCanonical;
    typedef typename Kmer<span>::template ModelMinimizer <ModelCanonical>   ModelMini;

    ModelMini  modelMini (kmerSize, miniSize);
    ModelCanonical  modelCanonical (kmerSize);

    // For some reason, if *compiled*, this code makes later MPHF queries 3x slower. Really? Yes.
    // Try to replace "if (confuse_mphf)" by "if (confuse_mphf && 0)" and re-run, you will see.
    // The whole block is currently inert: the flag is false, the guarded code is
    // commented out and the loop below has a bound of 0. It is kept as
    // investigation scaffolding.
    {
        bool confuse_mphf = false;
        if (confuse_mphf)
        {
            //Type b; b.setVal(0); 
            //modelCanonical.emphf_hasher(modelCanonical.adaptor(b)); 
            //typedef std::pair<u_int8_t const*, u_int8_t const*> byte_range_t;

            //int c = 0; 
            //byte_range_t brange( reinterpret_cast <u_int8_t const*> (&c), reinterpret_cast <u_int8_t const*> (&c) + 2 );
            //byte_range_t brange( (u_int8_t const*) 1,(u_int8_t const*)33);
            //auto hashes = modelCanonical.empfh_hasher(brange);
        }


        for (int i = 0; i < 0; i++)
        {
            auto start_tt=chrono::system_clock::now();
            for (nodes.first(); !nodes.isDone(); nodes.next())
                modelCanonical.EMPHFhash(nodes.item().kmer.get<Type>());
            auto end_tt=chrono::system_clock::now();
            cout << "time to do " << nodes.size() << " computing EMPHFhash of kmers on all nodes (" << kmerSize << "-mers) : " << (diff_wtime(start_tt, end_tt) / unit) << " seconds" << endl;
        }
        // it's slow. i don't understand why. see above for the "confuse mphf" part
        //return; //FIXME
    }


    // Timestamps reused by every measurement below; the initial values are
    // overwritten before first use.
    auto start_t=chrono::system_clock::now();
    auto end_t=chrono::system_clock::now();

    cout << "----\non all nodes of the graph\n-----\n";

    /* disable node state (because we don't want to pay the price for the overhead of checking whether a node is deleted or not in contain()) */
    std::cout<< "PAY ATTENTION: this neighbor() benchmark, in the Bloom flavor, is without performing a MPHF query for each found node" << std::endl;

    graph.disableNodeState();
    graphFast.disableNodeState();

    /* compute baseline times (= overheads we're not interested in) */

    // Plain enumeration of the nodes of each graph.
    start_t=chrono::system_clock::now();
    for (nodes.first(); !nodes.isDone(); nodes.next())
    {}
    end_t=chrono::system_clock::now();
    auto baseline_graph_time = diff_wtime(start_t, end_t) / unit;
    cout << "baseline overhead for graph nodes enumeration (" << nodes.size() << " nodes) : " << baseline_graph_time << " seconds" << endl;

    start_t=chrono::system_clock::now();
    for (nodesFast.first(); !nodesFast.isDone(); nodesFast.next())
    {}
    end_t=chrono::system_clock::now();
    auto baseline_graphfast_time = diff_wtime(start_t, end_t) / unit;
    cout << "baseline overhead for graph NodeFast enumeration (" << nodesFast.size() << " nodes) : " << baseline_graphfast_time << " seconds" << endl;


    // Enumeration plus the Dummy variant of the minimizer call.
    start_t=chrono::system_clock::now();
    for (nodes.first(); !nodes.isDone(); nodes.next())
        modelMini.getMinimizerValueDummy(nodes.item().kmer.get<Type>());
    end_t=chrono::system_clock::now();
    auto baseline_minim_time = diff_wtime(start_t, end_t) / unit;
    cout << "baseline overhead for graph nodes enumeration and minimizer computation setup (" << nodes.size() << " nodes) : " << baseline_minim_time << " seconds" << endl;

    // Enumeration plus k-mer extraction (what the hash loops need before hashing).
    start_t=chrono::system_clock::now();
    for (nodes.first(); !nodes.isDone(); nodes.next())
        nodes.item().getKmer<Type>();
    end_t=chrono::system_clock::now();
    auto baseline_hash_time = diff_wtime(start_t, end_t) / unit;
    cout << "baseline overhead for graph nodes enumeration and hash computation setup (" << nodes.size() << " nodes) : " << baseline_hash_time << " seconds" << endl;

    start_t=chrono::system_clock::now();
    for (nodesFast.first(); !nodesFast.isDone(); nodesFast.next())
       nodesFast.item().kmer;
    end_t=chrono::system_clock::now();
    auto baseline_hashfast_time = diff_wtime(start_t, end_t) / unit;
    cout << "baseline overhead for graph NodeFast enumeration and hash computation setup (" << nodesFast.size() << " nodes) : " << baseline_hashfast_time << " seconds" << endl;


    // Enumeration plus the Dummy variant of the MPHF index query.
    start_t=chrono::system_clock::now();
    for (nodes.first(); !nodes.isDone(); nodes.next())
        graph.nodeMPHFIndexDummy(nodes.item());
    end_t=chrono::system_clock::now();
    auto baseline_mphf_time = diff_wtime(start_t, end_t) / unit;
    cout << "baseline overhead for graph nodes enumeration and mphf query setup (" << nodes.size() << " nodes) : " << baseline_mphf_time << " seconds" << endl;

    start_t=chrono::system_clock::now();
    for (nodesFast.first(); !nodesFast.isDone(); nodesFast.next())
        graphFast.nodeMPHFIndexDummy(nodesFast.item());
    end_t=chrono::system_clock::now();
    auto baseline_mphffast_time = diff_wtime(start_t, end_t) / unit;
    cout << "baseline overhead for graph NodeFast enumeration and mphf query setup (" << nodesFast.size() << " nodes) : " << baseline_mphffast_time << " seconds" << endl;


    /* do actual benchmark */


    /* minimizers */
    start_t=chrono::system_clock::now();
    for (nodes.first(); !nodes.isDone(); nodes.next())
        modelMini.getMinimizerValue(nodes.item().kmer.get<Type>(), true);
    end_t=chrono::system_clock::now();

    cout << "time to do " << nodes.size() << " computations of minimizers (fast method) of length " << miniSize << " on all nodes (" << kmerSize << "-mers) : " << (diff_wtime(start_t, end_t) / unit) - baseline_minim_time << " seconds" << endl;

    /* MPHF index */
    start_t=chrono::system_clock::now();
    for (nodes.first(); !nodes.isDone(); nodes.next())
        graph.nodeMPHFIndex(nodes.item());
    end_t=chrono::system_clock::now();

    cout << "time to do " << nodes.size() << " computations of MPHF index on all nodes (" << kmerSize << "-mers) : " << (diff_wtime(start_t, end_t) / unit) - baseline_mphf_time << " seconds" << endl;

    start_t=chrono::system_clock::now();
    for (nodesFast.first(); !nodesFast.isDone(); nodesFast.next())
        graphFast.nodeMPHFIndex(nodesFast.item());
    end_t=chrono::system_clock::now();

    cout << "time to do " << nodesFast.size() << " computations of MPHF index on all NodeFast (" << kmerSize << "-mers) : " << (diff_wtime(start_t, end_t) / unit) - baseline_mphffast_time << " seconds" << endl;


    /* k-mer hashes */
    start_t=chrono::system_clock::now();
    for (nodes.first(); !nodes.isDone(); nodes.next())
        modelCanonical.getHash(nodes.item().kmer.get<Type>());
    end_t=chrono::system_clock::now();

    cout << "time to do " << nodes.size() << " computing hash1 of kmers on all nodes (" << kmerSize << "-mers) : " << (diff_wtime(start_t, end_t) / unit) - baseline_hash_time << " seconds" << endl;

    start_t=chrono::system_clock::now();
    for (nodes.first(); !nodes.isDone(); nodes.next())
        modelCanonical.getHash2(nodes.item().kmer.get<Type>());
    end_t=chrono::system_clock::now();

    cout << "time to do " << nodes.size() << " computing hash2 of kmers on all nodes (" << kmerSize << "-mers) : " << (diff_wtime(start_t, end_t) / unit) - baseline_hash_time << " seconds" << endl;

    start_t=chrono::system_clock::now();
    for (nodesFast.first(); !nodesFast.isDone(); nodesFast.next())
        modelCanonical.getHash2(nodesFast.item().kmer);
    end_t=chrono::system_clock::now();

    cout << "time to do " << nodesFast.size() << " computing hash2 of kmers on all NodeFast (" << kmerSize << "-mers) : " << (diff_wtime(start_t, end_t) / unit) - baseline_hashfast_time << " seconds" << endl;


    start_t=chrono::system_clock::now();
    for (nodes.first(); !nodes.isDone(); nodes.next())
        modelCanonical.EMPHFhash(nodes.item().kmer.get<Type>());
    end_t=chrono::system_clock::now();

    cout << "time to do " << nodes.size() << " computing EMPHFhash of kmers on all nodes (" << kmerSize << "-mers) : " << (diff_wtime(start_t, end_t) / unit) - baseline_hash_time << " seconds" << endl;
    // it's slow. i don't understand why. see above for the "confuse mphf" part


    /* neighbors (before adjacency precomputation) */
    start_t=chrono::system_clock::now();
    for (nodes.first(); !nodes.isDone(); nodes.next())
        graph.neighborsDummy(nodes.item());
    end_t=chrono::system_clock::now();

    cout << "time to do " << nodes.size() << " dummy neighbors() query on all nodes (" << kmerSize << "-mers) : " << (diff_wtime(start_t, end_t) / unit) - baseline_graph_time << " seconds" << endl;




    start_t=chrono::system_clock::now();
    for (nodes.first(); !nodes.isDone(); nodes.next())
        graph.neighbors(nodes.item());
    end_t=chrono::system_clock::now();

    cout << "time to do " << nodes.size() << " neighbors() query on all nodes (" << kmerSize << "-mers) : " << (diff_wtime(start_t, end_t) / unit) - baseline_graph_time << " seconds" << endl;

    start_t=chrono::system_clock::now();
    for (nodesFast.first(); !nodesFast.isDone(); nodesFast.next())
        graphFast.neighbors(nodesFast.item());
    end_t=chrono::system_clock::now();

    cout << "time to do " << nodesFast.size() << " neighbors() query on all NodeFast (" << kmerSize << "-mers) : " << (diff_wtime(start_t, end_t) / unit) - baseline_graphfast_time << " seconds" << endl;


    /* isBranching (before adjacency precomputation) */
    start_t=chrono::system_clock::now();
    for (nodes.first(); !nodes.isDone(); nodes.next())
        graph.isBranching(nodes.item());
    end_t=chrono::system_clock::now();

    cout << "time to do " << nodes.size() << " isBranching() query on all nodes (" << kmerSize << "-mers) : " << (diff_wtime(start_t, end_t) / unit) - baseline_graph_time << " seconds" << endl;

    start_t=chrono::system_clock::now();
    for (nodesFast.first(); !nodesFast.isDone(); nodesFast.next())
        graphFast.isBranching(nodesFast.item());
    end_t=chrono::system_clock::now();

    cout << "time to do " << nodesFast.size() << " isBranching() query on all NodeFast (" << kmerSize << "-mers) : " << (diff_wtime(start_t, end_t) / unit) - baseline_graphfast_time << " seconds" << endl;




    /* now, compute adjacency! The same queries are then measured again. */



    graph.precomputeAdjacency();
    graphFast.precomputeAdjacency();

    cout << "adjacency precomputed" << endl;

    /* neighbors (using adjacency) */
    start_t=chrono::system_clock::now();
    for (nodes.first(); !nodes.isDone(); nodes.next())
        graph.neighbors(nodes.item());
    end_t=chrono::system_clock::now();

    cout << "time to do " << nodes.size() << " neighbors() query on all nodes (" << kmerSize << "-mers) using adjacency : " << (diff_wtime(start_t, end_t) / unit) - baseline_graph_time << " seconds" << endl;

    start_t=chrono::system_clock::now();
    for (nodesFast.first(); !nodesFast.isDone(); nodesFast.next())
        graphFast.neighbors(nodesFast.item());
    end_t=chrono::system_clock::now();
    cout << "time to do " << nodesFast.size() << " fast neighbors() query on all NodeFast (" << kmerSize << "-mers) using adjacency : " << (diff_wtime(start_t, end_t) / unit) - baseline_graphfast_time << " seconds" << endl;

    /* isBranching (using adjacency) */

    start_t=chrono::system_clock::now();
    for (nodes.first(); !nodes.isDone(); nodes.next())
        graph.isBranching(nodes.item());
    end_t=chrono::system_clock::now();

    cout << "time to do " << nodes.size() << " isBranching() query on all nodes (" << kmerSize << "-mers) using adjacency : " << (diff_wtime(start_t, end_t) / unit) - baseline_graph_time << " seconds" << endl;

    start_t=chrono::system_clock::now();
    for (nodesFast.first(); !nodesFast.isDone(); nodesFast.next())
        graphFast.isBranching(nodesFast.item());
    end_t=chrono::system_clock::now();
    cout << "time to do " << nodesFast.size() << " fast isBranching() query on all NodeFast (" << kmerSize << "-mers) using adjacency : " << (diff_wtime(start_t, end_t) / unit) - baseline_graphfast_time << " seconds" << endl;



    /** The graphs are deliberately not removed here. */
    //graph.remove ();
    //graphFast.remove (); // no actually, I want to keep the .h5 file


}