template<size_t span> struct debruijn_minim_bench { void operator () (Parameter params) { size_t kmerSize = params.k; const Graph& graph = params.graph; int miniSize = 8; int NB_REPETITIONS = 2000000; double unit = 1000000000; cout.setf(ios_base::fixed); cout.precision(3); GraphIterator<Node> nodes = graph.iterator (); cout << "graph has " << nodes.size() << " nodes" << endl; if (nodes.size() == 0) exit(1); nodes.first (); /** We get the first node. */ Node node = nodes.item(); typedef typename Kmer<span>::Type Type; typedef typename Kmer<span>::ModelCanonical ModelCanonical; typedef typename Kmer<span>::ModelDirect ModelDirect; typedef typename Kmer<span>::template ModelMinimizer <ModelCanonical> ModelMini; typedef typename ModelMini::Kmer KmerType; ModelMini modelMini (kmerSize, miniSize); /** We get the value of the current minimizer. */ Type kmer = node.kmer.get<Type>(); auto start_t=chrono::system_clock::now(); for (unsigned int i = 0 ; i < NB_REPETITIONS ; i++) modelMini.getMinimizerValue(kmer, false); auto end_t=chrono::system_clock::now(); cout << NB_REPETITIONS << " minimizers of length " << miniSize << " on a " << kmerSize << "-mer : " << diff_wtime(start_t, end_t) / unit << " seconds" << endl; cout << "---- now on all nodes of the graph -----\n"; int times = max((int)(NB_REPETITIONS / nodes.size()), 1); /* compute a baseline */ start_t=chrono::system_clock::now(); for (int i=0; i < times; i++) for (nodes.first(); !nodes.isDone(); nodes.next()) modelMini.getMinimizerValueDummy(nodes.item().kmer.get<Type>()); end_t=chrono::system_clock::now(); auto baseline_time = diff_wtime(start_t, end_t) / unit; cout << "baseline overhead (" << nodes.size() << " nodes, " << times << " times) : " << baseline_time << " seconds" << endl; /* existing code */ start_t=chrono::system_clock::now(); for (int i=0; i < times; i++) for (nodes.first(); !nodes.isDone(); nodes.next()) modelMini.getMinimizerValue(nodes.item().kmer.get<Type>(), false); end_t=chrono::system_clock::now(); cout << nodes.size() << " minimizers of length " << miniSize << " on all nodes (" << kmerSize << "-mers), " << to_string(times) << " times, with existing code : " << (diff_wtime(start_t, end_t) / unit) - baseline_time << " seconds" << endl; start_t=chrono::system_clock::now(); for (int i=0; i < times; i++) for (nodes.first(); !nodes.isDone(); nodes.next()) modelMini.getMinimizerValue(nodes.item().kmer.get<Type>(), true); end_t=chrono::system_clock::now(); cout << nodes.size() << " minimizers of length " << miniSize << " on all nodes (" << kmerSize << "-mers), " << to_string(times) << " times, with new method : " << (diff_wtime(start_t, end_t) / unit) - baseline_time << " seconds" << endl; //cout << modelMini._invalidMinimizersCounter << "/" << modelMini._minimizersCounter << " normal/fast minimizer computations" << endl, /* checking agreement between old and new method*/ cout << "checking agreement... "; for (nodes.first(); !nodes.isDone(); nodes.next()) { if (modelMini.getMinimizerValue(nodes.item().kmer.get<Type>(), false) != modelMini.getMinimizerValue(nodes.item().kmer.get<Type>(), true)) { cout << "FAIL! problem with kmer " << graph.toString(nodes.item()) << " : (old) " << modelMini.getMinimizerString(nodes.item().kmer.get<Type>(), false) << " vs (new) " << modelMini.getMinimizerString(nodes.item().kmer.get<Type>(), true) << endl; cout << "debug: integer representation of new minimizer: " << modelMini.getMinimizerValue(nodes.item().kmer.get<Type>(), true) << endl; exit(1); } if (modelMini.getMinimizerPosition(nodes.item().kmer.get<Type>(), false) != modelMini.getMinimizerPosition(nodes.item().kmer.get<Type>(), true)) { cout << "FAIL! problem with minimizer positions of kmer " << graph.toString(nodes.item()) << " : (old position) " << modelMini.getMinimizerPosition(nodes.item().kmer.get<Type>(), false) << " vs (new position) " << modelMini.getMinimizerPosition(nodes.item().kmer.get<Type>(), true) << endl; cout << " minimizer: " << modelMini.getMinimizerString(nodes.item().kmer.get<Type>()) << endl; exit(1); } } cout << "all good." << endl; }
template<size_t span> struct debruijn_mphf_bench { void operator () (Parameter params) { typedef NodeFast<span> NodeFastT; typedef GraphTemplate<NodeFastT,EdgeFast<span>,GraphDataVariantFast<span>> GraphFast; size_t kmerSize = params.k; Graph graph; GraphFast graphFast; if (params.seq == "") { graph = Graph::create (params.args.c_str()); graphFast = GraphFast::create (params.args.c_str()); } else { graph = Graph::create (new BankStrings (params.seq.c_str(), 0), params.args.c_str()); graphFast = GraphFast::create (new BankStrings (params.seq.c_str(), 0), params.args.c_str()); } cout << "graph built, benchmarking.." << endl; int miniSize = 8; int NB_REPETITIONS = 2000000; double unit = 1000000000; cout.setf(ios_base::fixed); cout.precision(3); Graph::Iterator<Node> nodes = graph.iterator(); typename GraphFast::template Iterator<NodeFastT> nodesFast = graphFast.iterator(); nodes.first (); /** We get the first node. */ Node node = nodes.item(); typedef typename Kmer<span>::Type Type; typedef typename Kmer<span>::ModelCanonical ModelCanonical; typedef typename Kmer<span>::ModelDirect ModelDirect; typedef typename Kmer<span>::template ModelMinimizer <ModelCanonical> ModelMini; typedef typename ModelMini::Kmer KmerType; ModelMini modelMini (kmerSize, miniSize); ModelCanonical modelCanonical (kmerSize); // for some reason.. if *compiled*, this code confuses makes later MPHF queries 3x slower. really? yes. try to replace "if (confuse_mphf)" by "if (confuse_mphf && 0)" and re-run, you will see. { bool confuse_mphf = false; if (confuse_mphf) { //Type b; b.setVal(0); //modelCanonical.emphf_hasher(modelCanonical.adaptor(b)); //typedef std::pair<u_int8_t const*, u_int8_t const*> byte_range_t; //int c = 0; //byte_range_t brange( reinterpret_cast <u_int8_t const*> (&c), reinterpret_cast <u_int8_t const*> (&c) + 2 ); //byte_range_t brange( (u_int8_t const*) 1,(u_int8_t const*)33); //auto hashes = modelCanonical.empfh_hasher(brange); } for (int i = 0; i < 0; i++) { auto start_tt=chrono::system_clock::now(); for (nodes.first(); !nodes.isDone(); nodes.next()) modelCanonical.EMPHFhash(nodes.item().kmer.get<Type>()); auto end_tt=chrono::system_clock::now(); cout << "time to do " << nodes.size() << " computing EMPHFhash of kmers on all nodes (" << kmerSize << "-mers) : " << (diff_wtime(start_tt, end_tt) / unit) << " seconds" << endl; } // it's slow. i don't understand why. see above for the "confuse mphf" part //return; //FIXME } /** We get the value of the first node (just an example, it's not used later). */ Type kmer = node.kmer.get<Type>(); auto start_t=chrono::system_clock::now(); auto end_t=chrono::system_clock::now(); cout << "----\non all nodes of the graph\n-----\n"; /* disable node state (because we don't want to pay the price for overhea of checking whether a node is deleted or not in contain() */ std::cout<< "PAY ATTENTION: this neighbor() benchmark, in the Bloom flavor, is without performing a MPHF query for each found node" << std::endl; graph.disableNodeState(); graphFast.disableNodeState(); /* compute baseline times (= overheads we're not interested in) */ start_t=chrono::system_clock::now(); for (nodes.first(); !nodes.isDone(); nodes.next()) {} end_t=chrono::system_clock::now(); auto baseline_graph_time = diff_wtime(start_t, end_t) / unit; cout << "baseline overhead for graph nodes enumeration (" << nodes.size() << " nodes) : " << baseline_graph_time << " seconds" << endl; start_t=chrono::system_clock::now(); for (nodesFast.first(); !nodesFast.isDone(); nodesFast.next()) {} end_t=chrono::system_clock::now(); auto baseline_graphfast_time = diff_wtime(start_t, end_t) / unit; cout << "baseline overhead for graph NodeFast enumeration (" << nodes.size() << " nodes) : " << baseline_graphfast_time << " seconds" << endl; start_t=chrono::system_clock::now(); for (nodes.first(); !nodes.isDone(); nodes.next()) modelMini.getMinimizerValueDummy(nodes.item().kmer.get<Type>()); end_t=chrono::system_clock::now(); auto baseline_minim_time = diff_wtime(start_t, end_t) / unit; cout << "baseline overhead for graph nodes enumeration and minimizer computation setup (" << nodes.size() << " nodes) : " << baseline_minim_time << " seconds" << endl; start_t=chrono::system_clock::now(); for (nodes.first(); !nodes.isDone(); nodes.next()) nodes.item().getKmer<Type>(); end_t=chrono::system_clock::now(); auto baseline_hash_time = diff_wtime(start_t, end_t) / unit; cout << "baseline overhead for graph nodes enumeration and hash computation setup (" << nodes.size() << " nodes) : " << baseline_hash_time << " seconds" << endl; start_t=chrono::system_clock::now(); for (nodesFast.first(); !nodesFast.isDone(); nodesFast.next()) nodesFast.item().kmer; end_t=chrono::system_clock::now(); auto baseline_hashfast_time = diff_wtime(start_t, end_t) / unit; cout << "baseline overhead for graph NodeFast enumeration and hash computation setup (" << nodes.size() << " nodes) : " << baseline_hashfast_time << " seconds" << endl; start_t=chrono::system_clock::now(); for (nodes.first(); !nodes.isDone(); nodes.next()) graph.nodeMPHFIndexDummy(nodes.item()); end_t=chrono::system_clock::now(); auto baseline_mphf_time = diff_wtime(start_t, end_t) / unit; cout << "baseline overhead for graph nodes enumeration and mphf query setup (" << nodes.size() << " nodes) : " << baseline_mphf_time << " seconds" << endl; start_t=chrono::system_clock::now(); for (nodesFast.first(); !nodesFast.isDone(); nodesFast.next()) graphFast.nodeMPHFIndexDummy(nodesFast.item()); end_t=chrono::system_clock::now(); auto baseline_mphffast_time = diff_wtime(start_t, end_t) / unit; cout << "baseline overhead for graph NodeFast enumeration and mphf query setup (" << nodes.size() << " nodes) : " << baseline_mphffast_time << " seconds" << endl; /* do actual benchmark */ start_t=chrono::system_clock::now(); for (nodes.first(); !nodes.isDone(); nodes.next()) modelMini.getMinimizerValue(nodes.item().kmer.get<Type>(), true); end_t=chrono::system_clock::now(); cout << "time to do " << nodes.size() << " computations of minimizers (fast method) of length " << miniSize << " on all nodes (" << kmerSize << "-mers) : " << (diff_wtime(start_t, end_t) / unit) - baseline_minim_time << " seconds" << endl; start_t=chrono::system_clock::now(); for (nodes.first(); !nodes.isDone(); nodes.next()) graph.nodeMPHFIndex(nodes.item()); end_t=chrono::system_clock::now(); cout << "time to do " << nodes.size() << " computations of MPHF index on all nodes (" << kmerSize << "-mers) : " << (diff_wtime(start_t, end_t) / unit) - baseline_mphf_time << " seconds" << endl; start_t=chrono::system_clock::now(); for (nodesFast.first(); !nodesFast.isDone(); nodesFast.next()) graphFast.nodeMPHFIndex(nodesFast.item()); end_t=chrono::system_clock::now(); cout << "time to do " << nodes.size() << " computations of MPHF index on all NodeFast (" << kmerSize << "-mers) : " << (diff_wtime(start_t, end_t) / unit) - baseline_mphffast_time << " seconds" << endl; start_t=chrono::system_clock::now(); for (nodes.first(); !nodes.isDone(); nodes.next()) modelCanonical.getHash(nodes.item().kmer.get<Type>()); end_t=chrono::system_clock::now(); cout << "time to do " << nodes.size() << " computing hash1 of kmers on all nodes (" << kmerSize << "-mers) : " << (diff_wtime(start_t, end_t) / unit) - baseline_hash_time << " seconds" << endl; start_t=chrono::system_clock::now(); for (nodes.first(); !nodes.isDone(); nodes.next()) modelCanonical.getHash2(nodes.item().kmer.get<Type>()); end_t=chrono::system_clock::now(); cout << "time to do " << nodes.size() << " computing hash2 of kmers on all nodes (" << kmerSize << "-mers) : " << (diff_wtime(start_t, end_t) / unit) - baseline_hash_time << " seconds" << endl; start_t=chrono::system_clock::now(); for (nodesFast.first(); !nodesFast.isDone(); nodesFast.next()) modelCanonical.getHash2(nodesFast.item().kmer); end_t=chrono::system_clock::now(); cout << "time to do " << nodes.size() << " computing hash2 of kmers on all NodeFast (" << kmerSize << "-mers) : " << (diff_wtime(start_t, end_t) / unit) - baseline_hashfast_time << " seconds" << endl; start_t=chrono::system_clock::now(); for (nodes.first(); !nodes.isDone(); nodes.next()) modelCanonical.EMPHFhash(nodes.item().kmer.get<Type>()); end_t=chrono::system_clock::now(); cout << "time to do " << nodes.size() << " computing EMPHFhash of kmers on all nodes (" << kmerSize << "-mers) : " << (diff_wtime(start_t, end_t) / unit) - baseline_hash_time << " seconds" << endl; // it's slow. i don't understand why. see above for the "confuse mphf" part start_t=chrono::system_clock::now(); for (nodes.first(); !nodes.isDone(); nodes.next()) graph.neighborsDummy(nodes.item()); end_t=chrono::system_clock::now(); cout << "time to do " << nodes.size() << " dummy neighbors() query on all nodes (" << kmerSize << "-mers) : " << (diff_wtime(start_t, end_t) / unit) - baseline_graph_time << " seconds" << endl; start_t=chrono::system_clock::now(); for (nodes.first(); !nodes.isDone(); nodes.next()) graph.neighbors(nodes.item()); end_t=chrono::system_clock::now(); cout << "time to do " << nodes.size() << " neighbors() query on all nodes (" << kmerSize << "-mers) : " << (diff_wtime(start_t, end_t) / unit) - baseline_graph_time << " seconds" << endl; start_t=chrono::system_clock::now(); for (nodesFast.first(); !nodesFast.isDone(); nodesFast.next()) graphFast.neighbors(nodesFast.item()); end_t=chrono::system_clock::now(); cout << "time to do " << nodes.size() << " neighbors() query on all NodeFast (" << kmerSize << "-mers) : " << (diff_wtime(start_t, end_t) / unit) - baseline_graphfast_time << " seconds" << endl; /* isBranching */ start_t=chrono::system_clock::now(); for (nodes.first(); !nodes.isDone(); nodes.next()) graph.isBranching(nodes.item()); end_t=chrono::system_clock::now(); cout << "time to do " << nodes.size() << " isBranching() query on all nodes (" << kmerSize << "-mers) : " << (diff_wtime(start_t, end_t) / unit) - baseline_graph_time << " seconds" << endl; start_t=chrono::system_clock::now(); for (nodesFast.first(); !nodesFast.isDone(); nodesFast.next()) graphFast.isBranching(nodesFast.item()); end_t=chrono::system_clock::now(); cout << "time to do " << nodes.size() << " isBranching() query on all NodeFast (" << kmerSize << "-mers) : " << (diff_wtime(start_t, end_t) / unit) - baseline_graphfast_time << " seconds" << endl; /* now, compute adjacency! */ graph.precomputeAdjacency(); graphFast.precomputeAdjacency(); cout << "adjacency precomputed" << endl; start_t=chrono::system_clock::now(); for (nodes.first(); !nodes.isDone(); nodes.next()) graph.neighbors(nodes.item()); end_t=chrono::system_clock::now(); cout << "time to do " << nodes.size() << " neighbors() query on all nodes (" << kmerSize << "-mers) using adjacency : " << (diff_wtime(start_t, end_t) / unit) - baseline_graph_time << " seconds" << endl; start_t=chrono::system_clock::now(); for (nodesFast.first(); !nodesFast.isDone(); nodesFast.next()) graphFast.neighbors(nodesFast.item()); end_t=chrono::system_clock::now(); cout << "time to do " << nodes.size() << " fast neighbors() query on all NodeFast (" << kmerSize << "-mers) using adjacency : " << (diff_wtime(start_t, end_t) / unit) - baseline_graphfast_time << " seconds" << endl; /* isBranching */ start_t=chrono::system_clock::now(); for (nodes.first(); !nodes.isDone(); nodes.next()) graph.isBranching(nodes.item()); end_t=chrono::system_clock::now(); cout << "time to do " << nodes.size() << " isBranching() query on all nodes (" << kmerSize << "-mers) using adjacency : " << (diff_wtime(start_t, end_t) / unit) - baseline_graph_time << " seconds" << endl; start_t=chrono::system_clock::now(); for (nodesFast.first(); !nodesFast.isDone(); nodesFast.next()) graphFast.isBranching(nodesFast.item()); end_t=chrono::system_clock::now(); cout << "time to do " << nodes.size() << " fast isBranching() query on all NodeFast (" << kmerSize << "-mers) using adjacency : " << (diff_wtime(start_t, end_t) / unit) - baseline_graphfast_time << " seconds" << endl; /** We remove the graph. */ //graph.remove (); //graphFast.remove (); // no actually, I want to keep the .h5 file }