/** Wrap an iterator so that progress information can be reported while it is traversed.
 *
 * When both a positive iteration count and a message are supplied, the incoming iterator is
 * encapsulated in a dp::impl::SubjectIterator observed by the listener obtained from
 * createIteratorListener(). Otherwise the incoming iterator is handed back untouched.
 *
 * \param[in] iter : object to be encapsulated by a potential progress information
 * \param[in] nbIterations : number of iterations to be done.
 * \param[in] message : message used if progress information has to be displayed
 * \return the iterator to be used: either the new decorating iterator or \c iter itself.
 */
template<typename Item>
dp::Iterator<Item>* createIterator (dp::Iterator<Item>* iter, size_t nbIterations=0, const char* message=0)
{
    /** Without an iteration count or without a message, there is nothing to decorate. */
    if (nbIterations == 0 || message == 0)  { return iter; }

    /** The decorator is built with nbIterations/100 as second argument
     *  (presumably the notification period -- to be confirmed against SubjectIterator). */
    dp::impl::SubjectIterator<Item>* decorated = new dp::impl::SubjectIterator<Item> (iter, nbIterations/100);

    /** The listener in charge of the progress information observes the decorator. */
    decorated->addObserver (createIteratorListener (nbIterations, message));

    /** The decorator replaces the incoming iterator as the result. */
    return decorated;
}
/** Build the abundance map from the solid kmers, save it, populate it and reset the node states.
 *  Nothing is done unless the _buildOrLoad flag is set. */
void MPHFAlgorithm<span,Abundance_t,NodeState_t>::execute ()
{
    /** Only the flagged mode triggers any work here. */
    const bool mustBuild = (_buildOrLoad == true);
    if (!mustBuild)  { return; }

    /** Progress object: a listener wrapped into a ProgressCustom. */
    tools::dp::IteratorListener* listener = createIteratorListener (0, "");
    LOCAL (listener);
    setProgress (new ProgressCustom (listener));

    /** When the listener is exactly the base IteratorListener type (per the original note:
     *  MPHF_BOOPHF with verbose 0), a null progress is given to the builder so that it
     *  understands the internal progress bar of boophf has to be removed. */
    if (typeid(*listener) == typeid(tools::dp::IteratorListener))  { setProgress (0); }

    /** The number of threads comes from the dispatcher. */
    unsigned int threadCount = this->getDispatcher()->getExecutionUnitsNumber();

    /** Hash construction (timed under the "build" label). */
    {
        TIME_INFO (getTimeInfo(), "build");
        _abundanceMap->build (*_solidKmers, threadCount, _progress);
    }

    /** Hash serialization into the dedicated storage group (timed under the "save" label). */
    {
        TIME_INFO (getTimeInfo(), "save");
        _dataSize = _abundanceMap->save (_group, _name);
    }

    /** The hash table gets its abundance values. */
    populate ();

    /** The node state map starts from a clean state. */
    initNodeStates ();
}
/** Iterate all the nodes of the graph through the dispatcher, merge the per-thread sorted
 *  branching nodes into the branching collection, and record statistics about the run. */
void BranchingAlgorithm<span, Node, Edge, Graph>::execute ()
{
    /** Iterator over all the nodes of the graph. */
    GraphIterator<Node> allNodes = _graph->Graph::iterator();

    /** Custom listener whose finish() method, normally called at the end of the iteration,
     *  does nothing this time; our own 'finishPostponed' is called later on, once all the
     *  information is available. */
    CustomListener<Count>* listener = new CustomListener<Count> (
        createIteratorListener (allNodes.size(), progressFormat1),
        _branchingCollection
    );

    /** The nodes iterator is encapsulated by a potentially decorated iterator (progress information). */
    tools::dp::Iterator<Node>* iter = createIterator<Node> (
        allNodes.get(),
        allNodes.size(),
        progressFormat1,
        listener
    );
    LOCAL (iter);

    /** Synchronized object on the data handled by the functors. */
    ThreadObject <FunctorData<Count,Type> > threadData;

    FunctorNodes<Count,Type, Node, Edge, Graph> nodeFunctor (this->_graph, threadData);

    /** Nodes iteration through the dispatcher. */
    tools::dp::IDispatcher::Status status = getDispatcher()->iterate (iter, nodeFunctor);

    /** The iteration used N threads, hence N vectors of branching nodes that must be merged
     *  (N=nbcores used by the dispatcher).
     *  TODO doc: why do they need to be sorted actually?!
     *  Two steps follow:
     *      1) each vector is sorted
     *      2) the N vectors are sort/merged into the final collection
     */

    /** Step 1 : one sort command per branching nodes vector, run by the dispatcher. */
    vector<tools::dp::ICommand*> sortCommands;
    for (size_t t=0; t<threadData.size(); ++t)
    {
        sortCommands.push_back (new SortCmd<Count> (threadData[t].branchingNodes));
    }
    getDispatcher()->dispatchCommands (sortCommands);

    /** Step 2 : sort/merge of the N vectors, driven by a priority queue of [current,end) pairs. */
    typedef typename vector<Count>::iterator    CountIterator;
    typedef pair<CountIterator,CountIterator>   CountRange;

    /** A cache is used in front of the collection to improve IO performances. */
    CollectionCache<Count> branchingCache (*_branchingCollection, 16*1024, 0);

    Type checksum;
    checksum.setVal (0);

    /** The queue is seeded with every non empty vector. */
    priority_queue <CountRange, vector<CountRange>, Compare<CountRange> > mergeQueue;
    for (size_t t=0; t<threadData.size(); ++t)
    {
        if (!threadData[t].branchingNodes.empty())
        {
            mergeQueue.push (make_pair (threadData[t].branchingNodes.begin(), threadData[t].branchingNodes.end()));
        }
    }

    /** Merge/sort loop. */
    while (!mergeQueue.empty())
    {
        /** The top pair is taken out of the queue. */
        CountRange head = mergeQueue.top();
        mergeQueue.pop();

        /** A finished range has nothing left to provide. */
        if (head.first == head.second)  { continue; }

        /** The current Count object goes into the final bag. */
        branchingCache.insert (*head.first);

        /** Stats */
        checksum += head.first->value;

        /** The range goes back into the queue as long as it still holds items. */
        ++(head.first);
        if (head.first != head.second)  { mergeQueue.push (head); }
    }

    /** The cache has to be flushed to be sure that every item reaches the cached collection. */
    branchingCache.flush ();

    /** Our 'custom' finish method can be called now. */
    listener->finishPostponed();

    /** The kind is saved in the storage. */
    _storage(getName()).addProperty ("kind", toString(_kind));

    /* Disabled print of the number of branching nodes (could be important for debugging,
     * if a user experiences a crash and copypastes stdout). */
    //cout << "Graph has " << _branchingCollection->getNbItems() << " branching nodes." << endl;

    /** Statistics about the branching nodes. */
    getInfo()->add (1, "stats");
    getInfo()->add (2, "nb_branching", "%ld", _branchingCollection->getNbItems());
    getInfo()->add (2, "percentage", "%.1f", (allNodes.size() > 0 ? 100.0*(float)_branchingCollection->getNbItems()/(float)allNodes.size() : 0));

    stringstream checksumStream;
    checksumStream << checksum;
    getInfo()->add (2, "checksum_branching", "%s", checksumStream.str().c_str());

    if (getInput()->get(STR_TOPOLOGY_STATS) && getInput()->getInt(STR_TOPOLOGY_STATS) > 0)
    {
        /** The topological information of each thread is accumulated into the object
         *  reached through operator-> of the synchronized data. */
        for (size_t t=0; t<threadData.size(); ++t)
        {
            for (map<InOut_t, size_t>::iterator entry = threadData[t].topology.begin(); entry != threadData[t].topology.end(); ++entry)
            {
                threadData->topology[entry->first] += entry->second;
            }
        }

        /** The accumulated statistics are copied into a vector and sorted with CompareFct. */
        vector < pair<InOut_t,size_t> > topologyStats;
        for (map<InOut_t, size_t>::iterator entry = threadData->topology.begin(); entry != threadData->topology.end(); ++entry)
        {
            topologyStats.push_back (*entry);
        }
        sort (topologyStats.begin(), topologyStats.end(), CompareFct);

        getInfo()->add (1, "topology");
        for (size_t k=0; k<topologyStats.size(); ++k)
        {
            getInfo()->add (3, "neighborhood", "[in=%ld out=%ld] nb : %8ld percent. : %5.2f",
                topologyStats[k].first.first,
                topologyStats[k].first.second,
                topologyStats[k].second,
                _branchingCollection->getNbItems() > 0 ? 100.0*(float)topologyStats[k].second / (float)_branchingCollection->getNbItems() : 0
            );
        }
    }

    /** Time information of the nodes iteration. */
    getInfo()->add (1, "time");
    getInfo()->add (2, "build", "%.3f", status.time / 1000.0);
}
/** Set, for each solid kmer, its discretized abundance into the abundance map.
 *
 * Each item of _solidCounts is hashed with the abundance map; its abundance is clamped to the
 * last but one value of the discretization table and replaced by the index of the
 * discretization cell it falls in. The hash function is checked on the fly (codes must be
 * lower than the map size) and once more through check() at the end. Statistics are then
 * added to the information of the algorithm.
 *
 * \throws Exception if a hash code is out of bounds, or if the number of iterated items
 *         differs from the size of the map (only checked when that size is greater than 3).
 */
void MPHFAlgorithm<span,Abundance_t,NodeState_t>::populate ()
{
    size_t nb_iterated = 0;
    size_t n = _abundanceMap->size();

    _nb_abundances_above_precision = 0;

    /** We need a progress object. */
    tools::dp::IteratorListener* delegate = createIteratorListener (_solidCounts->getNbItems(), messages[3]);
    LOCAL (delegate);
    setProgress (new ProgressCustom (delegate));

    /** The iterator over the solid counts is observed by the progress object. */
    SubjectIterator<Count>* itKmers = new SubjectIterator<Count> (_solidCounts->iterator(), _solidCounts->getNbItems()/100);
    itKmers->addObserver (_progress);
    LOCAL (itKmers);

    // TODO parallelize that

    std::vector<int>& discretization = _abundanceMap->_abundanceDiscretization;

    /** NOTE(review): assumes the discretization table holds at least two values -- confirm. */
    int max_abundance_discrete = discretization[discretization.size()-2];

    // set counts and at the same time, test the mphf
    for (itKmers->first(); !itKmers->isDone(); itKmers->next())
    {
        /** We get the hash code of the current item. */
        typename AbundanceMap::Hash::Code h = _abundanceMap->getCode (itKmers->item().value);

        /** Little check. */
        if (h >= n)  {  throw Exception ("MPHF check: value out of bounds"); }

        /** We get the abundance of the current kmer, clamped to the maximum discrete value. */
        int abundance = itKmers->item().abundance;
        if (abundance > max_abundance_discrete)
        {
            _nb_abundances_above_precision++;
            abundance = max_abundance_discrete;
        }

        /** First cell strictly greater than the abundance... */
        std::vector<int>::iterator up = std::upper_bound (discretization.begin(), discretization.end(), abundance);

        /** ... then the previous cell. Decrementing begin() is undefined behavior, so an
         *  abundance lower than every cell stays on the first cell. */
        if (up != discretization.begin())  { --up; }

        int idx = up - discretization.begin();

        /** We set the abundance of the current kmer. */
        _abundanceMap->at (h) = idx;

        nb_iterated ++;
    }

    if (nb_iterated != n && n > 3)
    {
        /** Both counters are size_t: they are converted explicitly in order to match the
         *  format specifiers ("%d" with a size_t argument is a mismatch). */
        throw Exception ("ERROR during abundance population: itKmers iterated over %lu/%lu kmers only",
            static_cast<unsigned long>(nb_iterated),
            static_cast<unsigned long>(n)
        );
    }

    // you know what? let's always test if the MPHF does not have collisions, it won't hurt.
    check ();

    /** We gather some statistics.
     *  NOTE(review): the types of the values below are not visible from here; confirm that
     *  they match the "%ld" / "%d" specifiers. */
    getInfo()->add (1, "stats");
    getInfo()->add (2, "nb_keys",               "%ld",  _abundanceMap->size());
    getInfo()->add (2, "data_size",             "%ld",  _dataSize);
    getInfo()->add (2, "bits_per_key",          "%.3f", (float)(_dataSize*8)/(float)_abundanceMap->size());
    getInfo()->add (2, "prec",                  "%d",   MAX_ABUNDANCE);
    getInfo()->add (2, "nb_abund_above_prec",   "%d",   _nb_abundances_above_precision);

    getInfo()->add (1, getTimeInfo().getProperties("time"));
}