Пример #1
0
    /** Optionally decorate an iterator so that progress information can be reported during iteration.
     *
     * The decoration only happens when both a strictly positive iteration count and a non-null
     * message are supplied. In that case the incoming iterator is wrapped into a
     * dp::impl::SubjectIterator (built with nbIterations/100 as second argument) and the listener
     * returned by createIteratorListener() is attached to it as an observer. Otherwise the incoming
     * iterator is handed back untouched.
     *
     * \param[in] iter : iterator to be encapsulated by a potential progress information
     * \param[in] nbIterations : number of iterations to be done (0 disables the decoration)
     * \param[in] message : message used for the progress information (null disables the decoration)
     * \return the decorating iterator, or \c iter itself when no decoration is done.
     */
    template<typename Item> dp::Iterator<Item>* createIterator (dp::Iterator<Item>* iter, size_t nbIterations=0, const char* message=0)
    {
        /** Without both a count and a message there is nothing to decorate. */
        const bool wantsProgress = (message != 0) && (nbIterations > 0);
        if (!wantsProgress)  { return iter; }

        /** One hundredth of the total count is given to the subject iterator
         *  (presumably its notification period -- confirm against SubjectIterator). */
        const size_t period = nbIterations / 100;

        /** We wrap the incoming iterator and attach the progress listener to the wrapper. */
        dp::impl::SubjectIterator<Item>* decorated = new dp::impl::SubjectIterator<Item> (iter, period);
        decorated->addObserver (createIteratorListener (nbIterations, message));

        /** The wrapper replaces the original iterator for the caller. */
        return decorated;
    }
Пример #2
0
/** Build the abundance map, save it, then populate it and initialize the node states.
 *
 * Nothing is done unless the _buildOrLoad flag is set. The steps are, in order:
 *   1) set up a progress object (possibly reset to null, see below),
 *   2) build the hash with the number of execution units given by the dispatcher,
 *   3) save the hash in the storage group and remember the returned size in _dataSize,
 *   4) call populate() and initNodeStates().
 */
void MPHFAlgorithm<span,Abundance_t,NodeState_t>::execute ()
{
    /** Nothing to do when the flag is not set. */
    if (_buildOrLoad != true)  { return; }

    /** The progress object is backed by a listener created with no iteration count and an empty message. */
    tools::dp::IteratorListener* listener = createIteratorListener (0, "");
    LOCAL (listener);
    setProgress (new ProgressCustom (listener));

    /** If MPHF_BOOPHF and verbose 0 (i.e. the listener is exactly the base IteratorListener type), we give
     *  a null progress to the builder, so that it understands that the internal progress bar of boophf
     *  needs to be removed. */
    const bool isBaseListener = (typeid(*listener) == typeid(tools::dp::IteratorListener));
    if (isBaseListener)  { setProgress (0); }

    /** The dispatcher provides the number of threads. */
    unsigned int threadCount = this->getDispatcher()->getExecutionUnitsNumber();

    /** Hash construction, timed under the "build" label. */
    {
        TIME_INFO (getTimeInfo(), "build");
        _abundanceMap->build (*_solidKmers, threadCount, _progress);
    }

    /** Hash serialization in the dedicated storage group, timed under the "save" label. */
    {
        TIME_INFO (getTimeInfo(), "save");
        _dataSize = _abundanceMap->save (_group, _name);
    }

    /** The hash table gets its values. */
    populate ();

    /** The node state map starts from a clean state. */
    initNodeStates ();
}
Пример #3
0
/** Compute the branching nodes of the graph and store them, sorted, in _branchingCollection.
 *
 * Overview of the steps performed below:
 *   1) iterate all graph nodes through the dispatcher with a FunctorNodes functor, which fills
 *      per-index FunctorData objects (branchingNodes vectors and topology maps);
 *   2) sort each branchingNodes vector with one SortCmd per vector, run by the dispatcher;
 *   3) merge the sorted vectors with a priority queue into _branchingCollection (through a cache),
 *      accumulating a checksum of the inserted values;
 *   4) call the postponed 'finish' of the custom listener, save the "kind" property in the storage
 *      and register statistics (and optionally topology statistics) through getInfo().
 */
void BranchingAlgorithm<span, Node, Edge, Graph>::execute ()
{
    /** We get an iterator over all graph nodes. */
    GraphIterator<Node> itNodes = _graph->Graph::iterator();

    /** We create a custom listener that makes the finish() method, normally called at end of iteration, do nothing this time.
     * => we define our own 'finishPostponed' method that is called when all the information is ok. */
    CustomListener<Count>* listener = new CustomListener<Count> (
        createIteratorListener (itNodes.size(), progressFormat1),
        _branchingCollection
    );

    /** We encapsulate this iterator with a potentially decorated iterator (for progress information).
     *  The custom listener created above is handed over to createIterator. */
    tools::dp::Iterator<Node>* iter = createIterator<Node> (
        itNodes.get(),
        itNodes.size(),
        progressFormat1,
        listener
    );
    LOCAL (iter);

    /** We get a synchronized object on the data handled by functors. */
    ThreadObject <FunctorData<Count,Type> > functorData;

    /** The functor receives the graph and the shared functor data. */
    FunctorNodes<Count,Type, Node, Edge, Graph> functorNodes (this->_graph, functorData);

    /** We iterate the nodes. The returned status is used at the end for the "build" time. */
    tools::dp::IDispatcher::Status status = getDispatcher()->iterate (iter, functorNodes);

    /** Now, because we iterated with N threads, we have N vector of branching nodes. (N=nbcores used by the dispatcher)
     *  We need to merge them.
     *  TODO doc: why do they need to be sorted actually?!
     *  The next step are:
     *      1) sort each vector
     *      2) sort/merge the N vectors in the final collection
     */

    /** Step 1 : sorting the N branching nodes vectors (with the dispatcher).
     *  NOTE(review): the SortCmd objects are allocated here and never deleted in this function;
     *  presumably dispatchCommands takes care of them -- confirm. */
    vector<tools::dp::ICommand*> sortCmds;
    for (size_t i=0; i<functorData.size(); i++)  {  sortCmds.push_back (new SortCmd<Count> (functorData[i].branchingNodes));  }
    getDispatcher()->dispatchCommands (sortCmds);

    /** Step 2 : sort/merge the N vectors.
     * We use a priority queue for the merge process.
     * Each queue entry is a (current, end) pair of iterators on one branching nodes vector. */
    typedef typename vector<Count>::iterator BranchingIterator;
    typedef pair<BranchingIterator,BranchingIterator> BranchingIteratorPair;

    /** We use a cache to improve IO performances. */
    CollectionCache<Count> branchingCache (*_branchingCollection, 16*1024, 0);

    /** Checksum accumulated over the values of all inserted branching nodes. */
    Type checksum; checksum.setVal( 0);

    /** We initialize our priority queue. Only non empty vectors are inserted, so that the
     *  'current' iterator of each entry can be dereferenced. */
    priority_queue <BranchingIteratorPair, vector<BranchingIteratorPair>, Compare<BranchingIteratorPair> > pq;
    for (size_t i=0; i<functorData.size(); i++)
    {
        if (functorData[i].branchingNodes.empty() == false)
        {
            pq.push (make_pair (functorData[i].branchingNodes.begin(), functorData[i].branchingNodes.end()));
        }
    }

    /** We process the merge/sort.
     *  NOTE(review): the order of the output relies on Compare<> (not visible here); presumably it
     *  puts the entry with the smallest current item on top -- confirm. */
    while (!pq.empty())
    {
        /** We get the top iterators pair from the priority queue and remove from it. */
        BranchingIteratorPair it = pq.top();
        pq.pop();

        /** We check that the current iterator is not finished. */
        if (it.first != it.second)
        {
            /** We insert the Count object into the final bag. */
            branchingCache.insert (*it.first);

            /** Stats */
            checksum += it.first->value;

            /** We update the priority queue: the entry is pushed back only if its vector still has items. */
            ++(it.first); if (it.first != it.second)  {  pq.push (it); }
        }
    }

    /** We have to flush the cache to be sure every items is put into the cached collection. */
    branchingCache.flush ();

    /** We call our 'custom' finish method. */
    listener->finishPostponed();

    /** We save the kind in the storage. */
    _storage(getName()).addProperty ("kind", toString(_kind));

    /* print the number of branching nodes (could be important for debugging, if a user experiences a crash and copypastes stdout) */
    //cout << "Graph has " << _branchingCollection->getNbItems() << " branching nodes." << endl;

    /** We gather some statistics. The percentage is relative to the number of graph nodes
     *  and is reported as 0 when the graph has no node. */
    getInfo()->add (1, "stats");
    getInfo()->add (2, "nb_branching", "%ld", _branchingCollection->getNbItems());
    getInfo()->add (2, "percentage",   "%.1f", (itNodes.size() > 0 ? 100.0*(float)_branchingCollection->getNbItems()/(float)itNodes.size() : 0));

    /** The checksum is reported through its stream representation. */
    stringstream ss;  ss << checksum;
    getInfo()->add (2, "checksum_branching", "%s", ss.str().c_str());

    /** Topology statistics are optional: they are computed only if the option is set with a strictly positive value. */
    if (getInput()->get(STR_TOPOLOGY_STATS) && getInput()->getInt(STR_TOPOLOGY_STATS) > 0)
    {
        /** We get some topological information: every indexed topology map is accumulated into the
         *  object reached through functorData-> (presumably the shared instance of the ThreadObject -- confirm). */
        for (size_t i=0; i<functorData.size(); i++)
        {
            for (map<InOut_t, size_t>::iterator it = functorData[i].topology.begin();  it != functorData[i].topology.end(); ++it)
            {
                functorData->topology[it->first] += it->second;
            }
        }

        /** We sort the statistics (the order is defined by CompareFct). */
        vector < pair<InOut_t,size_t> >  topologyStats;
        for (map<InOut_t, size_t>::iterator it = functorData->topology.begin();  it != functorData->topology.end(); ++it)  { topologyStats.push_back (*it); }
        sort (topologyStats.begin(), topologyStats.end(), CompareFct);

        /** One entry per (in,out) neighborhood, with its count and its percentage relative to the
         *  number of branching nodes (0 when there is no branching node). */
        getInfo()->add (1, "topology");
        for (size_t i=0; i<topologyStats.size(); i++)
        {
            getInfo()->add (3, "neighborhood", "[in=%ld out=%ld]  nb : %8ld   percent. : %5.2f",
                topologyStats[i].first.first, topologyStats[i].first.second, topologyStats[i].second,
                _branchingCollection->getNbItems() > 0 ?
                100.0*(float)topologyStats[i].second / (float)_branchingCollection->getNbItems() : 0
            );
        }
    }

    /** Time of the nodes iteration, as given by the dispatcher status (divided by 1000 for display). */
    getInfo()->add (1, "time");
    getInfo()->add (2, "build", "%.3f", status.time / 1000.0);
}
Пример #4
0
/** Fill the abundance map with a discretized abundance for every solid kmer, then check the map.
 *
 * For each item of _solidCounts, the hash code of the kmer is computed through the abundance map and
 * the index of the discretization cell matching the kmer abundance is stored at that position.
 * Abundances greater than the greatest supported value are clamped (and counted in
 * _nb_abundances_above_precision).
 *
 * Throws an Exception if a hash code is out of bounds, or if the number of iterated kmers differs
 * from the size of the abundance map (only checked when that size is greater than 3).
 */
void MPHFAlgorithm<span,Abundance_t,NodeState_t>::populate ()
{
    size_t nb_iterated = 0;
    size_t n = _abundanceMap->size();

    _nb_abundances_above_precision = 0;

    /** We need a progress object. */
    tools::dp::IteratorListener* delegate = createIteratorListener(_solidCounts->getNbItems(),messages[3]);  LOCAL (delegate);
    setProgress (new ProgressCustom(delegate));

    /** We iterate the solid counts through a subject iterator observed by the progress object. */
    SubjectIterator<Count>* itKmers = new SubjectIterator<Count> (_solidCounts->iterator(), _solidCounts->getNbItems()/100);
    itKmers->addObserver (_progress);
    LOCAL (itKmers);

    // TODO parallelize that

    /** NOTE(review): the table is assumed to hold at least two entries, the greatest usable value
     *  being the second to last one -- confirm against the abundance map. */
    std::vector<int> & _abundanceDiscretization =  _abundanceMap->_abundanceDiscretization ;
    int max_abundance_discrete = _abundanceDiscretization[_abundanceDiscretization.size()-2];

    // set counts and at the same time, test the mphf
    for (itKmers->first(); !itKmers->isDone(); itKmers->next())
    {
        //cout << "kmer: " << itKmers->item().value.toString(21) << std::endl;

        /** We get the hash code of the current item. */
        typename AbundanceMap::Hash::Code h = _abundanceMap->getCode (itKmers->item().value);

        /** Little check. */
        if (h >= n) {  throw Exception ("MPHF check: value out of bounds"); }

        /** We get the abundance of the current kmer. */
        int abundance = itKmers->item().abundance;

        /** Too big abundances are clamped to the greatest supported value. */
        if (abundance > max_abundance_discrete)
        {
            _nb_abundances_above_precision++;
            abundance = max_abundance_discrete;
        }

        /** We get the first cell strictly greater than the abundance... */
        std::vector<int>::iterator  up = std::upper_bound(_abundanceDiscretization.begin(), _abundanceDiscretization.end(), abundance);

        /** ... and step back to the previous cell. Decrementing begin() is undefined, so an
         *  abundance lower than the first cell is mapped to the first cell. */
        if (up != _abundanceDiscretization.begin())  { --up; }
        int idx = up - _abundanceDiscretization.begin() ;

        /** We set the abundance of the current kmer. */
        _abundanceMap->at (h) = idx;

        nb_iterated ++;
    }

    if (nb_iterated != n && n > 3)
    {
        /** The counters are size_t values: they are converted explicitly so that the variadic
         *  arguments match the format specifiers. */
        throw Exception ("ERROR during abundance population: itKmers iterated over %lu/%lu kmers only",
            static_cast<unsigned long>(nb_iterated), static_cast<unsigned long>(n));
    }

#if 1
    // you know what? let's always test if the MPHF does not have collisions, it won't hurt.
    check ();
#endif

    /** We gather some statistics. */
    getInfo()->add (1, "stats");
    getInfo()->add (2, "nb_keys",               "%ld",  _abundanceMap->size());
    getInfo()->add (2, "data_size",             "%ld",  _dataSize);
    getInfo()->add (2, "bits_per_key",          "%.3f", (float)(_dataSize*8)/(float)_abundanceMap->size());
    getInfo()->add (2, "prec",                  "%d",   MAX_ABUNDANCE);
    getInfo()->add (2, "nb_abund_above_prec",   "%d",   _nb_abundances_above_precision);
    getInfo()->add (1, getTimeInfo().getProperties("time"));
}