Exemplo n.º 1
0
/**
 * Fills a sequence with the textual form of a starting node followed by a right consensus.
 *
 * The sequence comment is set to "<nbNodes>__len__<fullLength>__depth__<depth>", where
 * fullLength is the kmer size of the graph plus the consensus length. The sequence data
 * holds the node string (minus its first character when depth == 0 and
 * dont_output_first_nucleotide is set) followed by ascii() of each consensus item.
 *
 * \param[in]  node           : starting node, dumped through graph.toString()
 * \param[in]  consensusRight : right extension appended after the node
 * \param[in]  nbNodes        : value reported first in the sequence comment
 * \param[in]  depth          : extension depth; reported in the comment and used for the offset
 * \param[out] seq            : sequence whose comment and data are overwritten
 */
void IterativeExtensions<span, Node, Edge, GraphDataVariant>::buildSequence (
    const Node&     node,
    const Path_t<Node>&     consensusRight,
    size_t          nbNodes,
    size_t          depth,
    Sequence&       seq
)
{
    /** Shortcuts. */
    Data&        data     = seq.getData();
    const size_t lenRight = consensusRight.size();

    /** We compute the total size of the sequence (starting kmer + right consensus). */
    const size_t fullLength = graph.getKmerSize() + lenRight;

    /* we need this in mapsembler: the first used kmer should be a k-1 mer. Indeed, if a first kmer is extracted from a sequence :
     * -------------****** (k=6), then this node is the one linked to a new one starting with ******,
     * thus with an overlap of k and not k-1.
     */
    const size_t offset = (depth == 0 && dont_output_first_nucleotide) ? 1 : 0;

    /** We compute the number of characters actually written (the skipped nucleotide is excluded). */
    const size_t length = fullLength - offset;

    /** We set the comment of the sequence.
     * The arguments are cast to the exact types named by the conversion specifiers:
     * passing a size_t where a variadic function expects an int is undefined behaviour. */
    seq.setComment (Stringify::format (
        "%lli__len__%i__depth__%i",
        static_cast<long long> (nbNodes),
        static_cast<int>       (fullLength),
        static_cast<int>       (depth)
    ));

    /** We set the data length. */
    data.resize (length);

    /** Write cursor in the output buffer. */
    size_t idx = 0;

    /** We dump the starting node (from 'offset' onwards). */
    const string nodeStr = graph.toString (node);
    for (size_t i=offset; i<nodeStr.size(); i++)  { data[idx++] = nodeStr[i]; }

    /** We dump the right part. */
    for (size_t i=0; i<lenRight; i++)  {  data[idx++] = ascii (consensusRight[i]); }
}
Exemplo n.º 2
0
/**
 * Example program: builds a graph from an input bank, then for each solid kmer prints its
 * value, minimizer and abundance, iterates the (k-1)-mers it contains, stores the minimizer
 * of each of them in a binary bank, and finally re-reads that bank and prints every entry.
 *
 * \return 0 on success, the value of OptionFailure::displayErrors() on a command line
 *         error, and 1 when another Exception is caught.
 */
int main (int argc, char* argv[])
{
    const size_t SPAN = KMER_SPAN(1);

    /** Shortcuts. */
    typedef Kmer<SPAN>::Type  Type;
    typedef Kmer<SPAN>::Count Count;
    typedef Kmer<SPAN>::ModelCanonical ModelCanon;
    typedef Kmer<SPAN>::ModelMinimizer <ModelCanon> Model;

    const size_t kmerSize = 33;
    const size_t mmerSize = 11;

    /** We create a command line parser. */
    OptionsParser parser ("GraphStats");
    parser.push_back (new OptionOneParam (STR_URI_INPUT, "bank input",  true));

    try
    {
        /** We parse the user options. */
        IProperties* options = parser.parse (argc, argv);

        string filename = options->getStr (STR_URI_INPUT);

        /** We create the solid kmers.
         * The kmer size is cast to int because it is consumed by a "%d" conversion. */
        Graph graph = Graph::create (
            "-in %s -kmer-size %d  -bloom none -out toto.h5  -abundance-min 1",
            filename.c_str(), static_cast<int> (kmerSize)
        );

        /** We get the information of the solid kmers from the HDF5 file. */
        Storage* storage = StorageFactory(STORAGE_HDF5).load ("toto.h5");   LOCAL (storage);
        Group& dskGroup = storage->getGroup("dsk");

        /** We get the solid kmers partition. */
        Partition<Count>& partition = dskGroup.getPartition<Count> ("solid");

        /** We create two kmers models (one for k, one for k-1). */
        Model  model   (kmerSize,   mmerSize);
        Model  modelK1 (kmerSize-1, mmerSize);

        // We declare an output Bank
        BankBinary outputBank (System::file().getBaseName(filename) + ".bin");

        /** We create a sequence with ASCII encoding (it will reference the minimizer strings). */
        Sequence seq (Data::ASCII);

        /** We get an iterator over the [kmer,abundance] of solid kmers. */
        Iterator<Count>* it = partition.iterator();   LOCAL (it);

        /** We iterate the solid kmers. */
        for (it->first(); !it->isDone(); it->next())
        {
            Type current = it->item().value;

            cout << "kmer=" << it->item().value << "  minimizer=" << model.getMinimizerValue(current)
                << "  abundance=" << it->item().abundance << endl;

            /** We interpret the kmer value as a Data object. */
            Data data (Data::BINARY);
            data.setRef (reinterpret_cast<char*> (&current), model.getKmerSize());

            modelK1.iterate (data, [&] (const Model::Kmer& k, size_t idx)
            {
                /** Shortcuts: the minimizer value and its string form (computed once). */
                Type   minimizerCurrent = k.minimizer().value();
                string minimizerStr     = modelK1.getMmersModel().toString (minimizerCurrent);

                cout << "-> " << k.value()
                     << " minimizer=" << minimizerCurrent << " "
                     << minimizerStr
                     << endl;

                /** We make the sequence reference the minimizer string.
                 * The string must stay alive until the insertion below is done. */
                seq.getData().setRef (const_cast<char*> (minimizerStr.c_str()), modelK1.getMmersModel().getKmerSize());

                /** We insert the sequence into the binary bank. */
                outputBank.insert (seq);
            });
        }

        /** We flush the output bank. */
        outputBank.flush();

        /** We iterate the output bank. */
        outputBank.iterate ([&] (const Sequence& s)
        {
            /** We get the kmer corresponding to the current sequence. */
            ModelCanon::Kmer mini = modelK1.getMmersModel().codeSeed (s.getDataBuffer(), Data::BINARY);

            cout << "mini=" << mini.value() << "  " << modelK1.getMmersModel().toString (mini.value()) << endl;
        });
    }
    catch (OptionFailure& e)
    {
        return e.displayErrors (std::cout);
    }
    catch (Exception& e)
    {
        std::cerr << "EXCEPTION: " << e.getMessage() << std::endl;

        /** We report the failure to the caller instead of exiting with a success status. */
        return 1;
    }

    return 0;
}
Exemplo n.º 3
0
	/**
	 * Processes one sequence and writes one line about it to outFile.
	 *
	 * For each kmer of the sequence, its count is looked up in quasiDico (0 when absent).
	 * Every position covered by at least one kmer with a positive count is flagged, and the
	 * best window (max_populated_window) yields the percentage of covered positions.
	 *
	 * When mean_median_min_max() succeeds and the percentage reaches threshold, the line is
	 * "<index> <mean> <median> <min> <max> <percentage>". When it succeeds but the percentage
	 * is below threshold, nothing is written. When it fails, the line is "<index> none".
	 * Writes to outFile are protected by synchro.
	 *
	 * Unless keep_low_complexity is set, sequences for which is_high_complexity() is false
	 * are skipped without any output.
	 */
	void operator() (Sequence& seq){
		// A window size of 0, or one larger than the read, means the full read length is used.
		int used_windows_size=windows_size;
		if (windows_size==0 || windows_size>seq.getDataSize()){
			used_windows_size=seq.getDataSize();
		}
		if(not keep_low_complexity and not is_high_complexity(seq,kmer_size)){return;}

		const auto seqLen = seq.getDataSize();

		bool exists = false;
		unsigned char count = 0;
		itKmer->setData (seq.getData());

		// For each position: number of occurrences of the kmer starting at this position.
		vector<int> values;
		// A position is true if it is covered by at least one shared kmer (all false initially).
		// NOTE: a faster approach based on a list of covered positions was used previously,
		// but it was not implemented for the window based computation.
		vector<bool> position_shared (seqLen, false);

		int position=0;
		for (itKmer->first(); !itKmer->isDone(); itKmer->next()){
			quasiDico->get_value((*itKmer)->value().getVal(),exists,count);
			if(!exists) {
				count=0;
			}
			values.push_back(count);
			if (count>0) { // TODO: OPTIMIZABLE.
				// Strict bound: position_shared has exactly seqLen entries, so index seqLen is out of range.
				for (int pos=position;pos<position+kmer_size && pos<seqLen;pos++) position_shared[pos]=true;
			}
			position++;
		}

		const int mpw = max_populated_window(position_shared,used_windows_size);
		const float percentage_span_kmer = 100*mpw/float(used_windows_size);

		float mean;
		int median, min, max;
		if(mean_median_min_max(values, mean, median, min, max)){
			if (percentage_span_kmer>=threshold) {
				string toPrint (to_string(seq.getIndex())+" "+to_string(mean)+" "+to_string(median)+" "+to_string(min)+" "+to_string(max)+" "+to_string(percentage_span_kmer));
				toPrint.append("\n");

				// The lock is taken and released exactly once around the write.
				synchro->lock();
				fwrite(toPrint.c_str(), sizeof(char), toPrint.size(), outFile);
				synchro->unlock ();
			}
		}
		else{
			string toPrint (to_string(seq.getIndex())+" none\n");
			synchro->lock();
			fwrite(toPrint.c_str(), sizeof(char), toPrint.size(), outFile);
			synchro->unlock ();
		}
	}
Exemplo n.º 4
0
/**
 * Inserts the given nucleotide string into the output bank 'out' as an ASCII sequence.
 *
 * \param[in] seq : characters of the sequence to be inserted
 */
void GlueCommander::output(string seq)
{
	// The sequence data is pointed at the characters of 'seq' through setRef;
	// 'seq' stays alive until the insertion below has returned.
	Sequence record (Data::ASCII);
	Data& payload = record.getData();
	payload.setRef (const_cast<char*>(seq.c_str()), seq.size());

	out->insert(record);
}