void IterativeExtensions<span, Node, Edge, GraphDataVariant>::buildSequence (
    const Node&         node,
    const Path_t<Node>& consensusRight,
    size_t              nbNodes,
    size_t              depth,
    Sequence&           seq
)
{
    /** Fills 'seq' with the string of the starting node followed by the right consensus.
     *
     * \param[in]  node           : starting node, dumped first through graph.toString()
     * \param[in]  consensusRight : nucleotides appended after the node (converted with ascii())
     * \param[in]  nbNodes        : value written first in the sequence comment
     * \param[in]  depth          : value written last in the comment; a depth of 0 may drop the first nucleotide
     * \param[out] seq            : sequence whose comment and data buffer are set
     */

    /** Shortcuts. */
    Data&  data     = seq.getData();
    size_t lenRight = consensusRight.size();

    /** We compute the total size of the sequence (before any first-nucleotide removal). */
    size_t fullLength = graph.getKmerSize() + lenRight;

    /* we need this in mapsembler: the first used kmer should be a k-1 mer. Indeed, if a first kmer is
     * extracted from a sequence :
     *   -------------****** (k=6), then this node is the one linked to a new one starting with ******,
     * thus with an overlap of k and not k-1. */
    size_t offset = (depth == 0 && dont_output_first_nucleotide) ? 1 : 0;

    /** We compute the size of the data actually written. */
    size_t length = fullLength - offset;

    /** We set the comment of the sequence.
     *  The arguments are size_t values: they are explicitly converted to the types announced by the
     *  format specifiers, since a variadic call performs no conversion by itself. */
    seq.setComment (Stringify::format (
        "%lli__len__%i__depth__%i",
        static_cast<long long> (nbNodes),
        static_cast<int>       (fullLength),
        static_cast<int>       (depth)
    ));

    /** We set the data length ('data' refers to seq.getData()). */
    data.resize (length);

    size_t idx = 0;

    /** We dump the starting node, skipping the first nucleotide when 'offset' is 1. */
    string nodeStr = graph.toString (node);
    for (size_t i = offset; i < nodeStr.size(); i++)  {  data[idx++] = nodeStr[i];  }

    /** We dump the right part. */
    for (size_t i = 0; i < lenRight; i++)  {  data[idx++] = ascii (consensusRight[i]);  }
}
int main (int argc, char* argv[]) { const size_t SPAN = KMER_SPAN(1); /** Shortcuts. */ typedef Kmer<SPAN>::Type Type; typedef Kmer<SPAN>::Count Count; typedef Kmer<SPAN>::ModelCanonical ModelCanon; typedef Kmer<SPAN>::ModelMinimizer <ModelCanon> Model; size_t kmerSize = 33; size_t mmerSize = 11; /** We create a command line parser. */ OptionsParser parser ("GraphStats"); parser.push_back (new OptionOneParam (STR_URI_INPUT, "bank input", true)); try { /** We parse the user options. */ IProperties* options = parser.parse (argc, argv); string filename = options->getStr (STR_URI_INPUT); /** We create the solid kmers. */ Graph graph = Graph::create ("-in %s -kmer-size %d -bloom none -out toto.h5 -abundance-min 1", filename.c_str(), kmerSize); /** We get the information of the solid kmers from the HDF5 file. */ Storage* storage = StorageFactory(STORAGE_HDF5).load ("toto.h5"); LOCAL (storage); Group& dskGroup = storage->getGroup("dsk"); /** We get the solid kmers partition. */ Partition<Count>& partition = dskGroup.getPartition<Count> ("solid"); /** We create two kmers models. */ Model model (kmerSize, mmerSize); Model modelK1 (kmerSize-1, mmerSize); // We declare an output Bank BankBinary outputBank (System::file().getBaseName(filename) + ".bin"); /** We create a sequence with BINARY encoding. */ Sequence seq (Data::ASCII); /** We get an iterator over the [kmer,abundance] of solid kmers. */ Iterator<Count>* it = partition.iterator(); LOCAL (it); /** We iterate the solid kmers. */ for (it->first(); !it->isDone(); it->next()) { Type current = it->item().value; cout << "kmer=" << it->item().value << " minimizer=" << model.getMinimizerValue(current) << " abundance=" << it->item().abundance << endl; /** We interpret the kmer value as a Data object. */ Data data (Data::BINARY); data.setRef ((char*) ¤t, model.getKmerSize()); modelK1.iterate (data, [&] (const Model::Kmer& k, size_t idx) { /** Shortcut. 
*/ Type miniminizerCurrent = k.minimizer().value(); cout << "-> " << k.value() << " minimizer=" << miniminizerCurrent << " " << modelK1.getMmersModel().toString (miniminizerCurrent) << endl; string tmp = modelK1.getMmersModel().toString (miniminizerCurrent); /** We interpret the minimizer value as a Data object. */ seq.getData().setRef ((char*)tmp.c_str(), modelK1.getMmersModel().getKmerSize()); /** We insert the sequence into the binary bank. */ outputBank.insert (seq); }); } /** We flush the output bank. */ outputBank.flush(); /** We iterate the output bank. */ outputBank.iterate ([&] (const Sequence& s) { /** We get the kmer corresponding to the current sequence. */ ModelCanon::Kmer mini = modelK1.getMmersModel().codeSeed (s.getDataBuffer(), Data::BINARY); cout << "mini=" << mini.value() << " " << modelK1.getMmersModel().toString (mini.value()) << endl; }); } catch (OptionFailure& e) { return e.displayErrors (std::cout); } catch (Exception& e) { std::cerr << "EXCEPTION: " << e.getMessage() << std::endl; } }
/** Processes one read: queries the quasi dictionary for every kmer of the read, then writes one
 *  line of statistics about the read to outFile.
 *
 *  Written line when statistics are available and the span percentage reaches 'threshold':
 *      "<index> <mean> <median> <min> <max> <percentage_span_kmer>\n"
 *  Written line when mean_median_min_max() reports failure:
 *      "<index> none\n"
 *
 *  Writes to outFile are protected by 'synchro'.
 *
 * \param[in] seq : the read to process
 */
void operator() (Sequence& seq){
    // A window size of 0 (or one larger than the read) means: use the full read length as window.
    int used_windows_size=windows_size;
    if (windows_size==0 || windows_size>seq.getDataSize()){
        used_windows_size=seq.getDataSize();
    }

    // Reads rejected by is_high_complexity() are skipped unless keep_low_complexity is set.
    if(!keep_low_complexity && !is_high_complexity(seq,kmer_size)){return;}

    bool exists = false;
    unsigned char count = 0;
    itKmer->setData (seq.getData());

    // For each kmer position: the count returned by the dictionary for the kmer starting there (0 if absent).
    vector<int> values;

    // position_shared[p] is true when position p is covered by at least one kmer with count > 0.
    // The vector is value-initialized, i.e. all positions start as false.
    vector<bool> position_shared(seq.getDataSize());

    int position=0;
    for (itKmer->first(); !itKmer->isDone(); itKmer->next()){
        quasiDico->get_value((*itKmer)->value().getVal(),exists,count);
        if(!exists) {
            count=0;
        }
        values.push_back(count);

        if (count>0) { // TODO: OPTIMIZABLE.
            // Strict bound: position_shared has exactly seq.getDataSize() entries, so the
            // index seq.getDataSize() itself must never be written.
            for (int pos=position; pos<position+kmer_size && pos<seq.getDataSize(); pos++) {
                position_shared[pos]=true;
            }
        }
        position++;
    }

    // Percentage of the window given by max_populated_window() over the used window size.
    const int mpw = max_populated_window(position_shared,used_windows_size);
    const float percentage_span_kmer = 100*mpw/float(used_windows_size);

    float mean;
    int median, min, max;
    if(mean_median_min_max(values, mean, median, min, max)){
        // Only reads reaching the threshold are reported.
        if (percentage_span_kmer>=threshold) {
            string toPrint (to_string(seq.getIndex())+" "+to_string(mean)+" "+to_string(median)+" "+to_string(min)+" "+to_string(max)+" "+to_string(percentage_span_kmer));
            toPrint.append("\n");

            // Exactly one unlock per lock: the synchronizer is never released without being held.
            synchro->lock();
            fwrite(toPrint.c_str(), sizeof(char), toPrint.size(), outFile);
            synchro->unlock ();
        }
    }
    else{
        string toPrint (to_string(seq.getIndex())+" none\n");
        synchro->lock();
        fwrite(toPrint.c_str(), sizeof(char), toPrint.size(), outFile);
        synchro->unlock ();
    }
}
/** Inserts the given string into 'out' as an ASCII sequence.
 *
 * \param[in] seq : characters of the sequence to insert
 */
void GlueCommander::output(string seq)
{
    // setRef() expects a mutable pointer, hence the cast on the string's buffer.
    char* buffer = const_cast<char*> (seq.c_str());

    // The record is given seq's buffer and length; 'seq' outlives the insertion below.
    Sequence record (Data::ASCII);
    Data& payload = record.getData();
    payload.setRef (buffer, seq.size());

    out->insert (record);
}