int main (int argc, char* argv[]) { /** We create a command line parser. */ OptionsParser parser ("BankFilter"); parser.push_back (new OptionOneParam (STR_URI_INPUT, "bank input", true)); parser.push_back (new OptionOneParam (STR_FILTER_RATIO, "skip a sequence if 'good letters number / seq.len > X'", false, "0.8")); try { /** We parse the user options. */ IProperties* options = parser.parse (argc, argv); /** Shortcuts. */ double percentThreshold = options->getDouble(STR_FILTER_RATIO); /** We open the input bank. */ IBank* inBank = Bank::open (options->getStr(STR_URI_INPUT)); LOCAL (inBank); /** We create the output inBank. */ IBank* outBank = new BankFasta (options->getStr(STR_URI_INPUT) + "_filtered"); LOCAL (outBank); /** We iterate the inBank. NOTE: WE USE A LAMBDA EXPRESSION HERE. */ inBank->iterate ([&] (Sequence& s) { /** Shortcut. */ char* data = s.getDataBuffer(); size_t nbOK = 0; for (size_t i=0; i<s.getDataSize(); i++) { if (data[i]=='A' || data[i]=='C' || data[i]=='G' || data[i]=='T') { nbOK++; } } if ((double)nbOK / (double)s.getDataSize() > percentThreshold) { outBank->insert (s); } }); /** We flush the output bank. */ outBank->flush(); } catch (OptionFailure& e) { return e.displayErrors (cout); } catch (Exception& e) { cerr << "EXCEPTION: " << e.getMessage() << endl; } }
int main (int argc, char* argv[]) { /** We create a command line parser. */ OptionsParser parser ("bankgen"); const char* OUTPUT_PREFIX = "-out"; const char* SEQ_LEN = "-seq-len"; const char* READ_LEN = "-read-len"; const char* OVERLAP_LEN = "-overlap-len"; const char* COVERAGE = "-coverage"; parser.push_back (new OptionOneParam (OUTPUT_PREFIX, "output prefix", true)); parser.push_back (new OptionOneParam (SEQ_LEN, "sequence length", false, "1000000")); parser.push_back (new OptionOneParam (READ_LEN, "read length", false, "150" )); parser.push_back (new OptionOneParam (OVERLAP_LEN, "overlap between two reads", false, "50" )); parser.push_back (new OptionOneParam (COVERAGE, "coverage", false, "3" )); try { /** We parse the user options. */ IProperties* options = parser.parse (argc, argv); /** We create the random sequence. */ IBank* randomBank = new BankRandom (1, options->getInt(SEQ_LEN)); LOCAL (randomBank); /** We create the reads bank. */ IBank* readsBank = new BankSplitter ( randomBank, options->getInt(READ_LEN), options->getInt(OVERLAP_LEN), options->getInt(COVERAGE) ); LOCAL (readsBank); /** We save the random bank. */ SaveAsFasta (randomBank, options->getStr(OUTPUT_PREFIX) + "_sequence.fa"); /** We save the reads bank. */ SaveAsFasta (readsBank, options->getStr(OUTPUT_PREFIX) + "_reads.fa"); } catch (OptionFailure& e) { e.getParser().displayErrors (stdout); e.getParser().displayHelp (stdout); return EXIT_FAILURE; } return EXIT_SUCCESS; }
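// The 'SaveAsFasta' helper called above is not defined in this snippet and would have to be
// declared before main. Below is a minimal sketch of what it could look like, relying only on
// the IBank / BankFasta / LOCAL facilities already used in these examples; the implementation
// shipped with the original example may differ.
void SaveAsFasta (IBank* bank, const std::string& uri)
{
    // We create the FASTA output bank and make sure it is released when leaving the scope.
    IBank* output = new BankFasta (uri);
    LOCAL (output);

    // We copy every sequence of the input bank into the output bank.
    bank->iterate ([&] (Sequence& s)  {  output->insert (s);  });

    // We flush the output bank so that all sequences are written to disk.
    output->flush ();
}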
TraversalKind getTraversalKind (int argc, char* argv[])
{
    const char* STR_TRAVERSAL_MODE = "-traversal";

    TraversalKind result;

    // We create a command line parser.
    OptionsParser parser ("Traversal");
    parser.push_back (new OptionOneParam (STR_TRAVERSAL_MODE, "traversal mode ('unitig' or 'contig')", true));

    // We retrieve the traversal kind.
    try
    {
        IProperties* props = parser.parse (argc, argv);

        parse (props->getStr(STR_TRAVERSAL_MODE), result);
    }
    catch (OptionFailure& e)
    {
        e.displayErrors (std::cout);
        exit (EXIT_FAILURE);
    }
    catch (Exception& e)
    {
        std::cout << e.getMessage() << std::endl;
        exit (EXIT_FAILURE);
    }

    return result;
}
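// A possible way to call getTraversalKind from a main function. The body below is only
// illustrative: it shows that the parsed enum value can be used afterwards; here it is
// simply dumped as its integer code to avoid assuming any string-conversion helper.
int main (int argc, char* argv[])
{
    // We parse the '-traversal' option into a TraversalKind value.
    TraversalKind traversalKind = getTraversalKind (argc, argv);

    // We dump the chosen traversal mode (as its enum integer code).
    std::cout << "traversal kind code is " << static_cast<int>(traversalKind) << std::endl;

    return EXIT_SUCCESS;
}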
int main (int argc, char* argv[]) { // We create a command line parser. OptionsParser parser ("SortingCount"); parser.push_back (new OptionOneParam (STR_URI_INPUT, "sorting count input", true)); try { // Shortcuts. typedef Kmer<>::Count Count; typedef Kmer<>::Type Type; // We parse the user options. IProperties* options = parser.parse (argc, argv); // We load the object storing the couples [kmer,abundance] Storage* storage = StorageFactory(STORAGE_HDF5).load (options->getStr(STR_URI_INPUT)); LOCAL (storage); // We get the group inside the storage object Group& dskGroup = storage->getGroup("dsk"); // We retrieve the partition holding the couples [kmer,abundance] Partition<Count>& solidKmers = dskGroup.getPartition<Count> ("solid"); // Now, we read the couples in two ways, computing a checksum in each case. Type checksum1, checksum2; // CASE 1: we read the couples [kmer,abundance] with an iterator over the whole partition Iterator<Count>* it = solidKmers.iterator(); LOCAL (it); for (it->first(); !it->isDone(); it->next()) { checksum1 = checksum1 + it->item().value; } // CASE 2: we read the couples [kmer,abundance] with an iterator over each collection of the partition for (size_t i=0; i<solidKmers.size(); i++) { // We get the current collection inside the partition Collection<Count>& collection = solidKmers [i]; Iterator<Count>* it = collection.iterator(); LOCAL (it); for (it->first(); !it->isDone(); it->next()) { checksum2 = checksum2 + it->item().value; } } // We check that we got the same checksum cout << "checksum1=" << checksum1 << endl; cout << "checksum2=" << checksum1 << endl; } catch (OptionFailure& e) { return e.displayErrors (std::cout); } catch (Exception& e) { std::cerr << "EXCEPTION: " << e.getMessage() << std::endl; } return EXIT_SUCCESS; }
int main (int argc, char* argv[]) { /** We create a command line parser. */ OptionsParser parser ("BankFilter"); parser.push_back (new OptionOneParam (STR_URI_INPUT, "bank reference", true)); parser.push_back (new OptionOneParam (STR_URI_SEQ_IDS, "file holding indexes of bank", true)); try { /** We parse the user options. */ IProperties* options = parser.parse (argc, argv); /** We read the list of indexes. */ set<size_t> indexes; FILE* file = fopen (options->getStr(STR_URI_SEQ_IDS).c_str(), "r"); if (file != 0) { char buffer[128]; while (fgets (buffer, sizeof(buffer), file)) { indexes.insert (atoi(buffer)); } fclose (file); } cout << "found " << indexes.size() << " indexes" << endl; /** We open the output bank. */ string outputBankUri = options->getStr(STR_URI_INPUT) + "_" + System::file().getBaseName (options->getStr(STR_URI_SEQ_IDS)); IBank* outputBank = Bank::open (outputBankUri); LOCAL (outputBank); /** We loop the input bank. */ IBank* inputBank = Bank::open (options->getStr(STR_URI_INPUT)); LOCAL (inputBank); /** We use another iterator for filtering out some sequences. */ FilterIterator<Sequence,FilterFunctor> itSeq (inputBank->iterator(), FilterFunctor(indexes)); /** We loop the sequences. */ for (itSeq.first(); !itSeq.isDone(); itSeq.next()) { outputBank->insert (itSeq.item()); } /** We flush the output bank. */ outputBank->flush(); } catch (OptionFailure& e) { return e.displayErrors (cout); } catch (Exception& e) { cerr << "EXCEPTION: " << e.getMessage() << endl; } }
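// The 'FilterFunctor' used with FilterIterator above is not defined in this snippet and must
// appear before main. Below is a minimal sketch consistent with the way it is used: a sequence
// is kept only if its index belongs to the set read from the ids file. The member name and the
// use of Sequence::getIndex() are assumptions; the original example may define it differently.
struct FilterFunctor
{
    FilterFunctor (const set<size_t>& indexes) : _indexes(indexes) {}

    // Called by FilterIterator for each sequence; returning true keeps the sequence.
    bool operator() (Sequence& seq) const
    {
        return _indexes.find (seq.getIndex()) != _indexes.end();
    }

    const set<size_t>& _indexes;
};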
int main (int argc, char* argv[]) { /** We create a command line parser. */ OptionsParser parser ("BankStats"); parser.push_back (new OptionOneParam (STR_URI_INPUT, "bank input", true)); try { /** We parse the user options. */ IProperties* options = parser.parse (argc, argv); std::string filename = options->getStr(STR_URI_INPUT); //! [snippet16_bank] // We get an instance of IBank from the URI. IBank* bank = Bank::open (filename); //! [snippet16_seq] // We create an iterator on the bank Iterator<Sequence>* it = bank->iterator(); // We iterate the sequences of the bank for (it->first(); !it->isDone(); it->next()) { // We get a shortcut on the current sequence and its data Sequence& seq = it->item(); Data& data = seq.getData(); // We dump some information about the sequence. std::cout << "comment " << seq.getComment() << std::endl; // We dump each nucleotide. NOTE: the output depends on the data encoding for (size_t i=0; i<data.size(); i++) { std::cout << data[i]; } std::cout << std::endl; } //! [snippet16_seq] // The bank and the iterator have been allocated on the heap, so we have to delete them delete it; delete bank; //! [snippet16_bank] } catch (OptionFailure& e) { return e.displayErrors (std::cout); } catch (Exception& e) { std::cerr << "EXCEPTION: " << e.getMessage() << std::endl; } }
int main (int argc, char* argv[]) { /** We create a command line parser. */ OptionsParser parser ("BankStats"); parser.push_back (new OptionOneParam (STR_URI_INPUT, "bank input", true)); try { /** We parse the user options. */ IProperties* options = parser.parse (argc, argv); // We get information about the bank. u_int64_t nbSequences=0, dataSize=0, seqMaxSize=0, seqMinSize=~0; // We declare an input Bank and use it locally IBank* inputBank = Bank::open (options->getStr(STR_URI_INPUT)); LOCAL (inputBank); ProgressIterator<Sequence> it (*inputBank, "iterate"); for (it.first(); !it.isDone(); it.next()) { Data& data = it.item().getData(); nbSequences ++; if (data.size() > seqMaxSize) { seqMaxSize = data.size(); } if (data.size() < seqMinSize) { seqMinSize = data.size(); } dataSize += data.size (); } std::cout << "data size : " << dataSize << std::endl; std::cout << "sequence number : " << nbSequences << std::endl; std::cout << "sequence max size : " << seqMaxSize << std::endl; std::cout << "sequence min size : " << seqMinSize << std::endl; } catch (OptionFailure& e) { return e.displayErrors (std::cout); } catch (Exception& e) { std::cerr << "EXCEPTION: " << e.getMessage() << std::endl; } }
int main (int argc, char* argv[]) { /** We create a command line parser. */ OptionsParser parser ("BankDump"); parser.push_back (new OptionOneParam (STR_URI_INPUT, "bank input", true)); try { /** We parse the user options. */ IProperties* options = parser.parse (argc, argv); /** We dump the bank hierarchy. */ dump (Bank::open (options->getStr(STR_URI_INPUT))); } catch (OptionFailure& e) { return e.displayErrors (std::cout); } catch (Exception& e) { std::cerr << "EXCEPTION: " << e.getMessage() << std::endl; } }
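// The 'dump' function used above is not shown in this snippet and must be declared before main.
// A minimal sketch is given below: it only prints the bank identifier and the estimations
// returned by IBank::estimate (both used elsewhere in these examples). The original helper
// presumably also recurses into composite banks (e.g. albums) to print the whole hierarchy,
// which is not reproduced here.
void dump (IBank* bank)
{
    // We make sure the bank is released when leaving the scope.
    LOCAL (bank);

    // We retrieve rough estimations about the bank content.
    u_int64_t number=0, totalSize=0, maxSize=0;
    bank->estimate (number, totalSize, maxSize);

    // We dump the gathered information.
    std::cout << "bank '" << bank->getId() << "'"
              << "  sequences (estim.): "    << number
              << "  data size (estim.): "    << totalSize
              << "  max seq size (estim.): " << maxSize
              << std::endl;
}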
int main (int argc, char* argv[])
{
    const size_t SPAN = KMER_SPAN(1);

    /** Shortcuts. */
    typedef Kmer<SPAN>::Type                        Type;
    typedef Kmer<SPAN>::Count                       Count;
    typedef Kmer<SPAN>::ModelCanonical              ModelCanon;
    typedef Kmer<SPAN>::ModelMinimizer <ModelCanon> Model;

    size_t kmerSize = 33;
    size_t mmerSize = 11;

    /** We create a command line parser. */
    OptionsParser parser ("GraphStats");
    parser.push_back (new OptionOneParam (STR_URI_INPUT, "bank input", true));

    try
    {
        /** We parse the user options. */
        IProperties* options = parser.parse (argc, argv);

        string filename = options->getStr (STR_URI_INPUT);

        /** We create the solid kmers. */
        Graph graph = Graph::create ("-in %s -kmer-size %d -bloom none -out toto.h5 -abundance-min 1",
            filename.c_str(), kmerSize);

        /** We get the information of the solid kmers from the HDF5 file. */
        Storage* storage = StorageFactory(STORAGE_HDF5).load ("toto.h5");
        LOCAL (storage);
        Group& dskGroup = storage->getGroup("dsk");

        /** We get the solid kmers partition. */
        Partition<Count>& partition = dskGroup.getPartition<Count> ("solid");

        /** We create two kmers models. */
        Model model   (kmerSize,   mmerSize);
        Model modelK1 (kmerSize-1, mmerSize);

        // We declare an output Bank.
        BankBinary outputBank (System::file().getBaseName(filename) + ".bin");

        /** We create a sequence with ASCII encoding. */
        Sequence seq (Data::ASCII);

        /** We get an iterator over the [kmer,abundance] couples of the solid kmers. */
        Iterator<Count>* it = partition.iterator();
        LOCAL (it);

        /** We iterate the solid kmers. */
        for (it->first(); !it->isDone(); it->next())
        {
            Type current = it->item().value;

            cout << "kmer="       << it->item().value
                 << " minimizer=" << model.getMinimizerValue(current)
                 << " abundance=" << it->item().abundance << endl;

            /** We interpret the kmer value as a Data object. */
            Data data (Data::BINARY);
            data.setRef ((char*) &current, model.getKmerSize());

            modelK1.iterate (data, [&] (const Model::Kmer& k, size_t idx)
            {
                /** Shortcut. */
                Type minimizerCurrent = k.minimizer().value();

                cout << "-> " << k.value()
                     << " minimizer=" << minimizerCurrent
                     << " "           << modelK1.getMmersModel().toString (minimizerCurrent) << endl;

                string tmp = modelK1.getMmersModel().toString (minimizerCurrent);

                /** We interpret the minimizer value as a Data object. */
                seq.getData().setRef ((char*)tmp.c_str(), modelK1.getMmersModel().getKmerSize());

                /** We insert the sequence into the binary bank. */
                outputBank.insert (seq);
            });
        }

        /** We flush the output bank. */
        outputBank.flush();

        /** We iterate the output bank. */
        outputBank.iterate ([&] (const Sequence& s)
        {
            /** We get the kmer corresponding to the current sequence. */
            ModelCanon::Kmer mini = modelK1.getMmersModel().codeSeed (s.getDataBuffer(), Data::BINARY);

            cout << "mini=" << mini.value() << " " << modelK1.getMmersModel().toString (mini.value()) << endl;
        });
    }
    catch (OptionFailure& e)
    {
        return e.displayErrors (std::cout);
    }
    catch (Exception& e)
    {
        std::cerr << "EXCEPTION: " << e.getMessage() << std::endl;
    }
}
int main (int argc, char* argv[]) { /** We create a command line parser. */ OptionsParser parser; parser.push_back (new OptionOneParam (STR_URI_INPUT, "graph file", true)); IProperties* params = 0; try { /** We parse the user options. */ params = parser.parse (argc, argv); } catch (OptionFailure& e) { e.getParser().displayErrors (stdout); e.getParser().displayHelp (stdout); return EXIT_FAILURE; } // We create the graph with the bank and other options Graph graph = Graph::load (params->getStr(STR_URI_INPUT)); // We create a graph marker. GraphMarker<BranchingNode> marker (graph); // We create an object for Breadth First Search for the de Bruijn graph. BFS<BranchingNode> bfs (graph); // We want to compute the distribution of connected components of the branching nodes. // - key is a connected component class (for a given number of branching nodes for this component) // - value is the number of times this component class occurs in the branching sub graph map<size_t,Entry> distrib; // We get an iterator for all nodes of the graph. We use a progress iterator to get some progress feedback ProgressGraphIterator<BranchingNode,ProgressTimer> itBranching (graph.iterator<BranchingNode>(), "statistics"); // We want to know the number of connected components size_t nbConnectedComponents = 0; // We define some kind of unique identifier for a couple (indegree,outdegree) map <InOut_t, size_t> topology; size_t simplePathSizeMin = ~0; size_t simplePathSizeMax = 0; // We want time duration of the iteration TimeInfo ti; ti.start ("compute"); // We loop the branching nodes for (itBranching.first(); !itBranching.isDone(); itBranching.next()) { // We get branching nodes neighbors for the current branching node. Graph::Vector<BranchingEdge> successors = graph.successors <BranchingEdge> (*itBranching); Graph::Vector<BranchingEdge> predecessors = graph.predecessors<BranchingEdge> (*itBranching); // We increase the occurrences number for the current couple (in/out) neighbors topology [make_pair(predecessors.size(), successors.size())] ++; // We loop the in/out neighbors and update min/max simple path size for (size_t i=0; i<successors.size(); i++) { simplePathSizeMax = std::max (simplePathSizeMax, successors[i].distance); simplePathSizeMin = std::min (simplePathSizeMin, successors[i].distance); } for (size_t i=0; i<predecessors.size(); i++) { simplePathSizeMax = std::max (simplePathSizeMax, predecessors[i].distance); simplePathSizeMin = std::min (simplePathSizeMin, predecessors[i].distance); } // We skip already visited nodes. if (marker.isMarked (*itBranching)) { continue; } // We launch the breadth first search; we get as a result the set of branching nodes in this component const set<BranchingNode>& component = bfs.run (*itBranching); // We mark the nodes for this connected component marker.mark (component); // We update our distribution distrib[component.size()].nbOccurs += 1; // We update the number of connected components. nbConnectedComponents++; } ti.stop ("compute"); // We compute the total number of branching nodes in all connected components. size_t sumOccurs = 0; size_t sumKmers = 0; for (map<size_t,Entry>::iterator it = distrib.begin(); it != distrib.end(); it++) { sumOccurs += it->first*it->second.nbOccurs; sumKmers += it->second.nbKmers; } // We sort the statistics by decreasing occurrence numbers. Since map have its own ordering, we need to put all // the data into a vector and sort it with our own sorting criteria. 
vector < pair<InOut_t,size_t> > stats; for (map <InOut_t, size_t>::iterator it = topology.begin(); it != topology.end(); it++) { stats.push_back (*it); } sort (stats.begin(), stats.end(), CompareFct); // Note: it must be equal to the number of branching nodes of the graph assert (sumOccurs == itBranching.size()); // We aggregate the computed information Properties props ("topology"); props.add (1, "graph"); props.add (2, "name", "%s", graph.getName().c_str()); props.add (2, "db_input", "%s", graph.getInfo().getStr("input").c_str()); props.add (2, "db_nb_seq", "%d", graph.getInfo().getInt("sequences_number")); props.add (2, "db_size", "%d", graph.getInfo().getInt("sequences_size")); props.add (2, "kmer_size", "%d", graph.getInfo().getInt("kmer_size")); props.add (2, "kmer_nks", "%d", graph.getInfo().getInt("nks")); props.add (2, "nb_nodes", "%d", graph.getInfo().getInt("kmers_nb_solid")); props.add (2, "nb_branching_nodes", "%d", graph.getInfo().getInt("nb_branching")); props.add (2, "percent_branching_nodes", "%.1f", graph.getInfo().getInt("kmers_nb_solid") > 0 ? 100.0 * (float)graph.getInfo().getInt("nb_branching") / (float) graph.getInfo().getInt("kmers_nb_solid") : 0 ); props.add (1, "branching_nodes"); props.add (2, "simple_path"); props.add (3, "size_min", "%d", simplePathSizeMin); props.add (3, "size_max", "%d", simplePathSizeMax); props.add (2, "neighborhoods"); for (size_t i=0; i<stats.size(); i++) { props.add (3, "neighborhood", "in=%d out=%d", stats[i].first.first, stats[i].first.second); props.add (4, "nb_bnodes", "%d", stats[i].second); props.add (4, "percentage", "%5.2f", itBranching.size() > 0 ? 100.0*(float)stats[i].second / (float)itBranching.size() : 0 ); } props.add (2, "connected_components"); props.add (3, "nb_classes", "%d", distrib.size()); props.add (3, "nb_components", "%d", nbConnectedComponents); for (map<size_t,Entry>::iterator it = distrib.begin(); it!=distrib.end(); it++) { props.add (3, "component_class"); props.add (4, "nb_occurs", "%d", it->second.nbOccurs); props.add (4, "nb_bnodes", "%d", it->first); props.add (4, "freq_bnodes", "%f", sumOccurs > 0 ? 100.0*(float)(it->first*it->second.nbOccurs) / (float)sumOccurs : 0 ); } props.add (1, ti.getProperties("time")); // We dump the results in a XML file in the current directory XmlDumpPropertiesVisitor v (graph.getName() + ".xml", false); props.accept (&v); return EXIT_SUCCESS; }
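// The 'InOut_t' type, the 'Entry' structure and the 'CompareFct' comparator used above are not
// defined in this snippet; they are expected at file scope, before main. Below is a minimal
// sketch consistent with the call sites (the InOut_t typedef also appears in the multi-core
// variant of this example); the member names of Entry are assumptions.
typedef pair<size_t,size_t> InOut_t;   // couple (indegree, outdegree)

struct Entry
{
    Entry () : nbOccurs(0), nbKmers(0) {}
    size_t nbOccurs;   // number of connected components having this class size
    size_t nbKmers;    // optional kmer count associated with this class
};

// Sorting criterion: decreasing number of occurrences of an (in,out) couple.
bool CompareFct (const pair<InOut_t,size_t>& a, const pair<InOut_t,size_t>& b)
{
    return a.second > b.second;
}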
int main (int argc, char* argv[]) { /** We create a command line parser. */ OptionsParser parser ("GraphStats"); parser.push_back (new OptionOneParam (STR_URI_GRAPH, "graph input", true)); parser.push_back (new OptionOneParam (STR_NB_CORES, "nb cores", false, "0")); try { /** We parse the user options. */ IProperties* options = parser.parse (argc, argv); // We load the graph Graph graph = Graph::load (options->getStr(STR_URI_GRAPH)); // We set the number of cores to be used. Use all available cores if set to 0. size_t nbCores = options->getInt(STR_NB_CORES); // We get an iterator for branching nodes of the graph. // We use a progress iterator to get some progress feedback ProgressGraphIterator<BranchingNode,ProgressTimer> itBranching (graph.iterator<BranchingNode>(), "statistics"); // We define some kind of unique identifier for a couple (indegree,outdegree) typedef pair<size_t,size_t> InOut_t; // We want to gather some statistics during the iteration. // Note the use of ThreadObject: this object will be cloned N times (one object per thread) and each clone will // be reachable within the iteration block through ThreadObject::operator() ThreadObject <map <InOut_t, size_t> > topology; // We dispatch the iteration on several cores. Note the usage of lambda expression here. IDispatcher::Status status = Dispatcher(nbCores).iterate (itBranching, [&] (const BranchingNode& node) { // We retrieve the current instance of map <InOut_t,size_t> for the current running thread. map <InOut_t,size_t>& localTopology = topology(); // We get branching nodes neighbors for the current branching node. Graph::Vector<BranchingEdge> successors = graph.successors <BranchingEdge> (node); Graph::Vector<BranchingEdge> predecessors = graph.predecessors<BranchingEdge> (node); // We increase the occurrences number for the current couple (in/out) neighbors localTopology [make_pair(predecessors.size(), successors.size())] ++; }); // Now, the parallel processing is done. We want now to aggregate the information retrieved // in each thread in a single map. // We get each map<InOut_t,size_t> object filled in each thread, and we add its data into the "global" map. // The global map is reachable through the ThreadObject::operator*. The "topology.foreach" will loop over // all cloned object used in the threads. topology.foreach ([&] (const map <InOut_t, size_t>& t) { // We update the occurrence of the current couple (in/out) for_each (t.begin(), t.end(), [&] (const pair<InOut_t, size_t>& p) { (*topology)[p.first] += p.second; }); }); // We sort the statistics by decreasing occurrence numbers. Since map have its own ordering, we need to put all // the data into a vector and sort it with our own sorting criteria. 
vector < pair<InOut_t,size_t> > stats; for (auto it = topology->begin(); it != topology->end(); it++) { stats.push_back (*it); } sort (stats.begin(), stats.end(), [=] (const pair<InOut_t,size_t>& a, const pair<InOut_t,size_t>& b) { return a.second > b.second; }); printf ("\nThere are %d branching nodes with the following distribution: \n", itBranching.size()); size_t sum=0; for (size_t i=0; i<stats.size(); i++) { sum += stats[i].second; printf (" [in=%d out=%d] nb=%7d percent=%5.2f distrib=%5.2f\n", stats[i].first.first, stats[i].first.second, stats[i].second, 100.0*(float)stats[i].second / (float)itBranching.size(), 100.0*(float)sum / (float)itBranching.size() ); } printf ("\nDone on %d cores in %.2f sec\n\n", status.nbCores, (float)status.time/1000.0); } catch (OptionFailure& e) { return e.displayErrors (std::cout); } catch (Exception& e) { std::cerr << "EXCEPTION: " << e.getMessage() << std::endl; } return EXIT_SUCCESS; }
int main (int argc, char* argv[]) { /** We create a command line parser. */ OptionsParser parser ("KmerTest"); parser.push_back (new OptionOneParam (STR_URI_INPUT, "bank input", true)); parser.push_back (new OptionOneParam (STR_KMER_SIZE, "kmer size", true)); parser.push_back (new OptionOneParam (STR_MINIMIZER_SIZE, "minimizer size", true)); parser.push_back (new OptionNoParam (STR_VERBOSE, "display kmers", false)); try { /** We parse the user options. */ IProperties* options = parser.parse (argc, argv); // We get the kmer and minimizer sizes. size_t kmerSize = options->getInt(STR_KMER_SIZE); size_t mmerSize = options->getInt(STR_MINIMIZER_SIZE); // We define a try/catch block in case some method fails (bad filename for instance) u_int64_t nbKmers = 0; bool display = options->get(STR_VERBOSE) != 0; // We declare a Bank instance defined by a list of filenames IBank* bank = Bank::open (options->getStr(STR_URI_INPUT)); LOCAL (bank); // We declare a kmer model and a minimizer model Model model (kmerSize, mmerSize); // We get a reference on the minimizer model, which will be useful for dumping const ModelMinimizer::Model& modelMinimizer = model.getMmersModel(); Kmer<span>::Type checksum; size_t nbChanged = 0; size_t nbInvalid = 0; // We define an iterator that encapsulates the sequences iterator with progress feedback ProgressIterator<Sequence> iter (*bank, "iterate bank"); // We loop over sequences. for (iter.first(); !iter.isDone(); iter.next()) { // Shortcut Sequence& seq = iter.item(); //! [snippet1_iterate] // We iterate the kmers (and minimizers) of the current sequence. model.iterate (seq.getData(), [&] (const Model::Kmer& kmer, size_t idx) { nbKmers ++; if (kmer.hasChanged() == true) { nbChanged++; } if (kmer.isValid() == false) { nbInvalid++; } checksum += kmer.minimizer().value(); }); //! [snippet1_iterate] } cout << "nbKmers : " << nbKmers << endl; cout << "nbInvalid : " << nbInvalid << endl; cout << "nbChanged : " << nbChanged << endl; cout << "ratio : " << (nbChanged > 0 ? (double)nbKmers / (double)nbChanged : 0) << endl; cout << "checksum : " << checksum << endl; } catch (OptionFailure& e) { return e.displayErrors (std::cout); } catch (Exception& e) { std::cerr << "EXCEPTION: " << e.getMessage() << std::endl; } return EXIT_SUCCESS; }
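// The 'span', 'Model' and 'ModelMinimizer' shortcuts used above are not defined in this snippet;
// they are expected at file scope, before main. A plausible set of definitions, following the
// pattern of the other kmer example in this document, is sketched below. The span value and the
// 'Model' alias are assumptions; the original example may declare them differently.
static const size_t span = KMER_SPAN(1);

typedef Kmer<span>::ModelCanonical                 ModelCanonical;
typedef Kmer<span>::ModelMinimizer<ModelCanonical> ModelMinimizer;
typedef ModelMinimizer                             Model;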
int main (int argc, char* argv[]) { /** We create a command line parser. */ OptionsParser parser ("GraphStats"); parser.push_back (new OptionOneParam (STR_URI_GRAPH, "graph input", true)); try { /** We parse the user options. */ IProperties* options = parser.parse (argc, argv); // We load the graph Graph graph = Graph::load (options->getStr(STR_URI_GRAPH)); // We create a graph marker. GraphMarker marker (graph); // We create an object for Breadth First Search for the de Bruijn graph. BFS bfs (graph); // We want to compute the distribution of connected components of the branching nodes. // - key is a connected component class (for a given number of branching nodes for this component) // - value is the number of times this component class occurs in the branching sub graph map<size_t,size_t> distrib; // We get an iterator for all nodes of the graph. We use a progress iterator to get some progress feedback ProgressGraphIterator<BranchingNode,ProgressTimer> itBranching (graph.iteratorBranching(), "statistics"); // We want time duration of the iteration TimeInfo ti; ti.start ("compute"); // We need to keep each connected component. list<set<BranchingNode> > components; // We loop the branching nodes for (itBranching.first(); !itBranching.isDone(); itBranching.next()) { // We skip already visited nodes. if (marker.isMarked (*itBranching)) { continue; } // We launch the breadth first search; we get as a result the set of branching nodes in this component const set<BranchingNode>& component = bfs.run (*itBranching); // We memorize the component components.push_back (component); // We mark the nodes for this connected component marker.mark (component); // We update our distribution distrib[component.size()] ++; } ti.stop ("compute"); // We compute the total number of branching nodes in all connected components. size_t sum = 0; for (map<size_t,size_t>::iterator it = distrib.begin(); it != distrib.end(); it++) { sum += it->first*it->second; } // Note: it must be equal to the number of branching nodes of the graph assert (sum == itBranching.size()); size_t idx1=0; size_t cc=0; // We check that each component has no intersection with all other components. // Note: this check may take a long time since we have N^2 intersections to compute. 
for (list<set<BranchingNode> >::iterator it1 = components.begin(); it1 != components.end(); it1++, idx1++) { size_t idx2=0; for (list<set<BranchingNode> >::iterator it2 = components.begin(); it2 != components.end(); it2++, idx2++) { if (it1 != it2) { set<BranchingNode> inter; set_intersection (it1->begin(),it1->end(),it2->begin(),it2->end(), std::inserter(inter,inter.begin())); if (inter.size()!=0) { printf ("ERROR, intersection should be empty...\n"); exit(EXIT_FAILURE); } } if (++cc % 50 == 0) { cc = 0; printf ("[check] %.1f %.1f\r", 100.0*(float)idx1/(float)components.size(), 100.0*(float)idx2/(float)components.size()); fflush (stdout); } } } printf ("\n"); // We aggregate the computed information Properties props ("connected_components"); props.add (1, "graph_name", "%s", graph.getName().c_str()); props.add (1, "nb_branching_nodes", "%d", sum); props.add (1, "nb_connected_components", "%d", distrib.size()); for (map<size_t,size_t>::iterator it = distrib.begin(); it!=distrib.end(); it++) { props.add (2, "component"); props.add (3, "nb_nodes", "%d", it->first); props.add (3, "nb_occurs", "%d", it->second); props.add (3, "freq_nodes", "%f", 100.0*(float)(it->first*it->second) / (float)sum); props.add (3, "freq_occurs", "%f", 100.0*(float)it->second / (float)sum); } props.add (1, ti.getProperties("time")); // We dump the results in a XML file in the current directory XmlDumpPropertiesVisitor v (graph.getName() + ".xml", false); props.accept (&v); } catch (OptionFailure& e) { return e.displayErrors (std::cout); } catch (Exception& e) { std::cerr << "EXCEPTION: " << e.getMessage() << std::endl; } return EXIT_SUCCESS; }
int main (int argc, char* argv[]) { /** We create a command line parser. */ OptionsParser parser ("BankSplitter"); parser.push_back (new OptionOneParam (STR_URI_INPUT, "bank reference", true)); parser.push_back (new OptionOneParam (STR_MAX_INPUT_SIZE, "average db size per split", true)); parser.push_back (new OptionOneParam (STR_URI_OUTPUT_DIR, "output directory", false, ".")); parser.push_back (new OptionNoParam (STR_OUTPUT_FASTQ, "fastq output", false)); parser.push_back (new OptionNoParam (STR_OUTPUT_GZ, "gzip output", false)); // We define a try/catch block in case some method fails (bad filename for instance) try { /** We parse the user options. */ IProperties* options = parser.parse (argc, argv); /** Shortcuts. */ u_int64_t maxDbSize = options->getInt(STR_MAX_INPUT_SIZE); // We declare an input Bank IBank* inputBank = Bank::open (options->getStr(STR_URI_INPUT)); LOCAL (inputBank); // We get the basename of the input bank. string inputBasename = System::file().getBaseName (options->getStr(STR_URI_INPUT)); /** We set the name of the output directory. */ stringstream ss; ss << inputBasename << "_S" << maxDbSize; string outputDirName = ss.str(); /** We create the output directory. */ string outputDir = options->getStr(STR_URI_OUTPUT_DIR) + "/" + outputDirName; System::file().mkdir (outputDir, S_IRWXU); // We create the album bank. BankAlbum album (outputDir + "/album.txt"); /** We get estimations about the bank. */ u_int64_t number, totalSize, maxSize; inputBank->estimate (number, totalSize, maxSize); u_int64_t estimationNbSeqToIterate = number; // We create an iterator over the input bank ProgressIterator<Sequence> itSeq (*inputBank, "split"); // We loop over sequences to get the exact number of sequences. int64_t nbBanksOutput = -1; u_int64_t nbSequences = 0; u_int64_t dbSize = ~0; bool isFastq = options->get(STR_OUTPUT_FASTQ) != 0; bool isGzipped = options->get(STR_OUTPUT_GZ) != 0; IBank* currentBank = 0; for (itSeq.first(); !itSeq.isDone(); itSeq.next()) { if (dbSize > maxDbSize) { if (currentBank != 0) { currentBank->flush(); currentBank->finalize(); } nbBanksOutput ++; /** We build the uri of the current bank. */ stringstream ss; ss << inputBasename << "_" << nbBanksOutput << (isFastq ? ".fastq" : ".fasta"); if (isGzipped) { ss << ".gz"; } /** We create a new bank and put it in the album. */ currentBank = album.addBank (outputDir, ss.str(), isFastq, isGzipped); /** We reinit the db size counter. */ dbSize = 0; } dbSize += itSeq->getDataSize(); /** We insert the sequence into the current output bank. */ currentBank->insert (*itSeq); } if (currentBank != 0) { currentBank->flush(); } } catch (OptionFailure& e) { return e.displayErrors (cout); } catch (Exception& e) { cerr << "EXCEPTION: " << e.getMessage() << endl; } }