예제 #1
0
	void parseArgs(){

		_options = getInput();

		//_sketchSize = _options->getInt(STR_SIMKA_SKETCH_SIZE);
	    _nbCores = _options->getInt(STR_NB_CORES);
		_inputFilename1 = _options->getStr(STR_SIMKA_URI_INPUT_1);
		_inputFilename2 = _options->getStr(STR_SIMKA_URI_INPUT_2);
		_outputDir = _options->getStr(STR_URI_OUTPUT);

		_start_i = _options->getInt("-start-i");
		_start_j = _options->getInt("-start-j");
		_n_i = _options->getInt("-n-i");
		_n_j = _options->getInt("-n-j");

		//_kmerSize = _options->getInt(STR_KMER_SIZE);


		if(!System::file().doesExist(_outputDir)){
			int ok = System::file().mkdir(_outputDir, -1);
			if(ok != 0){
				  std::cerr << "Error: can't create output directory (" << _outputDir << ")" << std::endl;
				  exit(1);
			}
		}
	}
예제 #2
0
파일: bankgen.cpp 프로젝트: GATB/gatb-core
int main (int argc, char* argv[])
{
    /** We create a command line parser. */
    OptionsParser parser ("bankgen");

    const char* OUTPUT_PREFIX = "-out";
    const char* SEQ_LEN       = "-seq-len";
    const char* READ_LEN      = "-read-len";
    const char* OVERLAP_LEN   = "-overlap-len";
    const char* COVERAGE      = "-coverage";

    parser.push_back (new OptionOneParam (OUTPUT_PREFIX,  "output prefix",               true));
    parser.push_back (new OptionOneParam (SEQ_LEN,        "sequence length",             false,  "1000000"));
    parser.push_back (new OptionOneParam (READ_LEN,       "read length",                 false,  "150" ));
    parser.push_back (new OptionOneParam (OVERLAP_LEN,    "overlap between two reads",   false,  "50" ));
    parser.push_back (new OptionOneParam (COVERAGE,       "coverage",                    false,  "3" ));

    try
    {
        /** We parse the user options. */
        IProperties* options = parser.parse (argc, argv);

        /** We create the random sequence. */
        IBank* randomBank = new BankRandom (1, options->getInt(SEQ_LEN));
        LOCAL (randomBank);

        /** We create the reads bank. */
        IBank* readsBank = new BankSplitter (
            randomBank,
            options->getInt(READ_LEN),
            options->getInt(OVERLAP_LEN),
            options->getInt(COVERAGE)
        );
        LOCAL (readsBank);

        /** We save the random bank. */
        SaveAsFasta (randomBank, options->getStr(OUTPUT_PREFIX) + "_sequence.fa");

        /** We save the reads bank. */
        SaveAsFasta (readsBank, options->getStr(OUTPUT_PREFIX) + "_reads.fa");
    }
    catch (OptionFailure& e)
    {
        e.getParser().displayErrors (stdout);
        e.getParser().displayHelp   (stdout);
        return EXIT_FAILURE;
    }

    return EXIT_SUCCESS;
}
예제 #3
0
int main (int argc, char* argv[])
{
    /** We create a command line parser. */
    OptionsParser parser ("GraphStats");
    parser.push_back (new OptionOneParam (STR_URI_GRAPH, "graph input",  true));
    parser.push_back (new OptionOneParam (STR_NB_CORES,  "nb cores",     false, "0"));

    try
    {
        /** We parse the user options. */
        IProperties* options = parser.parse (argc, argv);

        // We load the graph
        Graph graph = Graph::load (options->getStr(STR_URI_GRAPH));

        // We set the number of cores to be used. Use all available cores if set to 0.
        size_t nbCores = options->getInt(STR_NB_CORES);

        // We get an iterator for branching nodes of the graph.
        // We use a progress iterator to get some progress feedback
        ProgressGraphIterator<BranchingNode,ProgressTimer>  itBranching (graph.iterator<BranchingNode>(), "statistics");

        // We define some kind of unique identifier for a couple (indegree,outdegree)
        typedef pair<size_t,size_t> InOut_t;

        // We want to gather some statistics during the iteration.
        // Note the use of ThreadObject: this object will be cloned N times (one object per thread) and each clone will
        // be reachable within the iteration block through ThreadObject::operator()
        ThreadObject <map <InOut_t, size_t> > topology;

        // We dispatch the iteration on several cores. Note the usage of lambda expression here.
        IDispatcher::Status status = Dispatcher(nbCores).iterate (itBranching, [&] (const BranchingNode& node)
        {
            // We retrieve the current instance of map <InOut_t,size_t> for the current running thread.
            map <InOut_t,size_t>& localTopology = topology();

            // We get branching nodes neighbors for the current branching node.
            Graph::Vector<BranchingEdge> successors   = graph.successors  <BranchingEdge> (node);
            Graph::Vector<BranchingEdge> predecessors = graph.predecessors<BranchingEdge> (node);

            // We increase the occurrences number for the current couple (in/out) neighbors
            localTopology [make_pair(predecessors.size(), successors.size())] ++;
        });

        // Now, the parallel processing is done. We want now to aggregate the information retrieved
        // in each thread in a single map.

        // We get each map<InOut_t,size_t> object filled in each thread, and we add its data into the "global" map.
        // The global map is reachable through the ThreadObject::operator*. The "topology.foreach" will loop over
        // all cloned object used in the threads.
        topology.foreach ([&] (const map <InOut_t, size_t>& t)
        {
            // We update the occurrence of the current couple (in/out)
            for_each (t.begin(), t.end(), [&] (const pair<InOut_t, size_t>& p) { (*topology)[p.first] += p.second;  });
        });

        // We sort the statistics by decreasing occurrence numbers. Since map have its own ordering, we need to put all
        // the data into a vector and sort it with our own sorting criteria.
        vector < pair<InOut_t,size_t> >  stats;
        for (auto it = topology->begin(); it != topology->end(); it++)  { stats.push_back (*it); }
        sort (stats.begin(), stats.end(), [=] (const pair<InOut_t,size_t>& a, const pair<InOut_t,size_t>& b) { return a.second > b.second; });

        printf ("\nThere are %d branching nodes with the following distribution: \n", itBranching.size());

        size_t sum=0;
        for (size_t i=0; i<stats.size(); i++)
        {
            sum += stats[i].second;

            printf ("    [in=%d out=%d]  nb=%7d  percent=%5.2f  distrib=%5.2f\n",
                stats[i].first.first,
                stats[i].first.second,
                stats[i].second,
                100.0*(float)stats[i].second / (float)itBranching.size(),
                100.0*(float)sum             / (float)itBranching.size()
            );
        }

        printf ("\nDone on %d cores in %.2f sec\n\n", status.nbCores, (float)status.time/1000.0);
    }
    catch (OptionFailure& e)
    {
        return e.displayErrors (std::cout);
    }
    catch (Exception& e)
    {
        std::cerr << "EXCEPTION: " << e.getMessage() << std::endl;
    }

    return EXIT_SUCCESS;
}
예제 #4
0
int main (int argc, char* argv[])
{
    /** We create a command line parser. */
    OptionsParser parser ("KmerTest");
    parser.push_back (new OptionOneParam (STR_URI_INPUT,      "bank input",     true));
    parser.push_back (new OptionOneParam (STR_KMER_SIZE,      "kmer size",      true));
    parser.push_back (new OptionOneParam (STR_MINIMIZER_SIZE, "minimizer size", true));
    parser.push_back (new OptionNoParam  (STR_VERBOSE,        "display kmers",  false));

    try
    {
        /** We parse the user options. */
        IProperties* options = parser.parse (argc, argv);

        // We get the kmer and minimizer sizes.
        size_t kmerSize = options->getInt(STR_KMER_SIZE);
        size_t mmerSize = options->getInt(STR_MINIMIZER_SIZE);

        // We define a try/catch block in case some method fails (bad filename for instance)
        u_int64_t nbKmers       = 0;
        bool display = options->get(STR_VERBOSE) != 0;

        // We declare a Bank instance defined by a list of filenames
        IBank* bank = Bank::open (options->getStr(STR_URI_INPUT));
        LOCAL (bank);

        // We declare a kmer model and a minimizer model
        Model model (kmerSize, mmerSize);

        // We get a reference on the minimizer model, which will be useful for dumping
       const ModelMinimizer::Model& modelMinimizer = model.getMmersModel();

        Kmer<span>::Type checksum;
        size_t nbChanged = 0;
        size_t nbInvalid = 0;

        // We define an iterator that encapsulates the sequences iterator with progress feedback
        ProgressIterator<Sequence> iter (*bank, "iterate bank");

        // We loop over sequences.
        for (iter.first(); !iter.isDone(); iter.next())
        {
            // Shortcut
            Sequence& seq = iter.item();

//! [snippet1_iterate]
            // We iterate the kmers (and minimizers) of the current sequence.
            model.iterate (seq.getData(), [&] (const Model::Kmer& kmer, size_t idx)
            {
                nbKmers ++;
                if (kmer.hasChanged() == true)   { nbChanged++;  }
                if (kmer.isValid()    == false)  { nbInvalid++;  }
                checksum += kmer.minimizer().value();
            });
//! [snippet1_iterate]
        }

        cout << "nbKmers   : " << nbKmers   << endl;
        cout << "nbInvalid : " << nbInvalid << endl;
        cout << "nbChanged : " << nbChanged << endl;
        cout << "ratio     : " << (nbChanged > 0 ? (double)nbKmers / (double)nbChanged : 0) << endl;
        cout << "checksum  : " << checksum  << endl;
    }
    catch (OptionFailure& e)
    {
        return e.displayErrors (std::cout);
    }
    catch (Exception& e)
    {
        std::cerr << "EXCEPTION: " << e.getMessage() << std::endl;
    }

    return EXIT_SUCCESS;
}
예제 #5
0
int main (int argc, char* argv[])
{
    /** We create a command line parser. */
    OptionsParser parser ("BankSplitter");
    parser.push_back (new OptionOneParam (STR_URI_INPUT,      "bank reference",            true));
    parser.push_back (new OptionOneParam (STR_MAX_INPUT_SIZE, "average db size per split", true));
    parser.push_back (new OptionOneParam (STR_URI_OUTPUT_DIR, "output directory",          false, "."));
    parser.push_back (new OptionNoParam  (STR_OUTPUT_FASTQ,   "fastq output",              false));
    parser.push_back (new OptionNoParam  (STR_OUTPUT_GZ,      "gzip output",               false));

    // We define a try/catch block in case some method fails (bad filename for instance)
    try
    {
        /** We parse the user options. */
        IProperties* options = parser.parse (argc, argv);

        /** Shortcuts. */
        u_int64_t maxDbSize = options->getInt(STR_MAX_INPUT_SIZE);

        // We declare an input Bank
        IBank* inputBank = Bank::open (options->getStr(STR_URI_INPUT));
        LOCAL (inputBank);

        // We get the basename of the input bank.
        string inputBasename = System::file().getBaseName (options->getStr(STR_URI_INPUT));

        /** We set the name of the output directory. */
        stringstream ss;  ss << inputBasename << "_S" << maxDbSize;
        string outputDirName = ss.str();

        /** We create the output directory. */
        string outputDir = options->getStr(STR_URI_OUTPUT_DIR) + "/" + outputDirName;
        System::file().mkdir (outputDir, S_IRWXU);

        // We create the album bank.
        BankAlbum album (outputDir + "/album.txt");

        /** We get estimations about the bank. */
        u_int64_t number, totalSize, maxSize;
        inputBank->estimate (number, totalSize, maxSize);

        u_int64_t estimationNbSeqToIterate = number;

        // We create an iterator over the input bank
        ProgressIterator<Sequence> itSeq (*inputBank, "split");

        // We loop over sequences to get the exact number of sequences.
          int64_t nbBanksOutput = -1;
        u_int64_t nbSequences   =  0;
        u_int64_t dbSize        = ~0;

        bool isFastq   = options->get(STR_OUTPUT_FASTQ) != 0;
        bool isGzipped = options->get(STR_OUTPUT_GZ)    != 0;

        IBank* currentBank = 0;

        for (itSeq.first(); !itSeq.isDone(); itSeq.next())
        {
            if (dbSize > maxDbSize)
            {
                if (currentBank != 0)  { currentBank->flush();  currentBank->finalize(); }

                nbBanksOutput ++;

                /** We build the uri of the current bank. */
                stringstream ss;  ss << inputBasename << "_" << nbBanksOutput << (isFastq ? ".fastq" : ".fasta");
                if (isGzipped) { ss << ".gz"; }

                /** We create a new bank and put it in the album. */
                currentBank = album.addBank (outputDir, ss.str(), isFastq, isGzipped);

                /** We reinit the db size counter. */
                dbSize = 0;
            }

            dbSize += itSeq->getDataSize();

            /** We insert the sequence into the current output bank. */
            currentBank->insert (*itSeq);
        }

        if (currentBank != 0)  { currentBank->flush(); }
    }
    catch (OptionFailure& e)
    {
        return e.displayErrors (cout);
    }
    catch (Exception& e)
    {
        cerr << "EXCEPTION: " << e.getMessage() << endl;
    }
}