Пример #1
0
int main (int argc, char* argv[])
{
    // We get the file name from the user arguments
    const char* filename = argc >= 2 ? argv[1] : "";

    // We get information about the bank.
    u_int64_t nbSequences=0, dataSize=0, seqMaxSize=0, seqMinSize=~0;

    // We declare a Bank instance.
    IBank* bank = Bank::open (filename);
    LOCAL (bank);

    // IN A NEAR FUTURE, WE WILL HAVE STL LIKE ITERATORS.
#if 0
    for (BankFasta::iterator it = bank->begin(); it != bank->end(); ++it)
    {
        Sequence& seq = *it;

        Data& data = seq.getData();

        nbSequences ++;
        if (data.size() > seqMaxSize)  { seqMaxSize = data.size(); }
        if (data.size() < seqMinSize)  { seqMinSize = data.size(); }
        dataSize += data.size ();
    }
#endif

    std::cout << "data size         : " << dataSize     << std::endl;
    std::cout << "sequence number   : " << nbSequences  << std::endl;
    std::cout << "sequence max size : " << seqMaxSize   << std::endl;
    std::cout << "sequence min size : " << seqMinSize   << std::endl;
}
Пример #2
0
int main (int argc, char* argv[])
{
    /** We create a command line parser. */
    OptionsParser parser ("BankFilter");
    parser.push_back (new OptionOneParam (STR_URI_INPUT,    "bank reference",               true));
    parser.push_back (new OptionOneParam (STR_URI_SEQ_IDS,  "file holding indexes of bank", true));

    try
    {
        /** We parse the user options. */
        IProperties* options = parser.parse (argc, argv);

        /** We read the list of indexes. */
        set<size_t> indexes;
        FILE* file = fopen (options->getStr(STR_URI_SEQ_IDS).c_str(), "r");
        if (file != 0)
        {
            char buffer[128];
            while (fgets (buffer, sizeof(buffer), file))  {  indexes.insert (atoi(buffer));  }
            fclose (file);
        }

        cout << "found " << indexes.size() << " indexes" << endl;

        /** We open the output bank. */
        string outputBankUri = options->getStr(STR_URI_INPUT) + "_" + System::file().getBaseName (options->getStr(STR_URI_SEQ_IDS));
        IBank* outputBank = Bank::open (outputBankUri);
        LOCAL (outputBank);

        /** We loop the input bank. */
        IBank* inputBank = Bank::open (options->getStr(STR_URI_INPUT));
        LOCAL (inputBank);

        /** We use another iterator for filtering out some sequences. */
        FilterIterator<Sequence,FilterFunctor> itSeq (inputBank->iterator(), FilterFunctor(indexes));

        /** We loop the sequences. */
        for (itSeq.first(); !itSeq.isDone(); itSeq.next())
        {
            outputBank->insert (itSeq.item());
        }

        /** We flush the output bank. */
        outputBank->flush();
    }

    catch (OptionFailure& e)
    {
        return e.displayErrors (cout);
    }
    catch (Exception& e)
    {
        cerr << "EXCEPTION: " << e.getMessage() << endl;
    }
}
Пример #3
0
/**
 * BankFilter: copies into "<input uri>_filtered" the sequences of the input
 * bank whose ratio of 'A', 'C', 'G', 'T' letters over the sequence length is
 * strictly greater than a threshold (option STR_FILTER_RATIO, default "0.8").
 *
 * @param argc number of command line arguments
 * @param argv command line arguments, parsed by the OptionsParser
 * @return the value of OptionFailure::displayErrors on bad options,
 *         1 when an Exception is caught, 0 otherwise
 */
int main (int argc, char* argv[])
{
    /** We create a command line parser. */
    OptionsParser parser ("BankFilter");
    parser.push_back (new OptionOneParam (STR_URI_INPUT,     "bank input",   true));
    // The help text states what the code does: a sequence is KEPT when its ratio exceeds X.
    parser.push_back (new OptionOneParam (STR_FILTER_RATIO,  "keep a sequence if 'good letters number / seq.len > X'",   false, "0.8"));

    try
    {
        /** We parse the user options. */
        IProperties* options = parser.parse (argc, argv);

        /** Shortcuts. */
        double percentThreshold = options->getDouble(STR_FILTER_RATIO);

        /** We open the input bank. */
        IBank* inBank = Bank::open (options->getStr(STR_URI_INPUT));
        LOCAL (inBank);

        /** We create the output bank. */
        IBank* outBank = new BankFasta (options->getStr(STR_URI_INPUT) + "_filtered");
        LOCAL (outBank);

        /** We iterate the inBank. NOTE: WE USE A LAMBDA EXPRESSION HERE. */
        inBank->iterate ([&] (Sequence& s)
        {
            /** Shortcuts. */
            const char*  data = s.getDataBuffer();
            const size_t len  = s.getDataSize();

            // An empty sequence has no meaningful ratio; it is never kept.
            if (len == 0)  { return; }

            // We count the letters that are one of the four upper case nucleotides.
            size_t nbOK = 0;
            for (size_t i=0; i<len; i++)
            {
                if (data[i]=='A' || data[i]=='C' || data[i]=='G' || data[i]=='T')  { nbOK++; }
            }

            if (static_cast<double>(nbOK) / static_cast<double>(len) > percentThreshold)  {  outBank->insert (s);  }
        });

        /** We flush the output bank. */
        outBank->flush();
    }

    catch (OptionFailure& e)
    {
        return e.displayErrors (cout);
    }
    catch (Exception& e)
    {
        cerr << "EXCEPTION: " << e.getMessage() << endl;

        // A failure must be visible to the caller through the exit status.
        return 1;
    }

    return 0;
}
Пример #4
0
/**
 * BankStats: dumps, for each sequence of the input bank, its comment followed
 * by its data (one element at a time).
 *
 * @param argc number of command line arguments
 * @param argv command line arguments, parsed by the OptionsParser
 * @return the value of OptionFailure::displayErrors on bad options,
 *         1 when an Exception is caught, 0 otherwise
 */
int main (int argc, char* argv[])
{
    /** We create a command line parser. */
    OptionsParser parser ("BankStats");
    parser.push_back (new OptionOneParam (STR_URI_INPUT, "bank input",   true));

    try
    {
        /** We parse the user options. */
        IProperties* options = parser.parse (argc, argv);

        std::string filename = options->getStr(STR_URI_INPUT);

        //! [snippet16_bank]
        // We get an instance of IBank from the URI.
        IBank* bank = Bank::open (filename);

        //! [snippet16_seq]
        // We create an iterator on the bank
        Iterator<Sequence>* it = bank->iterator();

        // We iterate the sequences of the bank
        for (it->first(); !it->isDone(); it->next())
        {
            // We get a shortcut on the current sequence and its data
            Sequence& seq  = it->item();
            Data&     data = seq.getData();

            // We dump some information about the sequence.
            std::cout << "comment " << seq.getComment() << std::endl;

            // We dump each nucleotide. NOTE: the output depends on the data encoding
            for (size_t i=0; i<data.size(); i++)  {  std::cout << data[i];  }
            std::cout << std::endl;
        }

        //! [snippet16_seq]
        // The bank and the iterator have been allocated on the heap, so we have to delete them
        // (they are not released if an exception is thrown above; the program exits right after).
        delete it;
        delete bank;
        //! [snippet16_bank]
    }
    catch (OptionFailure& e)
    {
        return e.displayErrors (std::cout);
    }
    catch (Exception& e)
    {
        std::cerr << "EXCEPTION: " << e.getMessage() << std::endl;

        // A failure must be visible to the caller through the exit status.
        return 1;
    }

    return 0;
}
Пример #5
0
int main (int argc, char* argv[])
{
    if (argc < 2)
    {
        std::cerr << "you must provide a bank." << std::endl;
        return EXIT_FAILURE;
    }

    // We declare an input Bank and use it locally
    IBank* inputBank = Bank::open (argv[1]);
    LOCAL (inputBank);

    // We create an iterator over this bank.
    Iterator<Sequence>* it = inputBank->iterator();
    LOCAL (it);

    // We loop over sequences in a "push" fashion (a functor is called for each sequence)
    Functor fct;
    it->iterate (fct);
}
Пример #6
0
// START Application
/**
 * Dumps, for each sequence let through by a QualityFilter, its quality string
 * between brackets followed by the result of computeMeanPhredScore on it.
 *
 * @param argc number of command line arguments
 * @param argv command line arguments; argv[1] is a Fasta/FastQ file
 * @return EXIT_FAILURE when no file is provided or when an Exception is
 *         caught, EXIT_SUCCESS otherwise
 */
int main (int argc, char* argv[])
{
  // We check that the user provides at least one option: a Fasta/FastQ file.
  // Online GATB-Tutorial: this argument is automatically filled in with an
  // appropriate file.
  if (argc < 2)
  {
    std::cerr << "Please, provide a sequence file." << std::endl;
    return EXIT_FAILURE;
  }

  // We define a try/catch block in case some method fails (bad filename for instance)
  try
  {
    // We declare an input Bank and use it locally
    IBank* inputBank = Bank::open (argv[1]);
    LOCAL (inputBank);

    // We create an iterator over this bank using some filtering system
    FilterIterator<Sequence,QualityFilter> it (inputBank->iterator(), QualityFilter());

    // We loop over sequences.
    for (it.first(); !it.isDone(); it.next())
    {
      // Shortcut
      Sequence& seq = it.item();

      // We dump the sequence quality
      std::cout << "[" << seq.getQuality() << "] " << computeMeanPhredScore(seq.getQuality()) << std::endl;
    }
  }
  catch (Exception& e)
  {
    std::cerr << "EXCEPTION: " << e.getMessage() << std::endl;

    // A failure must be visible to the caller through the exit status.
    return EXIT_FAILURE;
  }

  return EXIT_SUCCESS;
}
Пример #7
0
/**
 * Iterates the sequences of the bank given as first argument through a
 * SubjectIterator, to which a ProgressFunctor is attached as observer.
 *
 * @param argc number of command line arguments
 * @param argv command line arguments; argv[1] is the bank URI
 * @return EXIT_FAILURE when no bank is provided or when an Exception is
 *         caught, EXIT_SUCCESS otherwise
 */
int main (int argc, char* argv[])
{
    if (argc < 2)
    {
        std::cerr << "you must provide a bank." << std::endl;
        return EXIT_FAILURE;
    }

    // We define a try/catch block in case some method fails
    try
    {
        // We declare an input Bank and use it locally
        IBank* inputBank = Bank::open (argv[1]);
        LOCAL (inputBank);

        // The SubjectIterator is parameterized by the kind of iterated items (Sequence) only;
        // the processing to be done on notification is provided separately, as an observer.
        // The second argument (10) is presumably the notification period -- TODO confirm.
        SubjectIterator<Sequence> iter (inputBank->iterator(), 10);

        //  We create some listener to be notified every 10 iterations and attach it to the iterator.
        iter.addObserver (new ProgressFunctor());

        // We loop over sequences.
        for (iter.first(); !iter.isDone(); iter.next())
        {
            // Note that we do nothing inside the sequence iterating loop about the progression management.

            // In other words, we don't "pollute" the code inside this loop by presentation concerns and
            // we can therefore focus on the job to be done on the iterated sequences.
        }
    }
    catch (Exception& e)
    {
        std::cerr << "EXCEPTION: " << e.getMessage() << std::endl;

        // A failure must be visible to the caller through the exit status.
        return EXIT_FAILURE;
    }

    return EXIT_SUCCESS;
}
Пример #8
0
int main (int argc, char* argv[])
{
    /** We create a command line parser. */
    OptionsParser parser ("BankSplitter");
    parser.push_back (new OptionOneParam (STR_URI_INPUT,      "bank reference",            true));
    parser.push_back (new OptionOneParam (STR_MAX_INPUT_SIZE, "average db size per split", true));
    parser.push_back (new OptionOneParam (STR_URI_OUTPUT_DIR, "output directory",          false, "."));
    parser.push_back (new OptionNoParam  (STR_OUTPUT_FASTQ,   "fastq output",              false));
    parser.push_back (new OptionNoParam  (STR_OUTPUT_GZ,      "gzip output",               false));

    // We define a try/catch block in case some method fails (bad filename for instance)
    try
    {
        /** We parse the user options. */
        IProperties* options = parser.parse (argc, argv);

        /** Shortcuts. */
        u_int64_t maxDbSize = options->getInt(STR_MAX_INPUT_SIZE);

        // We declare an input Bank
        IBank* inputBank = Bank::open (options->getStr(STR_URI_INPUT));
        LOCAL (inputBank);

        // We get the basename of the input bank.
        string inputBasename = System::file().getBaseName (options->getStr(STR_URI_INPUT));

        /** We set the name of the output directory. */
        stringstream ss;  ss << inputBasename << "_S" << maxDbSize;
        string outputDirName = ss.str();

        /** We create the output directory. */
        string outputDir = options->getStr(STR_URI_OUTPUT_DIR) + "/" + outputDirName;
        System::file().mkdir (outputDir, S_IRWXU);

        // We create the album bank.
        BankAlbum album (outputDir + "/album.txt");

        /** We get estimations about the bank. */
        u_int64_t number, totalSize, maxSize;
        inputBank->estimate (number, totalSize, maxSize);

        u_int64_t estimationNbSeqToIterate = number;

        // We create an iterator over the input bank
        ProgressIterator<Sequence> itSeq (*inputBank, "split");

        // We loop over sequences to get the exact number of sequences.
          int64_t nbBanksOutput = -1;
        u_int64_t nbSequences   =  0;
        u_int64_t dbSize        = ~0;

        bool isFastq   = options->get(STR_OUTPUT_FASTQ) != 0;
        bool isGzipped = options->get(STR_OUTPUT_GZ)    != 0;

        IBank* currentBank = 0;

        for (itSeq.first(); !itSeq.isDone(); itSeq.next())
        {
            if (dbSize > maxDbSize)
            {
                if (currentBank != 0)  { currentBank->flush();  currentBank->finalize(); }

                nbBanksOutput ++;

                /** We build the uri of the current bank. */
                stringstream ss;  ss << inputBasename << "_" << nbBanksOutput << (isFastq ? ".fastq" : ".fasta");
                if (isGzipped) { ss << ".gz"; }

                /** We create a new bank and put it in the album. */
                currentBank = album.addBank (outputDir, ss.str(), isFastq, isGzipped);

                /** We reinit the db size counter. */
                dbSize = 0;
            }

            dbSize += itSeq->getDataSize();

            /** We insert the sequence into the current output bank. */
            currentBank->insert (*itSeq);
        }

        if (currentBank != 0)  { currentBank->flush(); }
    }
    catch (OptionFailure& e)
    {
        return e.displayErrors (cout);
    }
    catch (Exception& e)
    {
        cerr << "EXCEPTION: " << e.getMessage() << endl;
    }
}