int main (int argc, char* argv[]) { /** We create a command line parser. */ OptionsParser parser ("BankFilter"); parser.push_back (new OptionOneParam (STR_URI_INPUT, "bank reference", true)); parser.push_back (new OptionOneParam (STR_URI_SEQ_IDS, "file holding indexes of bank", true)); try { /** We parse the user options. */ IProperties* options = parser.parse (argc, argv); /** We read the list of indexes. */ set<size_t> indexes; FILE* file = fopen (options->getStr(STR_URI_SEQ_IDS).c_str(), "r"); if (file != 0) { char buffer[128]; while (fgets (buffer, sizeof(buffer), file)) { indexes.insert (atoi(buffer)); } fclose (file); } cout << "found " << indexes.size() << " indexes" << endl; /** We open the output bank. */ string outputBankUri = options->getStr(STR_URI_INPUT) + "_" + System::file().getBaseName (options->getStr(STR_URI_SEQ_IDS)); IBank* outputBank = Bank::open (outputBankUri); LOCAL (outputBank); /** We loop the input bank. */ IBank* inputBank = Bank::open (options->getStr(STR_URI_INPUT)); LOCAL (inputBank); /** We use another iterator for filtering out some sequences. */ FilterIterator<Sequence,FilterFunctor> itSeq (inputBank->iterator(), FilterFunctor(indexes)); /** We loop the sequences. */ for (itSeq.first(); !itSeq.isDone(); itSeq.next()) { outputBank->insert (itSeq.item()); } /** We flush the output bank. */ outputBank->flush(); } catch (OptionFailure& e) { return e.displayErrors (cout); } catch (Exception& e) { cerr << "EXCEPTION: " << e.getMessage() << endl; } }
int main (int argc, char* argv[]) { /** We create a command line parser. */ OptionsParser parser ("BankFilter"); parser.push_back (new OptionOneParam (STR_URI_INPUT, "bank input", true)); parser.push_back (new OptionOneParam (STR_FILTER_RATIO, "skip a sequence if 'good letters number / seq.len > X'", false, "0.8")); try { /** We parse the user options. */ IProperties* options = parser.parse (argc, argv); /** Shortcuts. */ double percentThreshold = options->getDouble(STR_FILTER_RATIO); /** We open the input bank. */ IBank* inBank = Bank::open (options->getStr(STR_URI_INPUT)); LOCAL (inBank); /** We create the output inBank. */ IBank* outBank = new BankFasta (options->getStr(STR_URI_INPUT) + "_filtered"); LOCAL (outBank); /** We iterate the inBank. NOTE: WE USE A LAMBDA EXPRESSION HERE. */ inBank->iterate ([&] (Sequence& s) { /** Shortcut. */ char* data = s.getDataBuffer(); size_t nbOK = 0; for (size_t i=0; i<s.getDataSize(); i++) { if (data[i]=='A' || data[i]=='C' || data[i]=='G' || data[i]=='T') { nbOK++; } } if ((double)nbOK / (double)s.getDataSize() > percentThreshold) { outBank->insert (s); } }); /** We flush the output bank. */ outBank->flush(); } catch (OptionFailure& e) { return e.displayErrors (cout); } catch (Exception& e) { cerr << "EXCEPTION: " << e.getMessage() << endl; } }
int main (int argc, char* argv[]) { /** We create a command line parser. */ OptionsParser parser ("BankSplitter"); parser.push_back (new OptionOneParam (STR_URI_INPUT, "bank reference", true)); parser.push_back (new OptionOneParam (STR_MAX_INPUT_SIZE, "average db size per split", true)); parser.push_back (new OptionOneParam (STR_URI_OUTPUT_DIR, "output directory", false, ".")); parser.push_back (new OptionNoParam (STR_OUTPUT_FASTQ, "fastq output", false)); parser.push_back (new OptionNoParam (STR_OUTPUT_GZ, "gzip output", false)); // We define a try/catch block in case some method fails (bad filename for instance) try { /** We parse the user options. */ IProperties* options = parser.parse (argc, argv); /** Shortcuts. */ u_int64_t maxDbSize = options->getInt(STR_MAX_INPUT_SIZE); // We declare an input Bank IBank* inputBank = Bank::open (options->getStr(STR_URI_INPUT)); LOCAL (inputBank); // We get the basename of the input bank. string inputBasename = System::file().getBaseName (options->getStr(STR_URI_INPUT)); /** We set the name of the output directory. */ stringstream ss; ss << inputBasename << "_S" << maxDbSize; string outputDirName = ss.str(); /** We create the output directory. */ string outputDir = options->getStr(STR_URI_OUTPUT_DIR) + "/" + outputDirName; System::file().mkdir (outputDir, S_IRWXU); // We create the album bank. BankAlbum album (outputDir + "/album.txt"); /** We get estimations about the bank. */ u_int64_t number, totalSize, maxSize; inputBank->estimate (number, totalSize, maxSize); u_int64_t estimationNbSeqToIterate = number; // We create an iterator over the input bank ProgressIterator<Sequence> itSeq (*inputBank, "split"); // We loop over sequences to get the exact number of sequences. int64_t nbBanksOutput = -1; u_int64_t nbSequences = 0; u_int64_t dbSize = ~0; bool isFastq = options->get(STR_OUTPUT_FASTQ) != 0; bool isGzipped = options->get(STR_OUTPUT_GZ) != 0; IBank* currentBank = 0; for (itSeq.first(); !itSeq.isDone(); itSeq.next()) { if (dbSize > maxDbSize) { if (currentBank != 0) { currentBank->flush(); currentBank->finalize(); } nbBanksOutput ++; /** We build the uri of the current bank. */ stringstream ss; ss << inputBasename << "_" << nbBanksOutput << (isFastq ? ".fastq" : ".fasta"); if (isGzipped) { ss << ".gz"; } /** We create a new bank and put it in the album. */ currentBank = album.addBank (outputDir, ss.str(), isFastq, isGzipped); /** We reinit the db size counter. */ dbSize = 0; } dbSize += itSeq->getDataSize(); /** We insert the sequence into the current output bank. */ currentBank->insert (*itSeq); } if (currentBank != 0) { currentBank->flush(); } } catch (OptionFailure& e) { return e.displayErrors (cout); } catch (Exception& e) { cerr << "EXCEPTION: " << e.getMessage() << endl; } }