static void driver( const string& infile, EList<string>& infiles, const string& snpfile, const string& htfile, const string& ssfile, const string& exonfile, const string& svfile, const string& outfile, bool packed, int reverse) { initializeCntLut(); initializeCntBit(); EList<FileBuf*> is(MISC_CAT); bool bisulfite = false; RefReadInParams refparams(false, reverse, nsToAs, bisulfite); assert_gt(infiles.size(), 0); if(format == CMDLINE) { // Adapt sequence strings to stringstreams open for input stringstream *ss = new stringstream(); for(size_t i = 0; i < infiles.size(); i++) { (*ss) << ">" << i << endl << infiles[i].c_str() << endl; } FileBuf *fb = new FileBuf(ss); assert(fb != NULL); assert(!fb->eof()); assert(fb->get() == '>'); ASSERT_ONLY(fb->reset()); assert(!fb->eof()); is.push_back(fb); } else { // Adapt sequence files to ifstreams for(size_t i = 0; i < infiles.size(); i++) { FILE *f = fopen(infiles[i].c_str(), "r"); if (f == NULL) { cerr << "Error: could not open "<< infiles[i].c_str() << endl; throw 1; } FileBuf *fb = new FileBuf(f); assert(fb != NULL); if(fb->peek() == -1 || fb->eof()) { cerr << "Warning: Empty fasta file: '" << infile.c_str() << "'" << endl; continue; } assert(!fb->eof()); assert(fb->get() == '>'); ASSERT_ONLY(fb->reset()); assert(!fb->eof()); is.push_back(fb); } } if(is.empty()) { cerr << "Warning: All fasta inputs were empty" << endl; throw 1; } filesWritten.push_back(outfile + ".1." + gfm_ext); filesWritten.push_back(outfile + ".2." + gfm_ext); // Vector for the ordered list of "records" comprising the input // sequences. A record represents a stretch of unambiguous // characters in one of the input sequences. EList<RefRecord> szs(MISC_CAT); std::pair<size_t, size_t> sztot; { if(verbose) cerr << "Reading reference sizes" << endl; Timer _t(cerr, " Time reading reference sizes: ", verbose); if(!reverse && (writeRef || justRef)) { filesWritten.push_back(outfile + ".3." + gfm_ext); filesWritten.push_back(outfile + ".4." + gfm_ext); sztot = BitPairReference::szsFromFasta(is, outfile, bigEndian, refparams, szs, sanityCheck); } else { sztot = BitPairReference::szsFromFasta(is, string(), bigEndian, refparams, szs, sanityCheck); } } if(justRef) return; assert_gt(sztot.first, 0); assert_gt(sztot.second, 0); assert_gt(szs.size(), 0); // Construct index from input strings and parameters filesWritten.push_back(outfile + ".5." + gfm_ext); filesWritten.push_back(outfile + ".6." + gfm_ext); filesWritten.push_back(outfile + ".7." + gfm_ext); filesWritten.push_back(outfile + ".8." + gfm_ext); TStr s; HGFM<TIndexOffU> hGFM( s, packed, 1, // TODO: maybe not? lineRate, offRate, // suffix-array sampling rate ftabChars, // number of chars in initial arrow-pair calc localOffRate, localFtabChars, nthreads, snpfile, htfile, ssfile, exonfile, svfile, outfile, // basename for .?.ht2 files reverse == 0, // fw !entireSA, // useBlockwise bmax, // block size for blockwise SA builder bmaxMultSqrt, // block size as multiplier of sqrt(len) bmaxDivN, // block size as divisor of len noDc? 0 : dcv,// difference-cover period is, // list of input streams szs, // list of reference sizes (TIndexOffU)sztot.first, // total size of all unambiguous ref chars refparams, // reference read-in parameters seed, // pseudo-random number generator seed -1, // override offRate verbose, // be talkative autoMem, // pass exceptions up to the toplevel so that we can adjust memory settings automatically sanityCheck); // verify results and internal consistency // Note that the Ebwt is *not* resident in memory at this time. To // load it into memory, call ebwt.loadIntoMemory() if(verbose) { // Print Ebwt's vital stats hGFM.gh().print(cerr); } if(sanityCheck) { // Try restoring the original string (if there were // multiple texts, what we'll get back is the joined, // padded string, not a list) hGFM.loadIntoMemory( reverse ? (refparams.reverse == REF_READ_REVERSE) : 0, true, // load SA sample? true, // load ftab? true, // load rstarts? false, false); SString<char> s2; hGFM.restore(s2); hGFM.evictFromMemory(); { SString<char> joinedss = GFM<>::join<SString<char> >( is, // list of input streams szs, // list of reference sizes (TIndexOffU)sztot.first, // total size of all unambiguous ref chars refparams, // reference read-in parameters seed); // pseudo-random number generator seed if(refparams.reverse == REF_READ_REVERSE) { joinedss.reverse(); } assert_eq(joinedss.length(), s2.length()); assert(sstr_eq(joinedss, s2)); } if(verbose) { if(s2.length() < 1000) { cout << "Passed restore check: " << s2.toZBuf() << endl; } else { cout << "Passed restore check: (" << s2.length() << " chars)" << endl; } } } }
static void driver(const string& infile, vector<string>& infiles, const string& outfile, bool reverse = false) { vector<FileBuf*> is; bool bisulfite = false; RefReadInParams refparams(color, reverse ? reverseType : REF_READ_FORWARD, nsToAs, bisulfite); assert_gt(infiles.size(), 0); if(format == CMDLINE) { // Adapt sequence strings to stringstreams open for input stringstream *ss = new stringstream(); for(size_t i = 0; i < infiles.size(); i++) { (*ss) << ">" << i << endl << infiles[i] << endl; } FileBuf *fb = new FileBuf(ss); assert(fb != NULL); assert(!fb->eof()); assert(fb->get() == '>'); ASSERT_ONLY(fb->reset()); assert(!fb->eof()); is.push_back(fb); } else { // Adapt sequence files to ifstreams for(size_t i = 0; i < infiles.size(); i++) { FILE *f = fopen(infiles[i].c_str(), "rb"); if (f == NULL) { cerr << "Error: could not open "<< infiles[i] << endl; throw 1; } FileBuf *fb = new FileBuf(f); assert(fb != NULL); assert(!fb->eof()); assert(fb->get() == '>'); ASSERT_ONLY(fb->reset()); assert(!fb->eof()); is.push_back(fb); } } // Vector for the ordered list of "records" comprising the input // sequences. A record represents a stretch of unambiguous // characters in one of the input sequences. vector<RefRecord> szs; vector<uint32_t> plens; std::pair<size_t, size_t> sztot; { if(verbose) cout << "Reading reference sizes" << endl; Timer _t(cout, " Time reading reference sizes: ", verbose); if(!reverse && (writeRef || justRef)) { // For forward reference, dump it to .3.ebwt and .4.ebwt // files string file3 = outfile + ".3." + gEbwt_ext; string file4 = outfile + ".4." + gEbwt_ext; // Open output stream for the '.3.ebwt' file which will // hold the size records. ofstream fout3(file3.c_str(), ios::binary); if(!fout3.good()) { cerr << "Could not open index file for writing: \"" << file3 << "\"" << endl << "Please make sure the directory exists and that permissions allow writing by" << endl << "Bowtie." << endl; throw 1; } BitpairOutFileBuf bpout(file4.c_str()); // Read in the sizes of all the unambiguous stretches of // the genome into a vector of RefRecords. The input // streams are reset once it's done. writeU<int32_t>(fout3, 1, bigEndian); // endianness sentinel if(color) { refparams.color = false; // Make sure the .3.ebwt and .4.ebwt files contain // nucleotides; not colors TIndexOff numSeqs = 0; fastaRefReadSizes(is, szs, plens, refparams, &bpout, numSeqs); refparams.color = true; writeU<TIndexOffU>(fout3, (TIndexOffU)szs.size(), bigEndian); // write # records for(size_t i = 0; i < szs.size(); i++) { szs[i].write(fout3, bigEndian); } szs.clear(); plens.clear(); // Now read in the colorspace size records; these are // the ones that were indexed TIndexOff numSeqs2 = 0; sztot = fastaRefReadSizes(is, szs, plens, refparams, NULL, numSeqs2); assert_geq(numSeqs, numSeqs2); } else { TIndexOff numSeqs = 0; sztot = fastaRefReadSizes(is, szs, plens, refparams, &bpout, numSeqs); writeU<TIndexOffU>(fout3, (TIndexOffU)szs.size(), bigEndian); // write # records for(size_t i = 0; i < szs.size(); i++) szs[i].write(fout3, bigEndian); } if(sztot.first == 0) { cerr << "Error: No unambiguous stretches of characters in the input. Aborting..." << endl; throw 1; } assert_gt(sztot.first, 0); assert_gt(sztot.second, 0); bpout.close(); fout3.close(); #ifndef NDEBUG if(sanityCheck) { BitPairReference bpr( outfile, // ebwt basename color, // expect color? true, // sanity check? &infiles,// files to check against NULL, // sequences to check against format == CMDLINE, // whether infiles contains strings true, // load sequence? false, // use memory-mapped files false, // use shared memory false, // sweep through memory-mapped memory false, // be talkative false); // be talkative } #endif } else { // Read in the sizes of all the unambiguous stretches of the // genome into a vector of RefRecords TIndexOff numSeqs = 0; sztot = fastaRefReadSizes(is, szs, plens, refparams, NULL, numSeqs); #ifndef NDEBUG if(refparams.color) { refparams.color = false; vector<RefRecord> szs2; vector<uint32_t> plens2; TIndexOff numSeqs2 = 0; fastaRefReadSizes(is, szs2, plens2, refparams, NULL, numSeqs2); assert_leq(numSeqs, numSeqs2); // One less color than base refparams.color = true; } #endif } } if(justRef) return; assert_gt(sztot.first, 0); assert_gt(sztot.second, 0); assert_gt(szs.size(), 0); // Construct Ebwt from input strings and parameters Ebwt<TStr> ebwt(refparams.color ? 1 : 0, lineRate, linesPerSide, offRate, // suffix-array sampling rate -1, // ISA sampling rate ftabChars, // number of chars in initial arrow-pair calc nthreads, outfile, // basename for .?.ebwt files !reverse, // fw !entireSA, // useBlockwise bmax, // block size for blockwise SA builder bmaxMultSqrt, // block size as multiplier of sqrt(len) bmaxDivN, // block size as divisor of len noDc? 0 : dcv,// difference-cover period is, // list of input streams szs, // list of reference sizes plens, // list of not-all-gap reference sequence lengths (TIndexOffU)sztot.first, // total size of all unambiguous ref chars refparams, // reference read-in parameters seed, // pseudo-random number generator seed -1, // override offRate -1, // override isaRate verbose, // be talkative autoMem, // pass exceptions up to the toplevel so that we can adjust memory settings automatically sanityCheck); // verify results and internal consistency // Note that the Ebwt is *not* resident in memory at this time. To // load it into memory, call ebwt.loadIntoMemory() if(verbose) { // Print Ebwt's vital stats ebwt.eh().print(cout); } if(sanityCheck) { // Try restoring the original string (if there were // multiple texts, what we'll get back is the joined, // padded string, not a list) ebwt.loadIntoMemory( refparams.color ? 1 : 0, -1, false, false); TStr s2; ebwt.restore(s2); ebwt.evictFromMemory(); { TStr joinedss = Ebwt<TStr>::join( is, // list of input streams szs, // list of reference sizes (TIndexOffU)sztot.first, // total size of all unambiguous ref chars refparams, // reference read-in parameters seed); // pseudo-random number generator seed if(refparams.reverse == REF_READ_REVERSE) { reverseInPlace(joinedss); } assert_eq(length(joinedss), length(s2)); assert_eq(joinedss, s2); } if(verbose) { if(length(s2) < 1000) { cout << "Passed restore check: " << s2 << endl; } else { cout << "Passed restore check: (" << length(s2) << " chars)" << endl; } } } }