/** * Bowtie main function. It is placed in a separate source file to * make it slightly easier to compile Bowtie as a library. * * If the user specifies -A <file> as the first two arguments, main * will interpret that file as having one set of command-line arguments * per line, and will dispatch each batch of arguments one at a time to * bowtie. */ int main(int argc, const char **argv) { if(argc > 2 && strcmp(argv[1], "-A") == 0) { const char *file = argv[2]; ifstream in; in.open(file); char buf[4096]; int lastret = -1; while(in.getline(buf, 4095)) { EList<string> args; args.push_back(string(argv[0])); tokenize(buf, " \t", args); const char **myargs = (const char**)malloc(sizeof(char*)*args.size()); for(size_t i = 0; i < args.size(); i++) { myargs[i] = args[i].c_str(); } if(args.size() == 1) continue; lastret = hisat2((int)args.size(), myargs); free(myargs); } if(lastret == -1) { cerr << "Warning: No arg strings parsed from " << file << endl; return 0; } return lastret; } else { return hisat2(argc, argv); } }
/** * Calculate a vector containing the sizes of all of the patterns in * all of the given input files, in order. Returns the total size of * all references combined. Rewinds each istream before returning. */ std::pair<size_t, size_t> fastaRefReadSizes( EList<FileBuf*>& in, EList<RefRecord>& recs, const RefReadInParams& rparms, BitpairOutFileBuf* bpout, int& numSeqs) { uint32_t unambigTot = 0; uint32_t bothTot = 0; RefReadInParams rpcp = rparms; assert_gt(in.size(), 0); // For each input istream for(size_t i = 0; i < in.size(); i++) { bool first = true; assert(!in[i]->eof()); // For each pattern in this istream while(!in[i]->eof()) { RefRecord rec = fastaRefReadSize(*in[i], rparms, first, bpout); if((unambigTot + rec.len) < unambigTot) { cerr << "Error: Reference sequence has more than 2^32-1 characters! Please divide the" << endl << "reference into batches or chunks of about 3.6 billion characters or less each" << endl << "and index each independently." << endl; throw 1; } // Add the length of this record. if(rec.first) numSeqs++; unambigTot += rec.len; bothTot += rec.len; bothTot += rec.off; first = false; if(rec.len == 0 && rec.off == 0 && !rec.first) continue; recs.push_back(rec); } // Reset the input stream in[i]->reset(); assert(!in[i]->eof()); #ifndef NDEBUG // Check that it's really reset int c = in[i]->get(); assert_eq('>', c); in[i]->reset(); assert(!in[i]->eof()); #endif } assert_geq(bothTot, 0); assert_geq(unambigTot, 0); return make_pair( unambigTot, // total number of unambiguous DNA characters read bothTot); // total number of DNA characters read, incl. ambiguous ones }
/** * Given a list of edits and a DNA string representing the query * sequence, check that the edits are consistent with respect to the * query. */ bool Edit::repOk( const EList<Edit>& edits, const BTDnaString& s, bool fw, size_t trimBeg, size_t trimEnd) { if(!fw) { invertPoss(const_cast<EList<Edit>&>(edits), s.length()-trimBeg-trimEnd, false); swap(trimBeg, trimEnd); } for(size_t i = 0; i < edits.size(); i++) { const Edit& e = edits[i]; size_t pos = e.pos; if(i > 0) { assert_geq(pos, edits[i-1].pos); } bool del = false, mm = false; while(i < edits.size() && edits[i].pos == pos) { const Edit& ee = edits[i]; assert_lt(ee.pos, s.length()); if(ee.type != EDIT_TYPE_SPL) { if(ee.qchr != '-') { assert(ee.isRefGap() || ee.isMismatch()); assert_eq((int)ee.qchr, s.toChar(ee.pos+trimBeg)); } } if(ee.isMismatch()) { assert(!mm); mm = true; assert(!del); } else if(ee.isReadGap()) { assert(!mm); } else if(ee.isRefGap()) { assert(!mm); assert(!del); del = true; } else if(ee.isSpliced()) { } i++; } } if(!fw) { invertPoss(const_cast<EList<Edit>&>(edits), s.length()-trimBeg-trimEnd, false); } return true; }
/**
 * Build a BitPairReference over the reference portion of the index
 * with the given basename, then hand every reference sequence to
 * print_ref_sequence() together with its name, its index and its
 * length (plen[i], plus one when 'color' is set).
 */
static void print_ref_sequences(
	ostream& fout,
	bool color,
	const EList<string>& refnames,
	const uint32_t* plen,
	const string& adjustedEbwtFileBase)
{
	BitPairReference ref(
		adjustedEbwtFileBase, // input basename
		color,                // true -> expect colorspace reference
		false,                // sanity-check reference
		NULL,                 // infiles
		NULL,                 // originals
		false,                // infiles are sequences
		false,                // memory-map
		false,                // use shared memory
		false,                // sweep mm-mapped ref
		verbose,              // be talkative
		verbose);             // be talkative at startup
	assert_eq(ref.numRefs(), refnames.size());
	size_t refIdx = 0;
	while(refIdx < ref.numRefs()) {
		print_ref_sequence(
			fout,
			ref,
			refnames[refIdx],
			refIdx,
			plen[refIdx] + (color ? 1 : 0));
		refIdx++;
	}
}
/**
 * Restore the joined reference text from the index and write it back
 * out as one FASTA record per reference via print_fasta_record().
 *
 * Every joined offset is translated with joinedToTextOff(); offsets
 * that do not resolve to a reference (tidx stays 0xffffffff) or fall at
 * or beyond the reference length are skipped.  Jumps in the text offset
 * and any shortfall at the end of a reference are filled with 'N'.
 */
static void print_index_sequences(ostream& fout, Ebwt& ebwt)
{
	const uint32_t kUnset = 0xffffffff;

	EList<string>* refnames = &(ebwt.refnames());
	TStr cat_ref;
	ebwt.restore(cat_ref);

	uint32_t activeRef = kUnset; // reference currently being assembled
	uint32_t activeLen = kUnset; // its length as reported by joinedToTextOff
	uint32_t prevOff   = 0;      // text offset of the previous character
	bool     atStart   = true;   // no character seen yet for activeRef
	string   seq;

	const size_t joinedLen = cat_ref.length();
	for(size_t i = 0; i < joinedLen; i++) {
		uint32_t tidx    = kUnset;
		uint32_t textoff = kUnset;
		uint32_t tlen    = kUnset;
		bool straddled = false;
		ebwt.joinedToTextOff(1 /* qlen */, (uint32_t)i, tidx, textoff, tlen, true, straddled);

		if(tidx == kUnset || textoff >= tlen) {
			continue;
		}

		if(activeRef != tidx) {
			// Switching references: flush the one assembled so far
			if(activeRef != kUnset) {
				// Add trailing gaps, if any exist
				if(seq.length() < activeLen) {
					seq += string(activeLen - seq.length(), 'N');
				}
				print_fasta_record(fout, (*refnames)[activeRef], seq);
			}
			activeRef = tidx;
			seq.clear();
			activeLen = tlen;
			prevOff = 0;
			atStart = true;
		}

		// A reference that starts with a gap needs one extra 'N',
		// because prevOff starts out at 0 rather than "before 0".
		uint32_t adjOff = textoff;
		if(atStart && textoff > 0) adjOff++;
		if(adjOff - prevOff > 1) {
			seq += string(adjOff - prevOff - 1, 'N');
		}

		seq.push_back(cat_ref[i]);
		prevOff = textoff;
		atStart = false;
	}

	// Flush the final reference
	if(activeRef < refnames->size()) {
		// Add trailing gaps, if any exist
		if(seq.length() < activeLen) {
			seq += string(activeLen - seq.length(), 'N');
		}
		print_fasta_record(fout, (*refnames)[activeRef], seq);
	}
}
/**
 * Clip off some of the high-numbered positions: drop trailing edits
 * whose position lies beyond len - amt, or exactly at len - amt unless
 * the edit is a read gap.  Scanning starts at the back of the list and
 * stops at the first edit that is kept.
 */
void Edit::clipHi(EList<Edit>& ed, size_t len, size_t amt) {
	assert_leq(amt, len);
	const size_t cutoff = len - amt;
	size_t keep = ed.size(); // number of leading edits that survive
	while(keep > 0) {
		const Edit& last = ed[keep - 1];
		assert_lt(last.pos, len);
		const bool clipped =
			last.pos > cutoff ||
			(last.pos == cutoff && !last.isReadGap());
		if(!clipped) {
			break;
		}
		keep--;
	}
	ed.resize(keep);
}
/**
 * Remove every file recorded in 'filesWritten', announcing each one on
 * stderr.  Used when index building had to be aborted because of an
 * error.  The parameters are currently not consulted.
 */
static void deleteIdxFiles(
	const string& outfile,
	bool doRef,
	bool justRef)
{
	size_t idx = 0;
	while(idx < filesWritten.size()) {
		const char *path = filesWritten[idx].c_str();
		cerr << "Deleting \"" << path
		     << "\" file written during aborted indexing attempt." << endl;
		remove(path);
		idx++;
	}
}
/**
 * Merge second argument into the first.  Assume both are sorted by
 * position to begin with.
 *
 * At equal positions a read gap from 'src' is placed in front of the
 * 'dst' edit; otherwise the 'dst' edit stays in front.  Whatever is left
 * of 'src' once 'dst' has been walked is appended at the end.
 */
void Edit::merge(EList<Edit>& dst, const EList<Edit>& src) {
	size_t di = 0, si = 0;
	// Stop as soon as either list is exhausted; reading src[si] past the
	// end of 'src' would be out of bounds.
	while(di < dst.size() && si < src.size()) {
		if(src[si].pos < dst[di].pos) {
			dst.insert(src[si], di);
			si++; di++;
		} else if(src[si].pos == dst[di].pos) {
			// There can be two inserts at a given position, but we
			// can't merge them because there's no way to know their
			// order
			assert(src[si].isReadGap() != dst[di].isReadGap());
			if(src[si].isReadGap()) {
				dst.insert(src[si], di);
				si++; di++;
			} else {
				// Keep the dst edit in front; always advance so the
				// loop makes progress even if the assertion above is
				// compiled out.
				di++;
			}
		} else {
			// src edit belongs further right; move on in dst
			di++;
		}
	}
	while(si < src.size()) dst.push_back(src[si++]);
}
/**
 * Check that this Ebwt, when restored via restore(), matches up with
 * the given array of reference sequences.  For sanity checking; every
 * check is an assertion, so nothing is verified when assertions are
 * compiled out.
 *
 * @param os     original sequences; the code treats the value 4 as an
 *               N/gap character that has no counterpart in the
 *               restored text
 * @param color  if true, compare dinuc2color[cur][prev] of adjacent
 *               original characters against the restored text instead
 *               of the characters themselves
 * @param mirror not supported yet: the function returns immediately
 *               when it is set, so the mirror branches below are
 *               currently unreachable
 */
void Ebwt::checkOrigs(
	const EList<SString<char> >& os,
	bool color,
	bool mirror) const
{
	SString<char> rest;
	restore(rest);
	uint32_t restOff = 0; // cursor into the restored text
	size_t i = 0, j = 0;  // i: current original sequence, j: offset within it
	if(mirror) {
		// TODO: FIXME
		return;
	}
	while(i < os.size()) {
		size_t olen = os[i].length();
		int lastorig = -1; // previous original char; -1 right after a gap/start
		for(; j < olen; j++) {
			size_t joff = j;
			if(mirror) joff = olen - j - 1;
			if((int)os[i][joff] == 4) {
				// Skip over Ns
				lastorig = -1;
				if(!mirror) {
					while(j < olen && (int)os[i][j] == 4) j++;
				} else {
					while(j < olen && (int)os[i][olen-j-1] == 4) j++;
				}
				// Step back one so the for-loop increment lands on the
				// first character after the run
				j--;
				continue;
			}
			if(lastorig == -1 && color) {
				// In color mode the first character of a stretch only
				// seeds 'lastorig'; nothing is compared for it
				lastorig = os[i][joff];
				continue;
			}
			if(color) {
				assert_neq(-1, lastorig);
				assert_eq(dinuc2color[(int)os[i][joff]][lastorig], rest[restOff]);
			} else {
				assert_eq(os[i][joff], rest[restOff]);
			}
			lastorig = (int)os[i][joff];
			restOff++;
		}
		if(j == os[i].length()) {
			// Moved to next sequence
			i++;
			j = 0;
		} else {
			// Just jumped over a gap
		}
	}
}
/**
 * Clip off some of the low-numbered positions: edits located before
 * 'amt' are counted for removal, all other edits have 'amt' subtracted
 * from their position.  Finally that many edits are erased from the
 * front of the list.
 */
void Edit::clipLo(EList<Edit>& ed, size_t len, size_t amt) {
	size_t dropped = 0;
	for(size_t k = 0; k < ed.size(); k++) {
		assert_lt(ed[k].pos, len);
		if(ed[k].pos >= amt) {
			// Re-express the position relative to the clipped start
			ed[k].pos -= (uint32_t)amt;
		} else {
			dropped++;
		}
	}
	ed.erase(0, dropped);
}
/** * Given the values for all of the various arguments used to specify * the read and quality input, create a list of pattern sources to * dispense them. */ PairedPatternSource* PairedPatternSource::setupPatternSources( const EList<string>& si, // singles, from argv const EList<string>& m1, // mate1's, from -1 arg const EList<string>& m2, // mate2's, from -2 arg const EList<string>& m12, // both mates on each line, from --12 arg #ifdef USE_SRA const EList<string>& sra_accs, #endif const EList<string>& q, // qualities associated with singles const EList<string>& q1, // qualities associated with m1 const EList<string>& q2, // qualities associated with m2 const PatternParams& p, // read-in parameters size_t nthreads, bool verbose) // be talkative? { EList<PatternSource*>* a = new EList<PatternSource*>(); EList<PatternSource*>* b = new EList<PatternSource*>(); EList<PatternSource*>* ab = new EList<PatternSource*>(); // Create list of pattern sources for paired reads appearing // interleaved in a single file for(size_t i = 0; i < m12.size(); i++) { const EList<string>* qs = &m12; EList<string> tmp; if(p.fileParallel) { // Feed query files one to each PatternSource qs = &tmp; tmp.push_back(m12[i]); assert_eq(1, tmp.size()); } ab->push_back(PatternSource::patsrcFromStrings(p, *qs, nthreads)); if(!p.fileParallel) { break; } } #ifdef USE_SRA for(size_t i = 0; i < sra_accs.size(); i++) { const EList<string>* qs = &sra_accs; EList<string> tmp; if(p.fileParallel) { // Feed query files one to each PatternSource qs = &tmp; tmp.push_back(sra_accs[i]); assert_eq(1, tmp.size()); } ab->push_back(PatternSource::patsrcFromStrings(p, *qs, nthreads)); if(!p.fileParallel) { break; } } #endif // Create list of pattern sources for paired reads for(size_t i = 0; i < m1.size(); i++) { const EList<string>* qs = &m1; EList<string> tmpSeq; EList<string> tmpQual; if(p.fileParallel) { // Feed query files one to each PatternSource qs = &tmpSeq; tmpSeq.push_back(m1[i]); assert_eq(1, 
tmpSeq.size()); } a->push_back(PatternSource::patsrcFromStrings(p, *qs, nthreads)); if(!p.fileParallel) { break; } } // Create list of pattern sources for paired reads for(size_t i = 0; i < m2.size(); i++) { const EList<string>* qs = &m2; EList<string> tmpSeq; EList<string> tmpQual; if(p.fileParallel) { // Feed query files one to each PatternSource qs = &tmpSeq; tmpSeq.push_back(m2[i]); assert_eq(1, tmpSeq.size()); } b->push_back(PatternSource::patsrcFromStrings(p, *qs, nthreads)); if(!p.fileParallel) { break; } } // All mates/mate files must be paired assert_eq(a->size(), b->size()); // Create list of pattern sources for the unpaired reads for(size_t i = 0; i < si.size(); i++) { const EList<string>* qs = &si; PatternSource* patsrc = NULL; EList<string> tmpSeq; EList<string> tmpQual; if(p.fileParallel) { // Feed query files one to each PatternSource qs = &tmpSeq; tmpSeq.push_back(si[i]); assert_eq(1, tmpSeq.size()); } patsrc = PatternSource::patsrcFromStrings(p, *qs, nthreads); assert(patsrc != NULL); a->push_back(patsrc); b->push_back(NULL); if(!p.fileParallel) { break; } } PairedPatternSource *patsrc = NULL; #ifdef USE_SRA if(m12.size() > 0 || sra_accs.size() > 0) { #else if(m12.size() > 0) { #endif patsrc = new PairedSoloPatternSource(ab, p); for(size_t i = 0; i < a->size(); i++) delete (*a)[i]; for(size_t i = 0; i < b->size(); i++) delete (*b)[i]; delete a; delete b; } else { patsrc = new PairedDualPatternSource(a, b, p); for(size_t i = 0; i < ab->size(); i++) delete (*ab)[i]; delete ab; } return patsrc; } VectorPatternSource::VectorPatternSource( const EList<string>& v, const PatternParams& p) : PatternSource(p), cur_(p.skip), skip_(p.skip), paired_(false), v_(), quals_() { for(size_t i = 0; i < v.size(); i++) { EList<string> ss; tokenize(v[i], ":", ss, 2); assert_gt(ss.size(), 0); assert_leq(ss.size(), 2); // Initialize s string s = ss[0]; int mytrim5 = gTrim5; if(gColor && s.length() > 1) { // This may be a primer character. 
If so, keep it in the // 'primer' field of the read buf and parse the rest of the // read without it. int c = toupper(s[0]); if(asc2dnacat[c] > 0) { // First char is a DNA char int c2 = toupper(s[1]); // Second char is a color char if(asc2colcat[c2] > 0) { mytrim5 += 2; // trim primer and first color } } } if(gColor) { // Convert '0'-'3' to 'A'-'T' for(size_t i = 0; i < s.length(); i++) { if(s[i] >= '0' && s[i] <= '4') { s[i] = "ACGTN"[(int)s[i] - '0']; } if(s[i] == '.') s[i] = 'N'; } } if(s.length() <= (size_t)(gTrim3 + mytrim5)) { // Entire read is trimmed away s.clear(); } else { // Trim on 5' (high-quality) end if(mytrim5 > 0) { s.erase(0, mytrim5); } // Trim on 3' (low-quality) end if(gTrim3 > 0) { s.erase(s.length()-gTrim3); } } // Initialize vq string vq; if(ss.size() == 2) { vq = ss[1]; } // Trim qualities if(vq.length() > (size_t)(gTrim3 + mytrim5)) { // Trim on 5' (high-quality) end if(mytrim5 > 0) { vq.erase(0, mytrim5); } // Trim on 3' (low-quality) end if(gTrim3 > 0) { vq.erase(vq.length()-gTrim3); } } // Pad quals with Is if necessary; this shouldn't happen while(vq.length() < s.length()) { vq.push_back('I'); } // Truncate quals to match length of read if necessary; // this shouldn't happen if(vq.length() > s.length()) { vq.erase(s.length()); } assert_eq(vq.length(), s.length()); v_.expand(); v_.back().installChars(s); quals_.push_back(BTString(vq)); trimmed3_.push_back(gTrim3); trimmed5_.push_back(mytrim5); ostringstream os; os << (names_.size()); names_.push_back(BTString(os.str())); } assert_eq(v_.size(), quals_.size()); } bool VectorPatternSource::nextReadImpl( Read& r, TReadId& rdid, TReadId& endid, bool& success, bool& done) { // Let Strings begin at the beginning of the respective bufs r.reset(); lock(); if(cur_ >= v_.size()) { unlock(); // Clear all the Strings, as a signal to the caller that // we're out of reads r.reset(); success = false; done = true; assert(r.empty()); return false; } // Copy v_*, quals_* strings into the respective 
Strings r.color = gColor; r.patFw = v_[cur_]; r.qual = quals_[cur_]; r.trimmed3 = trimmed3_[cur_]; r.trimmed5 = trimmed5_[cur_]; ostringstream os; os << cur_; r.name = os.str(); cur_++; done = cur_ == v_.size(); rdid = endid = readCnt_; readCnt_++; unlock(); success = true; return true; }
/** * TODO: Argument parsing is very, very flawed. The biggest problem is that * there are two separate worlds of arguments, the ones set via polstr, and * the ones set directly in variables. This makes for nasty interactions, * e.g., with the -M option being resolved at an awkward time relative to * the -k and -a options. */ static void parseOption(int next_option, const char *arg) { switch (next_option) { case 's': skipReads = (uint32_t)parseInt(0, "-s arg must be positive", arg); break; case ARG_GAP_BAR: gGapBarrier = parseInt(1, "--gbar must be no less than 1", arg); break; case 'u': qUpto = (uint32_t)parseInt(1, "-u/--qupto arg must be at least 1", arg); break; case 'p': nthreads = parseInt(1, "-p/--threads arg must be at least 1", arg); break; case 'h': printUsage(cout); throw 0; break; case ARG_USAGE: printUsage(cout); throw 0; break; case ARG_VERBOSE: gVerbose = 1; break; case ARG_QUIET: gQuiet = true; break; case ARG_SANITY: sanityCheck = true; break; case ARG_CP_MIN: cminlen = parse<size_t>(arg); break; case ARG_CP_IVAL: cpow2 = parse<size_t>(arg); break; case ARG_TRI: doTri = true; break; case ARG_LOCAL: localAlign = true; break; case ARG_END_TO_END: localAlign = false; break; case ARG_SSE8: enable8 = true; break; case ARG_SSE8_NO: enable8 = false; break; case ARG_IGNORE_QUALS: ignoreQuals = true; break; case ARG_N_CEIL: { // Split argument by comma EList<string> args; tokenize(arg, ",", args); if(args.size() > 3) { cerr << "Error: expected 3 or fewer comma-separated " << "arguments to --n-ceil option, got " << args.size() << endl; throw 1; } if(args.size() == 0) { cerr << "Error: expected at least one argument to --n-ceil option" << endl; throw 1; } PARSE_FUNC(nCeil); break; } case ARG_SCORE_MA: { // Split argument by comma EList<string> args; tokenize(arg, ",", args); if(args.size() != 1) { cerr << "Error parsing --ma; RHS must have 1 token" << endl; assert(false); throw 1; } string tmp = args[0]; istringstream tmpss(tmp); tmpss >> bonusMatch; break; } 
case ARG_SCORE_MMP: { // Split argument by comma EList<string> args; tokenize(arg, ",", args); if(args.size() > 3) { cerr << "Error parsing --mmp " << "; RHS must have at most 3 tokens" << endl; assert(false); throw 1; } if(args[0][0] == 'C') { string tmp = args[0].substr(1); // Parse constant penalty istringstream tmpss(tmp); tmpss >> penMmcMax; penMmcMin = penMmcMax; // Parse constant penalty penMmcType = COST_MODEL_CONSTANT; } else if(args[0][0] == 'Q') { if(args.size() >= 2) { string tmp = args[1]; istringstream tmpss(tmp); tmpss >> penMmcMax; } else {
static void print_index_sequences(ostream& fout, Ebwt<index_t>& ebwt) { EList<string>* refnames = &(ebwt.refnames()); TStr cat_ref; ebwt.restore(cat_ref); HyperLogLogPlusMinus<uint64_t> kmer_counter; TIndexOffU curr_ref = OFF_MASK; string curr_ref_seq = ""; TIndexOffU curr_ref_len = OFF_MASK; TIndexOffU last_text_off = 0; size_t orig_len = cat_ref.length(); TIndexOffU tlen = OFF_MASK; bool first = true; for(size_t i = 0; i < orig_len; i++) { TIndexOffU tidx = OFF_MASK; TIndexOffU textoff = OFF_MASK; tlen = OFF_MASK; bool straddled = false; ebwt.joinedToTextOff(1 /* qlen */, (TIndexOffU)i, tidx, textoff, tlen, true, straddled); if (tidx != OFF_MASK && textoff < tlen) { if (curr_ref != tidx) { if (curr_ref != OFF_MASK) { // Add trailing gaps, if any exist if(curr_ref_seq.length() < curr_ref_len) { curr_ref_seq += string(curr_ref_len - curr_ref_seq.length(), 'N'); } print_fasta_record(fout, (*refnames)[curr_ref], curr_ref_seq); } curr_ref = tidx; curr_ref_seq = ""; curr_ref_len = tlen; last_text_off = 0; first = true; } TIndexOffU textoff_adj = textoff; if(first && textoff > 0) textoff_adj++; if (textoff_adj - last_text_off > 1) curr_ref_seq += string(textoff_adj - last_text_off - 1, 'N'); curr_ref_seq.push_back("ACGT"[int(cat_ref[i])]); last_text_off = textoff; first = false; } } if (curr_ref < refnames->size()) { // Add trailing gaps, if any exist if(curr_ref_seq.length() < curr_ref_len) { curr_ref_seq += string(curr_ref_len - curr_ref_seq.length(), 'N'); } print_fasta_record(fout, (*refnames)[curr_ref], curr_ref_seq); } }
static void driver( const string& infile, EList<string>& infiles, const string& snpfile, const string& htfile, const string& ssfile, const string& exonfile, const string& svfile, const string& outfile, bool packed, int reverse) { initializeCntLut(); initializeCntBit(); EList<FileBuf*> is(MISC_CAT); bool bisulfite = false; RefReadInParams refparams(false, reverse, nsToAs, bisulfite); assert_gt(infiles.size(), 0); if(format == CMDLINE) { // Adapt sequence strings to stringstreams open for input stringstream *ss = new stringstream(); for(size_t i = 0; i < infiles.size(); i++) { (*ss) << ">" << i << endl << infiles[i].c_str() << endl; } FileBuf *fb = new FileBuf(ss); assert(fb != NULL); assert(!fb->eof()); assert(fb->get() == '>'); ASSERT_ONLY(fb->reset()); assert(!fb->eof()); is.push_back(fb); } else { // Adapt sequence files to ifstreams for(size_t i = 0; i < infiles.size(); i++) { FILE *f = fopen(infiles[i].c_str(), "r"); if (f == NULL) { cerr << "Error: could not open "<< infiles[i].c_str() << endl; throw 1; } FileBuf *fb = new FileBuf(f); assert(fb != NULL); if(fb->peek() == -1 || fb->eof()) { cerr << "Warning: Empty fasta file: '" << infile.c_str() << "'" << endl; continue; } assert(!fb->eof()); assert(fb->get() == '>'); ASSERT_ONLY(fb->reset()); assert(!fb->eof()); is.push_back(fb); } } if(is.empty()) { cerr << "Warning: All fasta inputs were empty" << endl; throw 1; } filesWritten.push_back(outfile + ".1." + gfm_ext); filesWritten.push_back(outfile + ".2." + gfm_ext); // Vector for the ordered list of "records" comprising the input // sequences. A record represents a stretch of unambiguous // characters in one of the input sequences. EList<RefRecord> szs(MISC_CAT); std::pair<size_t, size_t> sztot; { if(verbose) cerr << "Reading reference sizes" << endl; Timer _t(cerr, " Time reading reference sizes: ", verbose); if(!reverse && (writeRef || justRef)) { filesWritten.push_back(outfile + ".3." + gfm_ext); filesWritten.push_back(outfile + ".4." 
+ gfm_ext); sztot = BitPairReference::szsFromFasta(is, outfile, bigEndian, refparams, szs, sanityCheck); } else { sztot = BitPairReference::szsFromFasta(is, string(), bigEndian, refparams, szs, sanityCheck); } } if(justRef) return; assert_gt(sztot.first, 0); assert_gt(sztot.second, 0); assert_gt(szs.size(), 0); // Construct index from input strings and parameters filesWritten.push_back(outfile + ".5." + gfm_ext); filesWritten.push_back(outfile + ".6." + gfm_ext); filesWritten.push_back(outfile + ".7." + gfm_ext); filesWritten.push_back(outfile + ".8." + gfm_ext); TStr s; HGFM<TIndexOffU> hGFM( s, packed, 1, // TODO: maybe not? lineRate, offRate, // suffix-array sampling rate ftabChars, // number of chars in initial arrow-pair calc localOffRate, localFtabChars, nthreads, snpfile, htfile, ssfile, exonfile, svfile, outfile, // basename for .?.ht2 files reverse == 0, // fw !entireSA, // useBlockwise bmax, // block size for blockwise SA builder bmaxMultSqrt, // block size as multiplier of sqrt(len) bmaxDivN, // block size as divisor of len noDc? 0 : dcv,// difference-cover period is, // list of input streams szs, // list of reference sizes (TIndexOffU)sztot.first, // total size of all unambiguous ref chars refparams, // reference read-in parameters seed, // pseudo-random number generator seed -1, // override offRate verbose, // be talkative autoMem, // pass exceptions up to the toplevel so that we can adjust memory settings automatically sanityCheck); // verify results and internal consistency // Note that the Ebwt is *not* resident in memory at this time. To // load it into memory, call ebwt.loadIntoMemory() if(verbose) { // Print Ebwt's vital stats hGFM.gh().print(cerr); } if(sanityCheck) { // Try restoring the original string (if there were // multiple texts, what we'll get back is the joined, // padded string, not a list) hGFM.loadIntoMemory( reverse ? (refparams.reverse == REF_READ_REVERSE) : 0, true, // load SA sample? true, // load ftab? true, // load rstarts? 
false, false); SString<char> s2; hGFM.restore(s2); hGFM.evictFromMemory(); { SString<char> joinedss = GFM<>::join<SString<char> >( is, // list of input streams szs, // list of reference sizes (TIndexOffU)sztot.first, // total size of all unambiguous ref chars refparams, // reference read-in parameters seed); // pseudo-random number generator seed if(refparams.reverse == REF_READ_REVERSE) { joinedss.reverse(); } assert_eq(joinedss.length(), s2.length()); assert(sstr_eq(joinedss, s2)); } if(verbose) { if(s2.length() < 1000) { cout << "Passed restore check: " << s2.toZBuf() << endl; } else { cout << "Passed restore check: (" << s2.length() << " chars)" << endl; } } } }
/**
 * Merman main driver function.  Does the following, in order:
 *
 * 1. Resets global state and parses command-line options
 * 2. Reads the reference sequences named by the next positional
 *    argument (comma-separated), optionally adding Crick-strand and
 *    reverse-complement copies
 * 3. Runs the index threads twice: once to count mers (so the index
 *    can be allocated) and once to fill it in, then sorts the index
 * 4. Opens the read input named by the following positional argument
 *    and the alignment output (next positional argument, default "-"),
 *    and runs the search threads to completion
 *
 * @return 0 on success; 1 if a std::exception was caught; otherwise the
 *         integer that was thrown (a thrown 0 is a quiet early exit)
 */
int merman(int argc, char **argv) {
	reset();
	try {
		parseCommandLine(argc, argv);
		Timer tov(cerr, "Overall time: ", timing);
		EList<string> refstrs;
		ReferenceSet refs;
		EList<string> refnames;
		EList<size_t> reflens;
		// NOTE(review): argv[optind] is consumed without a bounds check
		// here - presumably parseCommandLine guarantees it exists; verify.
		string refstr = argv[optind++];
		tokenize(refstr, ",", refstrs);
		auto_ptr<MerIndex> ind(
			new MerIndex(ap, rp, readLen, seedWidth, nk.first, nk.second,
			             specificity, begin, naiveCheck, nthreads));
		{
			// Scoped so that the timer covers just the reference loading
			Timer t(cerr, "... ", timing);
			if(timing) cerr << "Reading reference sequences..." << endl;
			for(size_t i = 0; i < refstrs.size(); i++) {
				if(timing) {
					cerr << " Sequence " << (i+1) << " of " << refstrs.size() << endl;
				}
				if(refIsStr) {
					refs.addOrigReferenceString(refstrs[i].c_str(), rp);
				} else {
					refs.addOrigReferenceFasta(refstrs[i].c_str(), rp);
				}
			}
			// Names/lengths are captured before any strand copies are
			// added below
			for(size_t i = 0; i < refs.numRefs(); i++) {
				refnames.push_back(string(refs[i].name.toZBuf()));
				reflens.push_back(refs[i].seq.length(color));
			}
			if(refs.numRefs() == 0) {
				cerr << "Warning: No references were found" << endl;
			}
			if(rp.genCrick) {
				if(timing) {
					cerr << " Crickizing" << endl;
				}
				// Add the crick strand.  If there were bisulfite
				// transformations to the Watson strand, they are
				// removed from the Watson strand before the Crick copy
				// is made.  Transformations are then applied to the
				// new Crick strand.  This has the effect of correctly
				// producing either Watson / Crick in the non-bisulfite
				// case, or BS Watson / BS Crick in the bisulfite case.
				refs.addReferenceRevComps(rp, false, 1, 0);
			}
			if(rp.genRevcomps) {
				if(timing) {
					cerr << " Adding reverse comps" << endl;
				}
				// Add reverse complements of all existing references
				// (after the transformations have already been
				// applied).
				refs.addReferenceRevComps(rp, true, -1, 1);
			}
			assert(refs.repOk());
		}
		// Running totals of the two counts returned by each counting
		// thread; mers.first sizes the index (see the assert_eq below)
		pair<size_t, size_t> mers = make_pair(0, 0);
		EList<MerIndexThread> threads;
		{
			Timer t(cerr, "... ", timing);
			if(timing) cerr << "Preparing to extract sub-sequences..." << endl;
			// Instantiate and run index threads
			assert_gt(nthreads, 0);
			threads.resize(nthreads);
			for(int i = 0; i < nthreads; i++) {
				threads[i].runCount(&refs, ind.get(), i, nthreads, color);
			}
			for(int i = 0; i < nthreads; i++) {
				pair<size_t, size_t> mrs = threads[i].join();
				mers.first += mrs.first;
				mers.second += mrs.second;
			}
			ind->allocateMers();
		}
		if(timing || verbose || justBlowup) {
			cerr << "Expecting index footprint of ";
			printBytes(mers.first * sizeof(mer_ent), cerr);
			cerr << endl;
			if(mers.first > mers.second) {
				cerr.setf(ios::fixed);
				cerr << " base footprint is ";
				printBytes(mers.second * sizeof(mer_ent), cerr);
				cerr << endl << " blowup factor: " << setprecision(2) << ((double)mers.first / (double)mers.second) << endl;
			}
			// Throwing 0 exits through the catch(int) handler below
			// without printing the command line
			if(justBlowup) throw 0;
		}
		{
			Timer t(cerr, "... ", timing);
			if(timing) cerr << "Extracting index sub-sequences..." << endl;
			// Instantiate and run index threads
			for(int i = 0; i < nthreads; i++) {
				threads[i].runIndex(&refs, ind.get(), i, nthreads, color);
			}
			for(int i = 0; i < nthreads; i++) threads[i].join();
		}
		assert_eq(mers.first, ind->size());
		if(verbose) {
			cout << " read " << refs.numRefs() << " reference strings" << endl;
		}
		if(refs.empty() && iformat != INPUT_CHAININ) {
			cerr << "Index is empty; not enough reference sequence supplied" << endl;
			throw 1;
		}
		if(refs.numRefs() == 0 && iformat != INPUT_CHAININ) {
			cerr << "No reference strings provided; aborting..." << endl;
			throw 1;
		}
		{
			Timer t(cerr, "Sorting reference mers: ", timing);
			ind->sort(nthreads); // sort mers
		}
		{
			Timer t(cerr, "... ", timing);
			if(timing) cerr << "Aligning reads..." << endl;
			string rstr = argv[optind++];
			// Instantiate reference map, which translates to new reference
			// coordinate system prior to alignment output
			auto_ptr<ReferenceMap> rmap(
				refmapFile == NULL ? NULL : new ReferenceMap(refmapFile, !refidx));
			// Instantiate annotation map, which encodes SNP locations & alleles
			auto_ptr<AnnotationMap> amap(
				annotFile == NULL ? NULL : new AnnotationMap(annotFile));
			// Instantiate the read-input object; the concrete reader is
			// selected by 'iformat', falling through to CSFastqReads
			auto_ptr<Reads> rs(
				(iformat == INPUT_CMDLINE) ?
					(Reads*)new StringReads(rstr, begin) :
				((iformat == INPUT_FASTA) ?
					(Reads*)new FastaReads(rstr, begin, bufsz) :
				((iformat == INPUT_FASTA_CONT) ?
					(Reads*)new FastaContinuousReads(
						rstr, begin, fastaContLen, fastaContFreq, fcontBis, fcontRc, color) :
				((iformat == INPUT_FASTQ) ?
					(Reads*)new FastqReads(rstr, solexaScale, sixty4off, begin, bufsz) :
				((iformat == INPUT_CHAININ) ?
					(Reads*)new ChainReads(rstr, begin, bufsz) :
				((iformat == INPUT_CSFASTA) ?
					(Reads*)new CSFastaReads(rstr, begin, bufsz) :
				((iformat == INPUT_CSFASTA_AND_QV) ?
					(Reads*)new CSFastaAndQVReads(rstr, qualFile, begin, bufsz) :
					(Reads*)new CSFastqReads(rstr, solexaScale, sixty4off, begin, bufsz))))))));
			// Set output stream
			string of = "-";
			if(optind < argc) of = argv[optind++];
			// Instantiate the alignment-output object
			auto_ptr<AlignOutput> outs(
				(oformat == OUTPUT_SAM) ?
					(AlignOutput*)new SamOutput(of, fullref, refidx, rp.bisulfiteC || rp.bisulfiteCpG, !samNoCsCq) :
					(AlignOutput*)new BowtieOutput(of, fullref, printCost, refidx, rp.bisulfiteC || rp.bisulfiteCpG));
			outs->printHeader(refnames, reflens);
			// Run the progress thread, if requested
			ProgressThread proThread;
			if(progress) proThread.run();
			// Instantiate and run search threads
			EList<SearchThread> sthreads;
			sthreads.resize(nthreads);
			for(int i = 0; i < (int)sthreads.size(); i++) {
				sthreads[i].init(
					i, (int)sthreads.size(), ind.get(), rs.get(), &refs,
					outs.get(), rmap.get(), amap.get());
				sthreads[i].run();
			}
			// Wait until search sthreads are finished
			for(size_t i = 0; i < sthreads.size(); i++) {
				sthreads[i].join();
			}
			if(progress) {
				proThread.kill();
				proThread.join();
			}
			outs->flush();
		}
		if(!quiet) ProgressThread::reportStats();
	} catch(exception& e) {
		// NOTE(review): only the command line is echoed; e.what() is
		// not printed here.
		cerr << "Command: ";
		for(int i = 0; i < argc; i++) cerr << argv[i] << " ";
		cerr << endl;
		return 1;
	} catch(int e) {
		// Integer exceptions carry the exit code; 0 means a quiet exit
		if(e != 0) {
			cerr << "Command: ";
			for(int i = 0; i < argc; i++) cerr << argv[i] << " ";
			cerr << endl;
		}
		return e;
	}
	return 0;
}
/**
 * Build an in-memory pattern source from a list of strings of the form
 * "SEQ" or "SEQ:QUALS".
 *
 * For every entry the sequence (and qualities, if given) are trimmed by
 * gTrim5 on the 5' end and gTrim3 on the 3' end.  In colorspace mode
 * (gColor) a leading DNA character followed by a color character adds
 * two more positions to the 5' trim, and the characters '0'-'4' and '.'
 * are translated to "ACGTN" and 'N' respectively.  Qualities are padded
 * with 'I' or truncated so that they match the sequence length.
 */
VectorPatternSource::VectorPatternSource(
	const EList<string>& v,
	const PatternParams& p) :
	PatternSource(p),
	cur_(p.skip),
	skip_(p.skip),
	paired_(false),
	v_(),
	quals_()
{
	for(size_t i = 0; i < v.size(); i++) {
		EList<string> ss;
		tokenize(v[i], ":", ss, 2);
		assert_gt(ss.size(), 0);
		assert_leq(ss.size(), 2);
		// Initialize s
		string s = ss[0];
		int mytrim5 = gTrim5;
		if(gColor && s.length() > 1) {
			// This may be a primer character.  If so, keep it in the
			// 'primer' field of the read buf and parse the rest of the
			// read without it.
			// toupper() requires a value representable as unsigned
			// char, and the result is used as a table index, so a
			// plain (possibly negative) char must not be passed.
			int c = toupper((unsigned char)s[0]);
			if(asc2dnacat[c] > 0) {
				// First char is a DNA char
				int c2 = toupper((unsigned char)s[1]);
				// Second char is a color char
				if(asc2colcat[c2] > 0) {
					mytrim5 += 2; // trim primer and first color
				}
			}
		}
		if(gColor) {
			// Convert '0'-'4' to 'A','C','G','T','N' and '.' to 'N'
			for(size_t j = 0; j < s.length(); j++) {
				if(s[j] >= '0' && s[j] <= '4') {
					s[j] = "ACGTN"[(int)s[j] - '0'];
				}
				if(s[j] == '.') s[j] = 'N';
			}
		}
		if(s.length() <= (size_t)(gTrim3 + mytrim5)) {
			// Entire read is trimmed away
			s.clear();
		} else {
			// Trim on 5' (high-quality) end
			if(mytrim5 > 0) {
				s.erase(0, mytrim5);
			}
			// Trim on 3' (low-quality) end
			if(gTrim3 > 0) {
				s.erase(s.length()-gTrim3);
			}
		}
		// Initialize vq
		string vq;
		if(ss.size() == 2) {
			vq = ss[1];
		}
		// Trim qualities
		if(vq.length() > (size_t)(gTrim3 + mytrim5)) {
			// Trim on 5' (high-quality) end
			if(mytrim5 > 0) {
				vq.erase(0, mytrim5);
			}
			// Trim on 3' (low-quality) end
			if(gTrim3 > 0) {
				vq.erase(vq.length()-gTrim3);
			}
		}
		// Pad quals with Is if necessary; this shouldn't happen
		while(vq.length() < s.length()) {
			vq.push_back('I');
		}
		// Truncate quals to match length of read if necessary;
		// this shouldn't happen
		if(vq.length() > s.length()) {
			vq.erase(s.length());
		}
		assert_eq(vq.length(), s.length());
		v_.expand();
		v_.back().installChars(s);
		quals_.push_back(BTString(vq));
		trimmed3_.push_back(gTrim3);
		trimmed5_.push_back(mytrim5);
		// Reads are named after their ordinal
		ostringstream os;
		os << (names_.size());
		names_.push_back(BTString(os.str()));
	}
	assert_eq(v_.size(), quals_.size());
}
/** * Reverse the 'src' list of RefRecords into the 'dst' list. Don't * modify 'src'. */ void reverseRefRecords( const EList<RefRecord>& src, EList<RefRecord>& dst, bool recursive, bool verbose) { dst.clear(); { EList<RefRecord> cur; for(int i = (int)src.size()-1; i >= 0; i--) { bool first = (i == (int)src.size()-1 || src[i+1].first); // Clause after the || on next line is to deal with empty FASTA // records at the end of the 'src' list, which would be wrongly // omitted otherwise. if(src[i].len || (first && src[i].off == 0)) { cur.push_back(RefRecord(0, src[i].len, first)); first = false; } if(src[i].off) cur.push_back(RefRecord(src[i].off, 0, first)); } bool mergedLast; for(int i = 0; i < (int)cur.size(); i++) { mergedLast = false; assert(cur[i].off == 0 || cur[i].len == 0); if(i < (int)cur.size()-1 && cur[i].off != 0 && !cur[i+1].first) { dst.push_back(RefRecord(cur[i].off, cur[i+1].len, cur[i].first)); i++; mergedLast = true; } else { dst.push_back(cur[i]); } } } //if(verbose) { // cout << "Source: " << endl; // printRecords(cout, src); // cout << "Dest: " << endl; // printRecords(cout, dst); //} #ifndef NDEBUG size_t srcnfirst = 0, dstnfirst = 0; for(size_t i = 0; i < src.size(); i++) { if(src[i].first) { srcnfirst++; } } for(size_t i = 0; i < dst.size(); i++) { if(dst[i].first) { dstnfirst++; } } assert_eq(srcnfirst, dstnfirst); if(!recursive) { EList<RefRecord> tmp; reverseRefRecords(dst, tmp, true); assert_eq(tmp.size(), src.size()); for(size_t i = 0; i < src.size(); i++) { assert_eq(src[i].len, tmp[i].len); assert_eq(src[i].off, tmp[i].off); assert_eq(src[i].first, tmp[i].first); } } #endif }
/** * For now, we pretend that the alignment is in the forward orientation * and that the Edits are listed from left- to right-hand side. */ void Edit::printQAlignNoCheck( std::ostream& os, const char *prefix, const BTDnaString& read, const EList<Edit>& edits) { size_t eidx = 0; os << prefix; // Print read for(size_t i = 0; i < read.length(); i++) { bool del = false, mm = false; while(eidx < edits.size() && edits[eidx].pos == i) { if(edits[eidx].isReadGap()) { os << '-'; } else if(edits[eidx].isRefGap()) { del = true; os << read.toChar(i); } else { mm = true; os << (char)edits[eidx].qchr; } eidx++; } if(!del && !mm) os << read.toChar(i); } os << endl; os << prefix; eidx = 0; // Print match bars for(size_t i = 0; i < read.length(); i++) { bool del = false, mm = false; while(eidx < edits.size() && edits[eidx].pos == i) { if(edits[eidx].isReadGap()) { os << ' '; } else if(edits[eidx].isRefGap()) { del = true; os << ' '; } else { mm = true; os << ' '; } eidx++; } if(!del && !mm) os << '|'; } os << endl; os << prefix; eidx = 0; // Print reference for(size_t i = 0; i < read.length(); i++) { bool del = false, mm = false; while(eidx < edits.size() && edits[eidx].pos == i) { if(edits[eidx].isReadGap()) { os << (char)edits[eidx].chr; } else if(edits[eidx].isRefGap()) { del = true; os << '-'; } else { mm = true; os << (char)edits[eidx].chr; } eidx++; } if(!del && !mm) os << read.toChar(i); } os << endl; }
/**
 * Write every edit in 'edits' to 'os', in order, with 'delim' between
 * consecutive edits.  Nothing is written before the first edit or
 * after the last one, and an empty list produces no output.
 */
void Edit::print(ostream& os, const EList<Edit>& edits, char delim) {
	const size_t nedits = edits.size();
	for(size_t idx = 0; idx < nedits; idx++) {
		if(idx > 0) {
			os << delim;
		}
		os << edits[idx];
	}
}
/** * Given a read string and some edits, generate and append the corresponding * reference string to 'ref'. If read aligned to the Watson strand, the caller * should pass the original read sequence and original edits. If a read * aligned to the Crick strand, the caller should pass the reverse complement * of the read and a version of the edits list that has had Edit:invertPoss * called on it to cause edits to be listed in 3'-to-5' order. */ void Edit::toRef( const BTDnaString& read, const EList<Edit>& edits, BTDnaString& ref, bool fw, size_t trim5, size_t trim3) { // edits should be sorted size_t eidx = 0; // Print reference const size_t rdlen = read.length(); size_t trimBeg = fw ? trim5 : trim3; size_t trimEnd = fw ? trim3 : trim5; assert(Edit::repOk(edits, read, fw, trim5, trim3)); if(!fw) { invertPoss(const_cast<EList<Edit>&>(edits), read.length()-trimBeg-trimEnd, false); } for(size_t i = 0; i < rdlen; i++) { ASSERT_ONLY(int c = read[i]); assert_range(0, 4, c); bool del = false, mm = false; bool append = i >= trimBeg && rdlen - i - 1 >= trimEnd; bool appendIns = i >= trimBeg && rdlen - i >= trimEnd; while(eidx < edits.size() && edits[eidx].pos+trimBeg == i) { if(edits[eidx].isReadGap()) { // Inserted characters come before the position's // character if(appendIns) { ref.appendChar((char)edits[eidx].chr); } } else if(edits[eidx].isRefGap()) { assert_eq("ACGTN"[c], edits[eidx].qchr); del = true; } else if(edits[eidx].isMismatch()){ mm = true; assert(edits[eidx].qchr != edits[eidx].chr || edits[eidx].qchr == 'N'); assert_eq("ACGTN"[c], edits[eidx].qchr); if(append) { ref.appendChar((char)edits[eidx].chr); } } eidx++; } if(!del && !mm) { if(append) { ref.append(read[i]); } } } if(trimEnd == 0) { while(eidx < edits.size()) { assert_gt(rdlen, edits[eidx].pos); if(edits[eidx].isReadGap()) { ref.appendChar((char)edits[eidx].chr); } eidx++; } } if(!fw) { invertPoss(const_cast<EList<Edit>&>(edits), read.length()-trimBeg-trimEnd, false); } }
/**
 * Given the values for all of the various arguments used to specify
 * the read input, create a list of pattern sources to dispense them.
 *
 * Three lists of PatternSource pointers are built:
 *   - 'ab' from 'm12' (both mates interleaved in one input),
 *   - 'a' from 'm1' followed by 'si',
 *   - 'b' from 'm2' followed by one NULL entry per entry added to 'a'
 *     for 'si', so that 'a' and 'b' stay the same length.
 *
 * When p.fileParallel is set, each input string gets its own
 * PatternSource; otherwise a single PatternSource is created for the
 * whole list.
 *
 * If 'm12' is non-empty the result is a PairedSoloPatternSource over
 * 'ab', and 'a'/'b' and their contents are deleted.  Otherwise the
 * result is a PairedDualPatternSource over 'a' and 'b', and 'ab' and
 * its contents are deleted.
 * NOTE(review): the returned object presumably takes ownership of the
 * list(s) passed to it - confirm in the constructors.
 *
 * @param si      singles, from argv
 * @param m1      mate1's, from -1 arg
 * @param m2      mate2's, from -2 arg
 * @param m12     both mates on each line, from --12 arg
 * @param q       qualities associated with singles (not read here)
 * @param q1      qualities associated with m1 (not read here)
 * @param q2      qualities associated with m2 (not read here)
 * @param p       read-in parameters
 * @param verbose be talkative? (not read here)
 * @return newly allocated paired pattern source
 */
PairedPatternSource* PairedPatternSource::setupPatternSources(
	const EList<string>& si,   // singles, from argv
	const EList<string>& m1,   // mate1's, from -1 arg
	const EList<string>& m2,   // mate2's, from -2 arg
	const EList<string>& m12,  // both mates on each line, from --12 arg
	const EList<string>& q,    // qualities associated with singles
	const EList<string>& q1,   // qualities associated with m1
	const EList<string>& q2,   // qualities associated with m2
	const PatternParams& p,    // read-in parameters
	bool verbose)              // be talkative?
{
	EList<PatternSource*>* a  = new EList<PatternSource*>();
	EList<PatternSource*>* b  = new EList<PatternSource*>();
	EList<PatternSource*>* ab = new EList<PatternSource*>();
	// Create list of pattern sources for paired reads appearing
	// interleaved in a single file
	for(size_t i = 0; i < m12.size(); i++) {
		if(p.fileParallel) {
			// Feed query files one to each PatternSource
			EList<string> one;
			one.push_back(m12[i]);
			assert_eq(1, one.size());
			ab->push_back(PatternSource::patsrcFromStrings(p, one));
		} else {
			// One PatternSource handles the whole list
			ab->push_back(PatternSource::patsrcFromStrings(p, m12));
			break;
		}
	}

	// Create list of pattern sources for mate 1 of paired reads
	for(size_t i = 0; i < m1.size(); i++) {
		if(p.fileParallel) {
			// Feed query files one to each PatternSource
			EList<string> one;
			one.push_back(m1[i]);
			assert_eq(1, one.size());
			a->push_back(PatternSource::patsrcFromStrings(p, one));
		} else {
			a->push_back(PatternSource::patsrcFromStrings(p, m1));
			break;
		}
	}

	// Create list of pattern sources for mate 2 of paired reads
	for(size_t i = 0; i < m2.size(); i++) {
		if(p.fileParallel) {
			// Feed query files one to each PatternSource
			EList<string> one;
			one.push_back(m2[i]);
			assert_eq(1, one.size());
			b->push_back(PatternSource::patsrcFromStrings(p, one));
		} else {
			b->push_back(PatternSource::patsrcFromStrings(p, m2));
			break;
		}
	}
	// All mates/mate files must be paired
	assert_eq(a->size(), b->size());

	// Create list of pattern sources for the unpaired reads
	for(size_t i = 0; i < si.size(); i++) {
		PatternSource* single = NULL;
		if(p.fileParallel) {
			// Feed query files one to each PatternSource
			EList<string> one;
			one.push_back(si[i]);
			assert_eq(1, one.size());
			single = PatternSource::patsrcFromStrings(p, one);
		} else {
			single = PatternSource::patsrcFromStrings(p, si);
		}
		assert(single != NULL);
		a->push_back(single);
		// Unpaired reads have no mate; keep 'b' aligned with 'a'
		b->push_back(NULL);
		if(!p.fileParallel) {
			break;
		}
	}

	PairedPatternSource *patsrc = NULL;
	if(m12.size() > 0) {
		patsrc = new PairedSoloPatternSource(ab, p);
		// 'a' and 'b' are not used in this mode; release them.
		// Deleting the NULL entries in 'b' is a no-op.
		for(size_t i = 0; i < a->size(); i++) delete (*a)[i];
		for(size_t i = 0; i < b->size(); i++) delete (*b)[i];
		delete a;
		delete b;
	} else {
		patsrc = new PairedDualPatternSource(a, b, p);
		// 'ab' is not used in this mode; release it
		for(size_t i = 0; i < ab->size(); i++) delete (*ab)[i];
		delete ab;
	}
	return patsrc;
}
/**
 * Debugging aid: write one line per record in 'l' to 'os', formatted
 * as "first, off, len".
 */
static void printRecords(ostream& os, const EList<RefRecord>& l) {
	const size_t nrecs = l.size();
	for(size_t idx = 0; idx != nrecs; ++idx) {
		const RefRecord& rec = l[idx];
		os << rec.first << ", "
		   << rec.off << ", "
		   << rec.len << endl;
	}
}