/** * Bowtie main function. It is placed in a separate source file to * make it slightly easier to compile Bowtie as a library. * * If the user specifies -A <file> as the first two arguments, main * will interpret that file as having one set of command-line arguments * per line, and will dispatch each batch of arguments one at a time to * bowtie. */ int main(int argc, const char **argv) { if(argc > 2 && strcmp(argv[1], "-A") == 0) { const char *file = argv[2]; ifstream in; in.open(file); char buf[4096]; int lastret = -1; while(in.getline(buf, 4095)) { EList<string> args; args.push_back(string(argv[0])); tokenize(buf, " \t", args); const char **myargs = (const char**)malloc(sizeof(char*)*args.size()); for(size_t i = 0; i < args.size(); i++) { myargs[i] = args[i].c_str(); } if(args.size() == 1) continue; lastret = bowtie((int)args.size(), myargs); free(myargs); } if(lastret == -1) { cerr << "Warning: No arg strings parsed from " << file << endl; return 0; } return lastret; } else { return bowtie(argc, argv); } }
/**
 * Collect the adjacency entries of node u.
 *
 * Follows the linked chain that starts at ary[u].next and appends the
 * payload (p) of every link to the result, in chain order.
 */
EList ListGraph::getAdj(NodeID u) const {
    EList adjacent;
    for(node* cur = ary[u].next; cur != NULL; cur = cur->next) {
        adjacent.push_back(cur->p);
    }
    return adjacent;
}
// @note The general idea of this method comes from https://github.com/ScottDVincent/HW05_vincensd_v2/ std::list<NWPair> MatrixGraph::getAdj(NodeID u) const { if (0 <= u < M.size()) { EList list; for(int i = 0; i < M.at(u).size(); i++) { if (M.at(u).at(i) != 0 ) { NWPair pair(i, M.at(u).at(i)); if (pair.second != 0) list.push_back(pair); } } return list; } }
/**
 * Return a copy of the (neighbor, weight) pairs stored in edgeList[u].
 *
 * The previous version skipped entries whose first member compared equal
 * to NULL.  For an integral node id that is a comparison with 0, which
 * silently dropped every edge leading to node 0; all stored entries are
 * now returned.
 * NOTE(review): assumes NWPair::first is an integral node id and that no
 * zero-valued sentinel entries are stored in edgeList - confirm.
 */
std::list<NWPair> ListGraph::getAdj(NodeID u) const {
    EList result;
    for(EList::const_iterator it = edgeList[u].begin();
        it != edgeList[u].end();
        ++it)
    {
        result.push_back(NWPair(it->first, it->second));
    }
    return result;
}
/**
 * Build the list of (column, value) pairs for every non-zero entry in
 * row u of ary, scanning columns 0 .. num_nodes-1 in order.
 */
EList MatrixGraph::getAdj(NodeID u) const {
    EList neighbors;
    int col = 0;
    while(col < num_nodes) {
        // a zero cell means the two nodes are not attached
        if(ary[u][col] != 0) {
            neighbors.push_back(NWPair(col, ary[u][col]));
        }
        ++col;
    }
    return neighbors;
}
/** * Calculate a vector containing the sizes of all of the patterns in * all of the given input files, in order. Returns the total size of * all references combined. Rewinds each istream before returning. */ std::pair<size_t, size_t> fastaRefReadSizes( EList<FileBuf*>& in, EList<RefRecord>& recs, const RefReadInParams& rparms, BitpairOutFileBuf* bpout, int& numSeqs) { uint32_t unambigTot = 0; uint32_t bothTot = 0; RefReadInParams rpcp = rparms; assert_gt(in.size(), 0); // For each input istream for(size_t i = 0; i < in.size(); i++) { bool first = true; assert(!in[i]->eof()); // For each pattern in this istream while(!in[i]->eof()) { RefRecord rec = fastaRefReadSize(*in[i], rparms, first, bpout); if((unambigTot + rec.len) < unambigTot) { cerr << "Error: Reference sequence has more than 2^32-1 characters! Please divide the" << endl << "reference into batches or chunks of about 3.6 billion characters or less each" << endl << "and index each independently." << endl; throw 1; } // Add the length of this record. if(rec.first) numSeqs++; unambigTot += rec.len; bothTot += rec.len; bothTot += rec.off; first = false; if(rec.len == 0 && rec.off == 0 && !rec.first) continue; recs.push_back(rec); } // Reset the input stream in[i]->reset(); assert(!in[i]->eof()); #ifndef NDEBUG // Check that it's really reset int c = in[i]->get(); assert_eq('>', c); in[i]->reset(); assert(!in[i]->eof()); #endif } assert_geq(bothTot, 0); assert_geq(unambigTot, 0); return make_pair( unambigTot, // total number of unambiguous DNA characters read bothTot); // total number of DNA characters read, incl. ambiguous ones }
/**
 * Merge second argument into the first.  Assume both are sorted by pos
 * to begin with.
 *
 * When an edit from src and an edit from dst share a position, exactly
 * one of them is expected to be a read gap (asserted); the read gap is
 * placed first.  Edits left over in src once dst has been fully scanned
 * are appended.
 *
 * Fixes relative to the previous version:
 *  - src[si] is no longer read once src is exhausted;
 *  - the scan advances when the current dst edit sorts before the
 *    current src edit (previously nothing advanced, looping forever).
 */
void Edit::merge(EList<Edit>& dst, const EList<Edit>& src) {
    size_t di = 0, si = 0;
    while(di < dst.size() && si < src.size()) {
        if(src[si].pos < dst[di].pos) {
            // src edit sorts first: splice it in ahead of dst[di]
            dst.insert(src[si], di);
            si++; di++;
        } else if(src[si].pos == dst[di].pos) {
            // There can be two inserts at a given position, but we
            // can't merge them because there's no way to know their
            // order
            assert(src[si].isReadGap() != dst[di].isReadGap());
            if(src[si].isReadGap()) {
                dst.insert(src[si], di);
                si++; di++;
            } else {
                // dst's edit stays in front; src[si] is compared again
                // against the next dst element
                di++;
            }
        } else {
            // dst edit sorts first: leave it in place and move on
            di++;
        }
    }
    // Whatever is left in src sorts after everything in dst
    while(si < src.size()) dst.push_back(src[si++]);
}
/**
 * Given the values for all of the various arguments used to specify
 * the read and quality input, create a list of pattern sources to
 * dispense them.
 *
 * Three lists are built: 'ab' for inputs holding both mates (m12 and,
 * with USE_SRA, sra_accs), 'a' for mate-1 inputs followed by unpaired
 * inputs, and 'b' for mate-2 inputs followed by one NULL per unpaired
 * source.  When p.fileParallel is set each file name gets its own
 * PatternSource; otherwise the whole name list is handed to a single
 * PatternSource and the loop stops after its first iteration.
 *
 * Returns a newly allocated PairedSoloPatternSource (built from 'ab')
 * when m12 (or sra_accs) is non-empty, otherwise a newly allocated
 * PairedDualPatternSource (built from 'a' and 'b').  The lists that are
 * not handed to the returned object are deleted here together with
 * their elements.
 *
 * Note: q, q1, q2 and verbose are not referenced in this function body.
 */
PairedPatternSource* PairedPatternSource::setupPatternSources(
    const EList<string>& si,   // singles, from argv
    const EList<string>& m1,   // mate1's, from -1 arg
    const EList<string>& m2,   // mate2's, from -2 arg
    const EList<string>& m12,  // both mates on each line, from --12 arg
#ifdef USE_SRA
    const EList<string>& sra_accs,
#endif
    const EList<string>& q,    // qualities associated with singles
    const EList<string>& q1,   // qualities associated with m1
    const EList<string>& q2,   // qualities associated with m2
    const PatternParams& p,    // read-in parameters
    size_t nthreads,
    bool verbose)              // be talkative?
{
    EList<PatternSource*>* a = new EList<PatternSource*>();
    EList<PatternSource*>* b = new EList<PatternSource*>();
    EList<PatternSource*>* ab = new EList<PatternSource*>();
    // Create list of pattern sources for paired reads appearing
    // interleaved in a single file
    for(size_t i = 0; i < m12.size(); i++) {
        const EList<string>* qs = &m12;
        EList<string> tmp;
        if(p.fileParallel) {
            // Feed query files one to each PatternSource
            qs = &tmp;
            tmp.push_back(m12[i]);
            assert_eq(1, tmp.size());
        }
        ab->push_back(PatternSource::patsrcFromStrings(p, *qs, nthreads));
        if(!p.fileParallel) {
            // The single source above already received the full list
            break;
        }
    }
#ifdef USE_SRA
    // Same scheme for the SRA accessions; they also go into 'ab'
    for(size_t i = 0; i < sra_accs.size(); i++) {
        const EList<string>* qs = &sra_accs;
        EList<string> tmp;
        if(p.fileParallel) {
            // Feed query files one to each PatternSource
            qs = &tmp;
            tmp.push_back(sra_accs[i]);
            assert_eq(1, tmp.size());
        }
        ab->push_back(PatternSource::patsrcFromStrings(p, *qs, nthreads));
        if(!p.fileParallel) {
            break;
        }
    }
#endif
    // Create list of pattern sources for paired reads (mate 1)
    for(size_t i = 0; i < m1.size(); i++) {
        const EList<string>* qs = &m1;
        EList<string> tmpSeq;
        EList<string> tmpQual; // declared but never used
        if(p.fileParallel) {
            // Feed query files one to each PatternSource
            qs = &tmpSeq;
            tmpSeq.push_back(m1[i]);
            assert_eq(1, tmpSeq.size());
        }
        a->push_back(PatternSource::patsrcFromStrings(p, *qs, nthreads));
        if(!p.fileParallel) {
            break;
        }
    }
    // Create list of pattern sources for paired reads (mate 2)
    for(size_t i = 0; i < m2.size(); i++) {
        const EList<string>* qs = &m2;
        EList<string> tmpSeq;
        EList<string> tmpQual; // declared but never used
        if(p.fileParallel) {
            // Feed query files one to each PatternSource
            qs = &tmpSeq;
            tmpSeq.push_back(m2[i]);
            assert_eq(1, tmpSeq.size());
        }
        b->push_back(PatternSource::patsrcFromStrings(p, *qs, nthreads));
        if(!p.fileParallel) {
            break;
        }
    }
    // All mates/mate files must be paired
    assert_eq(a->size(), b->size());
    // Create list of pattern sources for the unpaired reads
    for(size_t i = 0; i < si.size(); i++) {
        const EList<string>* qs = &si;
        PatternSource* patsrc = NULL;
        EList<string> tmpSeq;
        EList<string> tmpQual; // declared but never used
        if(p.fileParallel) {
            // Feed query files one to each PatternSource
            qs = &tmpSeq;
            tmpSeq.push_back(si[i]);
            assert_eq(1, tmpSeq.size());
        }
        patsrc = PatternSource::patsrcFromStrings(p, *qs, nthreads);
        assert(patsrc != NULL);
        // An unpaired source occupies a slot in 'a' and has a NULL
        // partner in 'b', keeping the two lists the same length
        a->push_back(patsrc);
        b->push_back(NULL);
        if(!p.fileParallel) {
            break;
        }
    }
    PairedPatternSource *patsrc = NULL;
#ifdef USE_SRA
    if(m12.size() > 0 || sra_accs.size() > 0) {
#else
    if(m12.size() > 0) {
#endif
        // 'ab' is handed to the new object; 'a' and 'b' are released
        patsrc = new PairedSoloPatternSource(ab, p);
        for(size_t i = 0; i < a->size(); i++) delete (*a)[i];
        for(size_t i = 0; i < b->size(); i++) delete (*b)[i];
        delete a; delete b;
    } else {
        // 'a' and 'b' are handed to the new object; 'ab' is released
        patsrc = new PairedDualPatternSource(a, b, p);
        for(size_t i = 0; i < ab->size(); i++) delete (*ab)[i];
        delete ab;
    }
    return patsrc;
}

/**
 * Build a pattern source from a list of strings.  Each string is split
 * on ':' into a sequence and an optional quality string (at most two
 * fields, as asserted below).  The sequence and qualities are trimmed
 * according to gTrim5/gTrim3, and the results are stored in v_, quals_,
 * trimmed3_, trimmed5_ and names_ (one entry per input string).
 */
VectorPatternSource::VectorPatternSource(
    const EList<string>& v,
    const PatternParams& p) :
    PatternSource(p),
    cur_(p.skip),
    skip_(p.skip),
    paired_(false),
    v_(),
    quals_()
{
    for(size_t i = 0; i < v.size(); i++) {
        EList<string> ss;
        tokenize(v[i], ":", ss, 2);
        assert_gt(ss.size(), 0);
        assert_leq(ss.size(), 2);
        // Initialize s
        string s = ss[0];
        int mytrim5 = gTrim5;
        if(gColor && s.length() > 1) {
            // This may be a primer character.  If so, keep it in the
            // 'primer' field of the read buf and parse the rest of the
            // read without it.
            int c = toupper(s[0]);
            if(asc2dnacat[c] > 0) {
                // First char is a DNA char
                int c2 = toupper(s[1]);
                // Second char is a color char
                if(asc2colcat[c2] > 0) {
                    mytrim5 += 2; // trim primer and first color
                }
            }
        }
        if(gColor) {
            // Convert '0'-'4' to 'A','C','G','T','N' and '.' to 'N'
            for(size_t i = 0; i < s.length(); i++) {
                if(s[i] >= '0' && s[i] <= '4') {
                    s[i] = "ACGTN"[(int)s[i] - '0'];
                }
                if(s[i] == '.') s[i] = 'N';
            }
        }
        if(s.length() <= (size_t)(gTrim3 + mytrim5)) {
            // Entire read is trimmed away
            s.clear();
        } else {
            // Trim on 5' (high-quality) end
            if(mytrim5 > 0) {
                s.erase(0, mytrim5);
            }
            // Trim on 3' (low-quality) end
            if(gTrim3 > 0) {
                s.erase(s.length()-gTrim3);
            }
        }
        // Initialize vq (stays empty when no quality field was given)
        string vq;
        if(ss.size() == 2) {
            vq = ss[1];
        }
        // Trim qualities; a quality string no longer than the total trim
        // is left untouched here and is fixed up by the pad/truncate
        // steps below
        if(vq.length() > (size_t)(gTrim3 + mytrim5)) {
            // Trim on 5' (high-quality) end
            if(mytrim5 > 0) {
                vq.erase(0, mytrim5);
            }
            // Trim on 3' (low-quality) end
            if(gTrim3 > 0) {
                vq.erase(vq.length()-gTrim3);
            }
        }
        // Pad quals with Is if necessary; this shouldn't happen
        while(vq.length() < s.length()) {
            vq.push_back('I');
        }
        // Truncate quals to match length of read if necessary;
        // this shouldn't happen
        if(vq.length() > s.length()) {
            vq.erase(s.length());
        }
        assert_eq(vq.length(), s.length());
        // Append the processed read and its per-read bookkeeping
        v_.expand();
        v_.back().installChars(s);
        quals_.push_back(BTString(vq));
        trimmed3_.push_back(gTrim3);
        trimmed5_.push_back(mytrim5);
        // The stored name is the index of the read within names_
        ostringstream os;
        os << (names_.size());
        names_.push_back(BTString(os.str()));
    }
    assert_eq(v_.size(), quals_.size());
}

/**
 * Hand out the read at position cur_ and advance cur_.  The work between
 * lock() and unlock() covers the reads/updates of cur_ and readCnt_.
 *
 * On success: r is filled from v_, quals_, trimmed3_ and trimmed5_, its
 * name is the decimal value of cur_ before the increment, rdid and endid
 * are both set to the pre-increment readCnt_, 'done' tells whether that
 * was the last stored read, 'success' is true and true is returned.
 *
 * When no reads are left: r is reset, 'success' is false, 'done' is true
 * and false is returned.
 */
bool VectorPatternSource::nextReadImpl(
    Read& r,
    TReadId& rdid,
    TReadId& endid,
    bool& success,
    bool& done)
{
    // Let Strings begin at the beginning of the respective bufs
    r.reset();
    lock();
    if(cur_ >= v_.size()) {
        unlock();
        // Clear all the Strings, as a signal to the caller that
        // we're out of reads
        r.reset();
        success = false;
        done = true;
        assert(r.empty());
        return false;
    }
    // Copy v_*, quals_* strings into the respective Strings
    r.color = gColor;
    r.patFw = v_[cur_];
    r.qual = quals_[cur_];
    r.trimmed3 = trimmed3_[cur_];
    r.trimmed5 = trimmed5_[cur_];
    ostringstream os;
    os << cur_;
    r.name = os.str();
    cur_++;
    done = cur_ == v_.size();
    rdid = endid = readCnt_;
    readCnt_++;
    unlock();
    success = true;
    return true;
}
static void driver( const string& infile, EList<string>& infiles, const string& snpfile, const string& htfile, const string& ssfile, const string& exonfile, const string& svfile, const string& outfile, bool packed, int reverse) { initializeCntLut(); initializeCntBit(); EList<FileBuf*> is(MISC_CAT); bool bisulfite = false; RefReadInParams refparams(false, reverse, nsToAs, bisulfite); assert_gt(infiles.size(), 0); if(format == CMDLINE) { // Adapt sequence strings to stringstreams open for input stringstream *ss = new stringstream(); for(size_t i = 0; i < infiles.size(); i++) { (*ss) << ">" << i << endl << infiles[i].c_str() << endl; } FileBuf *fb = new FileBuf(ss); assert(fb != NULL); assert(!fb->eof()); assert(fb->get() == '>'); ASSERT_ONLY(fb->reset()); assert(!fb->eof()); is.push_back(fb); } else { // Adapt sequence files to ifstreams for(size_t i = 0; i < infiles.size(); i++) { FILE *f = fopen(infiles[i].c_str(), "r"); if (f == NULL) { cerr << "Error: could not open "<< infiles[i].c_str() << endl; throw 1; } FileBuf *fb = new FileBuf(f); assert(fb != NULL); if(fb->peek() == -1 || fb->eof()) { cerr << "Warning: Empty fasta file: '" << infile.c_str() << "'" << endl; continue; } assert(!fb->eof()); assert(fb->get() == '>'); ASSERT_ONLY(fb->reset()); assert(!fb->eof()); is.push_back(fb); } } if(is.empty()) { cerr << "Warning: All fasta inputs were empty" << endl; throw 1; } filesWritten.push_back(outfile + ".1." + gfm_ext); filesWritten.push_back(outfile + ".2." + gfm_ext); // Vector for the ordered list of "records" comprising the input // sequences. A record represents a stretch of unambiguous // characters in one of the input sequences. EList<RefRecord> szs(MISC_CAT); std::pair<size_t, size_t> sztot; { if(verbose) cerr << "Reading reference sizes" << endl; Timer _t(cerr, " Time reading reference sizes: ", verbose); if(!reverse && (writeRef || justRef)) { filesWritten.push_back(outfile + ".3." + gfm_ext); filesWritten.push_back(outfile + ".4." 
+ gfm_ext); sztot = BitPairReference::szsFromFasta(is, outfile, bigEndian, refparams, szs, sanityCheck); } else { sztot = BitPairReference::szsFromFasta(is, string(), bigEndian, refparams, szs, sanityCheck); } } if(justRef) return; assert_gt(sztot.first, 0); assert_gt(sztot.second, 0); assert_gt(szs.size(), 0); // Construct index from input strings and parameters filesWritten.push_back(outfile + ".5." + gfm_ext); filesWritten.push_back(outfile + ".6." + gfm_ext); filesWritten.push_back(outfile + ".7." + gfm_ext); filesWritten.push_back(outfile + ".8." + gfm_ext); TStr s; HGFM<TIndexOffU> hGFM( s, packed, 1, // TODO: maybe not? lineRate, offRate, // suffix-array sampling rate ftabChars, // number of chars in initial arrow-pair calc localOffRate, localFtabChars, nthreads, snpfile, htfile, ssfile, exonfile, svfile, outfile, // basename for .?.ht2 files reverse == 0, // fw !entireSA, // useBlockwise bmax, // block size for blockwise SA builder bmaxMultSqrt, // block size as multiplier of sqrt(len) bmaxDivN, // block size as divisor of len noDc? 0 : dcv,// difference-cover period is, // list of input streams szs, // list of reference sizes (TIndexOffU)sztot.first, // total size of all unambiguous ref chars refparams, // reference read-in parameters seed, // pseudo-random number generator seed -1, // override offRate verbose, // be talkative autoMem, // pass exceptions up to the toplevel so that we can adjust memory settings automatically sanityCheck); // verify results and internal consistency // Note that the Ebwt is *not* resident in memory at this time. To // load it into memory, call ebwt.loadIntoMemory() if(verbose) { // Print Ebwt's vital stats hGFM.gh().print(cerr); } if(sanityCheck) { // Try restoring the original string (if there were // multiple texts, what we'll get back is the joined, // padded string, not a list) hGFM.loadIntoMemory( reverse ? (refparams.reverse == REF_READ_REVERSE) : 0, true, // load SA sample? true, // load ftab? true, // load rstarts? 
false, false); SString<char> s2; hGFM.restore(s2); hGFM.evictFromMemory(); { SString<char> joinedss = GFM<>::join<SString<char> >( is, // list of input streams szs, // list of reference sizes (TIndexOffU)sztot.first, // total size of all unambiguous ref chars refparams, // reference read-in parameters seed); // pseudo-random number generator seed if(refparams.reverse == REF_READ_REVERSE) { joinedss.reverse(); } assert_eq(joinedss.length(), s2.length()); assert(sstr_eq(joinedss, s2)); } if(verbose) { if(s2.length() < 1000) { cout << "Passed restore check: " << s2.toZBuf() << endl; } else { cout << "Passed restore check: (" << s2.length() << " chars)" << endl; } } } }
/** * A way of feeding simply tests to the seed alignment infrastructure. */ int main(int argc, char **argv) { EList<string> strs; // GCTATATAGCGCGCTCGCATCATTTTGTGT strs.push_back(string("CATGTCAGCTATATAGCGCGCTCGCATCATTTTGTGTGTAAACCA" "NNNNNNNNNN" "CATGTCAGCTATATAGCGCGCTCGCATCATTTTGTGTGTAAACCA")); // GCTATATAGCGCGCTTGCATCATTTTGTGT // ^ bool packed = false; int color = 0; pair<GFM*, GFM*> gfms = GFM::fromStrings<SString<char> >( strs, packed, REF_READ_REVERSE, Ebwt::default_bigEndian, Ebwt::default_lineRate, Ebwt::default_offRate, Ebwt::default_ftabChars, ".aligner_seed2.cpp.tmp", Ebwt::default_useBlockwise, Ebwt::default_bmax, Ebwt::default_bmaxMultSqrt, Ebwt::default_bmaxDivN, Ebwt::default_dcv, Ebwt::default_seed, false, // verbose false, // autoMem false); // sanity gfms.first->loadIntoMemory (-1, true, true, true, true, false); gfms.second->loadIntoMemory(1, true, true, true, true, false); int testnum = 0; // Query is longer than ftab and matches exactly twice for(int rc = 0; rc < 2; rc++) { for(int i = 0; i < 2; i++) { cerr << "Test " << (++testnum) << endl; cerr << " Query with length greater than ftab" << endl; DescentMetrics mets; PerReadMetrics prm; DescentDriver dr; // Set up the read BTDnaString seq ("GCTATATAGCGCGCTCGCATCATTTTGTGT", true); BTString qual("ABCDEFGHIabcdefghiABCDEFGHIabc"); if(rc) { seq.reverseComp(); qual.reverse(); } dr.initRead(Read("test", seq.toZBuf(), qual.toZBuf()), -30, 30); // Set up the DescentConfig DescentConfig conf; conf.cons.init(GFM::default_ftabChars, 1.0); conf.expol = DESC_EX_NONE; // Set up the search roots dr.addRoot( conf, // DescentConfig (i == 0) ? 0 : (seq.length() - 1), // 5' offset into read of root (i == 0) ? true : false, // left-to-right? rc == 0, // forward? 
0.0f); // root priority // Do the search Scoring sc = Scoring::base1(); dr.go(sc, *gfms.first, *gfms.second, mets, prm); // Confirm that an exact-matching alignment was found assert_eq(1, dr.sink().nrange()); assert_eq(2, dr.sink().nelt()); } } // Query has length euqal to ftab and matches exactly twice for(int i = 0; i < 2; i++) { cerr << "Test " << (++testnum) << endl; cerr << " Query with length equal to ftab" << endl; DescentMetrics mets; PerReadMetrics prm; DescentDriver dr; // Set up the read BTDnaString seq ("GCTATATAGC", true); BTString qual("ABCDEFGHIa"); dr.initRead(Read("test", seq.toZBuf(), qual.toZBuf()), -30, 30); // Set up the DescentConfig DescentConfig conf; conf.cons.init(GFM::default_ftabChars, 1.0); conf.expol = DESC_EX_NONE; // Set up the search roots dr.addRoot( conf, // DescentConfig (i == 0) ? 0 : (seq.length() - 1), // 5' offset into read of root (i == 0) ? true : false, // left-to-right? true, // forward? 0.0f); // root priority // Do the search Scoring sc = Scoring::base1(); dr.go(sc, *gfms.first, *gfms.second, mets, prm); // Confirm that an exact-matching alignment was found assert_eq(1, dr.sink().nrange()); assert_eq(2, dr.sink().nelt()); } // Query has length less than ftab length and matches exactly twice for(int i = 0; i < 2; i++) { cerr << "Test " << (++testnum) << endl; cerr << " Query with length less than ftab" << endl; DescentMetrics mets; PerReadMetrics prm; DescentDriver dr; // Set up the read BTDnaString seq ("GCTATATAG", true); BTString qual("ABCDEFGHI"); dr.initRead(Read("test", seq.toZBuf(), qual.toZBuf()), -30, 30); // Set up the DescentConfig DescentConfig conf; conf.cons.init(GFM::default_ftabChars, 1.0); conf.expol = DESC_EX_NONE; // Set up the search roots dr.addRoot( conf, // DescentConfig (i == 0) ? 0 : (seq.length() - 1), // 5' offset into read of root (i == 0) ? true : false, // left-to-right? true, // forward? 
0.0f); // root priority // Do the search Scoring sc = Scoring::base1(); dr.go(sc, *gfms.first, *gfms.second, mets, prm); // Confirm that an exact-matching alignment was found assert_eq(1, dr.sink().nrange()); assert_eq(2, dr.sink().nelt()); } // Search root is in the middle of the read, requiring a bounce for(int i = 0; i < 2; i++) { cerr << "Test " << (++testnum) << endl; cerr << " Search root in middle of read" << endl; DescentMetrics mets; PerReadMetrics prm; DescentDriver dr; // Set up the read // 012345678901234567890123456789 BTDnaString seq ("GCTATATAGCGCGCTCGCATCATTTTGTGT", true); BTString qual("ABCDEFGHIabcdefghiABCDEFGHIabc"); TIndexOffU top, bot; top = bot = 0; bool ret = gfms.first->contains("GCGCTCGCATCATTTTGTGT", &top, &bot); cerr << ret << ", " << top << ", " << bot << endl; dr.initRead(Read("test", seq.toZBuf(), qual.toZBuf()), -30, 30); // Set up the DescentConfig DescentConfig conf; conf.cons.init(GFM::default_ftabChars, 1.0); conf.expol = DESC_EX_NONE; // Set up the search roots dr.addRoot( conf, // DescentConfig (i == 0) ? 10 : (seq.length() - 1 - 10), // 5' offset into read of root (i == 0) ? true : false, // left-to-right? true, // forward? 
0.0f); // root priority // Do the search Scoring sc = Scoring::base1(); dr.go(sc, *gfms.first, *gfms.second, mets, prm); // Confirm that an exact-matching alignment was found assert_eq(1, dr.sink().nrange()); assert_eq(2, dr.sink().nelt()); } delete gfms.first; delete gfms.second; strs.clear(); strs.push_back(string("CATGTCAGCTATATAGCGCGCTCGCATCATTTTGTGTGTAAACCA" "NNNNNNNNNN" "CATGTCAGCTATATAGCG")); gfms = GFM::fromStrings<SString<char> >( strs, packed, REF_READ_REVERSE, GFM::default_bigEndian, GFM::default_lineRate, GFM::default_offRate, GFM::default_ftabChars, ".aligner_seed2.cpp.tmp", GFM::default_useBlockwise, GFM::default_bmax, GfM::default_bmaxMultSqrt, GFM::default_bmaxDivN, GFM::default_dcv, GFM::default_seed, false, // verbose false, // autoMem false); // sanity gfms.first->loadIntoMemory (-1, true, true, true, true, false); gfms.second->loadIntoMemory(1, true, true, true, true, false); // Query is longer than ftab and matches exactly once. One search root for // forward read. { size_t last_topf = std::numeric_limits<size_t>::max(); size_t last_botf = std::numeric_limits<size_t>::max(); for(int i = 0; i < 2; i++) { BTDnaString seq ("GCTATATAGCGCGCTCGCATCATTTTGTGT", true); BTString qual("ABCDEFGHIabcdefghiABCDEFGHIabc"); for(size_t j = 0; j < seq.length(); j++) { cerr << "Test " << (++testnum) << endl; cerr << " Query with length greater than ftab and matches exactly once" << endl; DescentMetrics mets; PerReadMetrics prm; DescentDriver dr; // Set up the read dr.initRead(Read("test", seq.toZBuf(), qual.toZBuf()), -30, 30); // Set up the DescentConfig DescentConfig conf; conf.cons.init(GFM::default_ftabChars, 1.0); conf.expol = DESC_EX_NONE; // Set up the search roots dr.addRoot( conf, // DescentConfig j, // 5' offset into read of root i == 0, // left-to-right? true, // forward? 
0.0f); // root priority // Do the search Scoring sc = Scoring::base1(); dr.go(sc, *gfms.first, *gfms.second, mets, prm); // Confirm that an exact-matching alignment was found assert_eq(1, dr.sink().nrange()); assert(last_topf == std::numeric_limits<size_t>::max() || last_topf == dr.sink()[0].topf); assert(last_botf == std::numeric_limits<size_t>::max() || last_botf == dr.sink()[0].botf); assert_eq(1, dr.sink().nelt()); } } } // Query is longer than ftab and its reverse complement matches exactly // once. Search roots on forward and reverse-comp reads. { size_t last_topf = std::numeric_limits<size_t>::max(); size_t last_botf = std::numeric_limits<size_t>::max(); for(int i = 0; i < 2; i++) { BTDnaString seq ("GCTATATAGCGCGCTCGCATCATTTTGTGT", true); BTString qual("ABCDEFGHIabcdefghiABCDEFGHIabc"); for(size_t j = 0; j < seq.length(); j++) { cerr << "Test " << (++testnum) << endl; cerr << " Query with length greater than ftab and reverse complement matches exactly once" << endl; DescentMetrics mets; PerReadMetrics prm; DescentDriver dr; // Set up the read dr.initRead(Read("test", seq.toZBuf(), qual.toZBuf()), -30, 30); // Set up the DescentConfig DescentConfig conf; conf.cons.init(GFM::default_ftabChars, 1.0); conf.expol = DESC_EX_NONE; // Set up the search roots dr.addRoot( conf, // DescentConfig j, // 5' offset into read of root i == 0, // left-to-right? true, // forward? 0.0f); // root priority dr.addRoot( conf, // DescentConfig j, // 5' offset into read of root i == 0, // left-to-right? false, // forward? 
1.0f); // root priority // Do the search Scoring sc = Scoring::base1(); dr.go(sc, *gfms.first, *gfms.second, mets, prm); // Confirm that an exact-matching alignment was found assert_eq(1, dr.sink().nrange()); assert(last_topf == std::numeric_limits<size_t>::max() || last_topf == dr.sink()[0].topf); assert(last_botf == std::numeric_limits<size_t>::max() || last_botf == dr.sink()[0].botf); assert_eq(1, dr.sink().nelt()); } } } // Query is longer than ftab and matches exactly once with one mismatch { size_t last_topf = std::numeric_limits<size_t>::max(); size_t last_botf = std::numeric_limits<size_t>::max(); for(int i = 0; i < 2; i++) { // Set up the read // Ref: CATGTCAGCTATATAGCGCGCTCGCATCATTTTGTGTGTAAACCA // |||||||||||||||||||||||||||||| BTDnaString orig("GCTATATAGCGCGCTCGCATCATTTTGTGT", true); // 012345678901234567890123456789 BTString qual("ABCDEFGHIabcdefghiABCDEFGHIabc"); for(size_t k = 0; k < orig.length(); k++) { BTDnaString seq = orig; seq.set(seq[k] ^ 3, k); for(size_t j = 0; j < seq.length(); j++) { // Assume left-to-right size_t beg = j; size_t end = j + GFM::default_ftabChars; // Mismatch penalty is 3, so we have to skip starting // points that are within 2 from the mismatch if((i > 0 && j > 0) || j == seq.length()-1) { // Right-to-left if(beg < GFM::default_ftabChars) { beg = 0; } else { beg -= GFM::default_ftabChars; } end -= GFM::default_ftabChars; } size_t kk = k; //if(rc) { // kk = seq.length() - k - 1; //} if(beg <= kk && end > kk) { continue; } if((j > kk) ? 
(j - kk <= 2) : (kk - j <= 2)) { continue; } cerr << "Test " << (++testnum) << endl; cerr << " Query with length greater than ftab and matches exactly once with 1mm" << endl; DescentMetrics mets; PerReadMetrics prm; DescentDriver dr; dr.initRead(Read("test", seq.toZBuf(), qual.toZBuf()), -30, 30); // Set up the DescentConfig DescentConfig conf; // Changed conf.cons.init(0, 1.0); conf.expol = DESC_EX_NONE; // Set up the search roots dr.addRoot( conf, // DescentConfig j, // 5' offset into read of root i == 0, // left-to-right? true, // forward? 0.0f); // root priority // Do the search Scoring sc = Scoring::base1(); dr.go(sc, *gfms.first, *gfms.second, mets, prm); // Confirm that an exact-matching alignment was found assert_eq(1, dr.sink().nrange()); assert(last_topf == std::numeric_limits<size_t>::max() || last_topf == dr.sink()[0].topf); assert(last_botf == std::numeric_limits<size_t>::max() || last_botf == dr.sink()[0].botf); cerr << dr.sink()[0].topf << ", " << dr.sink()[0].botf << endl; assert_eq(1, dr.sink().nelt()); last_topf = dr.sink()[0].topf; last_botf = dr.sink()[0].botf; } } } } // Query is longer than ftab and matches exactly once with one N mismatch { size_t last_topf = std::numeric_limits<size_t>::max(); size_t last_botf = std::numeric_limits<size_t>::max(); for(int i = 0; i < 2; i++) { // Set up the read // Ref: CATGTCAGCTATATAGCGCGCTCGCATCATTTTGTGTGTAAACCA // |||||||||||||||||||||||||||||| BTDnaString orig("GCTATATAGCGCGCTCGCATCATTTTGTGT", true); // 012345678901234567890123456789 BTString qual("ABCDEFGHIabcdefghiABCDEFGHIabc"); for(size_t k = 0; k < orig.length(); k++) { BTDnaString seq = orig; seq.set(4, k); for(size_t j = 0; j < seq.length(); j++) { // Assume left-to-right size_t beg = j; size_t end = j + GFM::default_ftabChars; // Mismatch penalty is 3, so we have to skip starting // points that are within 2 from the mismatch if((i > 0 && j > 0) || j == seq.length()-1) { // Right-to-left if(beg < GFM::default_ftabChars) { beg = 0; } else { beg -= 
GFM::default_ftabChars; } end -= GFM::default_ftabChars; } if(beg <= k && end > k) { continue; } if((j > k) ? (j - k <= 2) : (k - j <= 2)) { continue; } cerr << "Test " << (++testnum) << endl; cerr << " Query with length greater than ftab and matches exactly once with 1mm" << endl; DescentMetrics mets; PerReadMetrics prm; DescentDriver dr; dr.initRead(Read("test", seq.toZBuf(), qual.toZBuf()), -30, 30); // Set up the DescentConfig DescentConfig conf; // Changed conf.cons.init(0, 1.0); conf.expol = DESC_EX_NONE; // Set up the search roots dr.addRoot( conf, // DescentConfig j, // 5' offset into read of root i == 0, // left-to-right? true, // forward? 0.0f); // root priority // Do the search Scoring sc = Scoring::base1(); dr.go(sc, *gfms.first, *gfms.second, mets, prm); // Confirm that an exact-matching alignment was found assert_eq(1, dr.sink().nrange()); assert_eq(sc.n(40), dr.sink()[0].pen); assert(last_topf == std::numeric_limits<size_t>::max() || last_topf == dr.sink()[0].topf); assert(last_botf == std::numeric_limits<size_t>::max() || last_botf == dr.sink()[0].botf); cerr << dr.sink()[0].topf << ", " << dr.sink()[0].botf << endl; assert_eq(1, dr.sink().nelt()); last_topf = dr.sink()[0].topf; last_botf = dr.sink()[0].botf; } } } } // Throw a bunch of queries with a bunch of Ns in and try to force an assert { RandomSource rnd(79); for(int i = 0; i < 2; i++) { // Set up the read // Ref: CATGTCAGCTATATAGCGCGCTCGCATCATTTTGTGTGTAAACCA // |||||||||||||||||||||||||||||| BTDnaString orig("GCTATATAGCGCGCTCGCATCATTTTGTGT", true); // 012345678901234567890123456789 BTString qual("ABCDEFGHIabcdefghiABCDEFGHIabc"); if(i == 1) { orig.reverseComp(); qual.reverse(); } for(size_t trials = 0; trials < 100; trials++) { BTDnaString seq = orig; size_t ns = 10; for(size_t k = 0; k < ns; k++) { size_t pos = rnd.nextU32() % seq.length(); seq.set(4, pos); } cerr << "Test " << (++testnum) << endl; cerr << " Query with a bunch of Ns" << endl; DescentMetrics mets; PerReadMetrics prm; 
DescentDriver dr; dr.initRead(Read("test", seq.toZBuf(), qual.toZBuf()), -30, 30); // Set up the DescentConfig DescentConfig conf; // Changed conf.cons.init(GFM::default_ftabChars, 1.0); conf.expol = DESC_EX_NONE; // Set up the search roots for(size_t k = 0; k < ns; k++) { size_t j = rnd.nextU32() % seq.length(); bool ltr = (rnd.nextU2() == 0) ? true : false; bool fw = (rnd.nextU2() == 0) ? true : false; dr.addRoot( conf, // DescentConfig j, // 5' offset into read of root ltr, // left-to-right? fw, // forward? 0.0f); // root priority } // Do the search Scoring sc = Scoring::base1(); dr.go(sc, *gfms.first, *gfms.second, mets, prm); } } } // Query is longer than ftab and matches exactly once with one mismatch { RandomSource rnd(77); size_t last_topf = std::numeric_limits<size_t>::max(); size_t last_botf = std::numeric_limits<size_t>::max(); for(int i = 0; i < 2; i++) { // Set up the read // Ref: CATGTCAGCTATATAGCGCGCTCGCATCATTTTGTGTGTAAACCA // |||||||||||||||||||||||||||||| BTDnaString orig("GCTATATAGCGCGCTCGCATCATTTTGTGT", true); // 012345678901234567890123456789 BTString qual("ABCDEFGHIabcdefghiABCDEFGHIabc"); // revcomp: ACACAAAATGATGCGAGCGCGCTATATAGC // revqual: cbaIHGFEDCBAihgfedcbaIHGFEDCBA bool fwi = (i == 0); if(!fwi) { orig.reverseComp(); } for(size_t k = 0; k < orig.length(); k++) { BTDnaString seq = orig; seq.set(seq[k] ^ 3, k); cerr << "Test " << (++testnum) << endl; cerr << " Query with length greater than ftab and matches exactly once with 1mm. Many search roots." << endl; DescentMetrics mets; PerReadMetrics prm; DescentDriver dr; dr.initRead(Read("test", seq.toZBuf(), qual.toZBuf()), -30, 30); // Set up the DescentConfig DescentConfig conf; // Changed conf.cons.init(0, 1.0); conf.expol = DESC_EX_NONE; // Set up several random search roots bool onegood = false; for(size_t y = 0; y < 10; y++) { size_t j = rnd.nextU32() % seq.length(); bool ltr = (rnd.nextU2() == 0) ? true : false; bool fw = (rnd.nextU2() == 0) ? 
true : false; dr.addRoot( conf, // DescentConfig (TReadOff)j, // 5' offset into read of root ltr, // left-to-right? fw, // forward? (float)((float)y * 1.0f)); // root priority // Assume left-to-right size_t beg = j; size_t end = j + GFM::default_ftabChars; // Mismatch penalty is 3, so we have to skip starting // points that are within 2 from the mismatch if(!ltr) { // Right-to-left if(beg < GFM::default_ftabChars) { beg = 0; } else { beg -= GFM::default_ftabChars; } end -= GFM::default_ftabChars; } bool good = true; if(fw != fwi) { good = false; } if(beg <= k && end > k) { good = false; } if((j > k) ? (j - k <= 2) : (k - j <= 2)) { good = false; } if(good) { onegood = true; } } if(!onegood) { continue; } // Do the search Scoring sc = Scoring::base1(); dr.go(sc, *gfms.first, *gfms.second, mets, prm); // Confirm that an exact-matching alignment was found assert_eq(1, dr.sink().nrange()); assert(last_topf == std::numeric_limits<size_t>::max() || last_topf == dr.sink()[0].topf); assert(last_botf == std::numeric_limits<size_t>::max() || last_botf == dr.sink()[0].botf); cerr << dr.sink()[0].topf << ", " << dr.sink()[0].botf << endl; assert_eq(1, dr.sink().nelt()); last_topf = dr.sink()[0].topf; last_botf = dr.sink()[0].botf; } } } // Query is longer than ftab and matches exactly once with one read gap { size_t last_topf = std::numeric_limits<size_t>::max(); size_t last_botf = std::numeric_limits<size_t>::max(); for(int i = 0; i < 2; i++) { for(int k = 0; k < 2; k++) { // Set up the read // GCTATATAGCGCGCCTGCATCATTTTGTGT // Ref: CATGTCAGCTATATAGCGCGCTCGCATCATTTTGTGTGTAAACCA // |||||||||||||||/////////////// BTDnaString seq ("GCTATATAGCGCGCTGCATCATTTTGTGT", true); // 01234567890123456789012345678 // 87654321098765432109876543210 BTString qual("ABCDEFGHIabcdefghiABCDEFGHIab"); if(k == 1) { seq.reverseComp(); qual.reverse(); } assert_eq(seq.length(), qual.length()); // js iterate over offsets from 5' end for the search root for(size_t j = 0; j < seq.length(); j++) { // 
Assume left-to-right size_t beg = j; if(k == 1) { beg = seq.length() - beg - 1; } size_t end = beg + GFM::default_ftabChars; // Mismatch penalty is 3, so we have to skip starting // points that are within 2 from the mismatch if((i > 0 && j > 0) || j == seq.length()-1) { // Right-to-left if(beg < GFM::default_ftabChars) { beg = 0; } else { beg -= GFM::default_ftabChars; } end -= GFM::default_ftabChars; } assert_geq(end, beg); if(beg <= 15 && end >= 15) { continue; } cerr << "Test " << (++testnum) << endl; cerr << " Query matches once with a read gap of length 1" << endl; DescentMetrics mets; PerReadMetrics prm; DescentDriver dr; Read q("test", seq.toZBuf(), qual.toZBuf()); assert(q.repOk()); dr.initRead(q, -30, 30); // Set up the DescentConfig DescentConfig conf; // Changed conf.cons.init(0, 0.5); conf.expol = DESC_EX_NONE; // Set up the search roots dr.addRoot( conf, // DescentConfig j, // 5' offset into read of root i == 0, // left-to-right? k == 0, // forward? 0.0f); // root priority // Do the search Scoring sc = Scoring::base1(); dr.go(sc, *gfms.first, *gfms.second, mets, prm); // Confirm that an exact-matching alignment was found assert_eq(1, dr.sink().nrange()); assert_eq(sc.readGapOpen() + 0 * sc.readGapExtend(), dr.sink()[0].pen); assert(last_topf == std::numeric_limits<size_t>::max() || last_topf == dr.sink()[0].topf); assert(last_botf == std::numeric_limits<size_t>::max() || last_botf == dr.sink()[0].botf); cerr << dr.sink()[0].topf << ", " << dr.sink()[0].botf << endl; assert_eq(1, dr.sink().nelt()); last_topf = dr.sink()[0].topf; last_botf = dr.sink()[0].botf; } }} } // Query is longer than ftab and matches exactly once with one read gap of // length 3 { size_t last_topf = std::numeric_limits<size_t>::max(); size_t last_botf = std::numeric_limits<size_t>::max(); for(int i = 0; i < 2; i++) { for(int k = 0; k < 2; k++) { // Set up the read // GCTATATAGCGCGCGCTCATCATTTTGTGT // Ref: CATGTCAGCTATATAGCGCGCTCGCATCATTTTGTGTGTAAACCA // |||||||||||||| 
||||||||||||| BTDnaString seq ("GCTATATAGCGCGC" "CATCATTTTGTGT", true); // 01234567890123 4567890123456 // 65432109876543 2109876543210 BTString qual("ABCDEFGHIabcde" "fghiABCDEFGHI"); if(k == 1) { seq.reverseComp(); qual.reverse(); } for(size_t j = 0; j < seq.length(); j++) { // Assume left-to-right size_t beg = j; if(k == 1) { beg = seq.length() - beg - 1; } size_t end = beg + GFM::default_ftabChars; // Mismatch penalty is 3, so we have to skip starting // points that are within 2 from the mismatch if((i > 0 && j > 0) || j == seq.length()-1) { // Right-to-left if(beg < GFM::default_ftabChars) { beg = 0; } else { beg -= GFM::default_ftabChars; } end -= GFM::default_ftabChars; } if(beg <= 14 && end >= 14) { continue; } cerr << "Test " << (++testnum) << endl; cerr << " Query matches once with a read gap of length 3" << endl; DescentMetrics mets; PerReadMetrics prm; DescentDriver dr; dr.initRead(Read("test", seq.toZBuf(), qual.toZBuf()), -30, 30); // Set up the DescentConfig DescentConfig conf; // Changed conf.cons.init(0, 0.2); conf.expol = DESC_EX_NONE; // Set up the search roots dr.addRoot( conf, // DescentConfig j, // 5' offset into read of root i == 0, // left-to-right? k == 0, // forward? 0.0f); // root priority // Do the search Scoring sc = Scoring::base1(); // Need to adjust the mismatch penalty up to avoid alignments // with lots of mismatches. 
sc.setMmPen(COST_MODEL_CONSTANT, 6, 6); dr.go(sc, *gfms.first, *gfms.second, mets, prm); // Confirm that an exact-matching alignment was found assert_eq(1, dr.sink().nrange()); assert_eq(sc.readGapOpen() + 2 * sc.readGapExtend(), dr.sink()[0].pen); assert(last_topf == std::numeric_limits<size_t>::max() || last_topf == dr.sink()[0].topf); assert(last_botf == std::numeric_limits<size_t>::max() || last_botf == dr.sink()[0].botf); cerr << dr.sink()[0].topf << ", " << dr.sink()[0].botf << endl; assert_eq(1, dr.sink().nelt()); last_topf = dr.sink()[0].topf; last_botf = dr.sink()[0].botf; } }} } // Query is longer than ftab and matches exactly once with one reference gap { size_t last_topf = std::numeric_limits<size_t>::max(); size_t last_botf = std::numeric_limits<size_t>::max(); for(int i = 0; i < 2; i++) { // Set up the read // Ref: CATGTCAGCTATATAGCGCGC" "TCGCATCATTTTGTGTGTAAACCA // |||||||||||||| |||||||||||||||| BTDnaString seq ("GCTATATAGCGCGCA""TCGCATCATTTTGTGT", true); // 012345678901234 5678901234567890 BTString qual("ABCDEFGHIabcdef""ghiABCDEFGHIabcd"); for(size_t j = 0; j < seq.length(); j++) { // Assume left-to-right size_t beg = j; size_t end = j + GFM::default_ftabChars; // Mismatch penalty is 3, so we have to skip starting // points that are within 2 from the mismatch if((i > 0 && j > 0) || j == seq.length()-1) { // Right-to-left if(beg < GFM::default_ftabChars) { beg = 0; } else { beg -= GFM::default_ftabChars; } end -= GFM::default_ftabChars; } if(beg <= 14 && end >= 14) { continue; } cerr << "Test " << (++testnum) << endl; cerr << " Query matches once with a reference gap of length 1" << endl; DescentMetrics mets; PerReadMetrics prm; DescentDriver dr; dr.initRead(Read("test", seq.toZBuf(), qual.toZBuf()), -30, 30); // Set up the DescentConfig DescentConfig conf; // Changed conf.cons.init(1, 0.5); conf.expol = DESC_EX_NONE; // Set up the search roots dr.addRoot( conf, // DescentConfig j, // 5' offset into read of root i == 0, // left-to-right? 
true, // forward? 0.0f); // root priority // Do the search Scoring sc = Scoring::base1(); // Need to adjust the mismatch penalty up to avoid alignments // with lots of mismatches. sc.setMmPen(COST_MODEL_CONSTANT, 6, 6); dr.go(sc, *gfms.first, *gfms.second, mets, prm); // Confirm that an exact-matching alignment was found assert_eq(1, dr.sink().nrange()); assert_eq(sc.refGapOpen() + 0 * sc.refGapExtend(), dr.sink()[0].pen); assert(last_topf == std::numeric_limits<size_t>::max() || last_topf == dr.sink()[0].topf); assert(last_botf == std::numeric_limits<size_t>::max() || last_botf == dr.sink()[0].botf); cerr << dr.sink()[0].topf << ", " << dr.sink()[0].botf << endl; assert_eq(1, dr.sink().nelt()); last_topf = dr.sink()[0].topf; last_botf = dr.sink()[0].botf; } } } // Query is longer than ftab and matches exactly once with one reference gap { size_t last_topf = std::numeric_limits<size_t>::max(); size_t last_botf = std::numeric_limits<size_t>::max(); for(int i = 0; i < 2; i++) { // Set up the read // Ref: CATGTCAGCTATATAGCGCGC" "TCGCATCATTTTGTGTGTAAACCA // |||||||||||||| |||||||||||||||| BTDnaString seq ("GCTATATAGCGCGCATG""TCGCATCATTTTGTGT", true); // 01234567890123456 7890123456789012 BTString qual("ABCDEFGHIabcdefgh""iABCDEFGHIabcdef"); for(size_t j = 0; j < seq.length(); j++) { // Assume left-to-right size_t beg = j; size_t end = j + GFM::default_ftabChars; // Mismatch penalty is 3, so we have to skip starting // points that are within 2 from the mismatch if((i > 0 && j > 0) || j == seq.length()-1) { // Right-to-left if(beg < GFM::default_ftabChars) { beg = 0; } else { beg -= GFM::default_ftabChars; } end -= GFM::default_ftabChars; } if(beg <= 14 && end >= 14) { continue; } if(beg <= 15 && end >= 15) { continue; } if(beg <= 16 && end >= 16) { continue; } cerr << "Test " << (++testnum) << endl; cerr << " Query matches once with a reference gap of length 1" << endl; DescentMetrics mets; PerReadMetrics prm; DescentDriver dr; dr.initRead(Read("test", seq.toZBuf(), 
qual.toZBuf()), -30, 30); // Set up the DescentConfig DescentConfig conf; // Changed conf.cons.init(1, 0.25); conf.expol = DESC_EX_NONE; // Set up the search roots dr.addRoot( conf, // DescentConfig j, // 5' offset into read of root i == 0, // left-to-right? true, // forward? 0.0f); // root priority // Do the search Scoring sc = Scoring::base1(); // Need to adjust the mismatch penalty up to avoid alignments // with lots of mismatches. sc.setMmPen(COST_MODEL_CONSTANT, 6, 6); dr.go(sc, *gfms.first, *gfms.second, mets, prm); // Confirm that an exact-matching alignment was found assert_eq(1, dr.sink().nrange()); assert_eq(sc.refGapOpen() + 2 * sc.refGapExtend(), dr.sink()[0].pen); assert(last_topf == std::numeric_limits<size_t>::max() || last_topf == dr.sink()[0].topf); assert(last_botf == std::numeric_limits<size_t>::max() || last_botf == dr.sink()[0].botf); cerr << dr.sink()[0].topf << ", " << dr.sink()[0].botf << endl; assert_eq(1, dr.sink().nelt()); last_topf = dr.sink()[0].topf; last_botf = dr.sink()[0].botf; } } } // Query is longer than ftab and matches exactly once with one read gap, // one ref gap, and one mismatch { size_t last_topf = std::numeric_limits<size_t>::max(); size_t last_botf = std::numeric_limits<size_t>::max(); for(int i = 0; i < 2; i++) { // Set up the read // Ref: CATGTCAGCT ATATAGCGCGCT CGCATCATTTTGTGTGTAAACCA // |||||||||| |||||||||||| |||||| ||||||||||||| BTDnaString seq ("CATGTCAGCT""GATATAGCGCGCT" "GCATCAATTTGTGTGTAAAC", true); // 0123456789 0123456789012 34567890123456789012 BTString qual("ABCDEFGHIa""bcdefghiACDEF" "GHIabcdefghijkABCDEF"); for(size_t j = 0; j < seq.length(); j++) { // Assume left-to-right size_t beg = j; size_t end = j + GFM::default_ftabChars; // Mismatch penalty is 3, so we have to skip starting // points that are within 2 from the mismatch if((i > 0 && j > 0) || j == seq.length()-1) { // Right-to-left if(beg < GFM::default_ftabChars) { beg = 0; } else { beg -= GFM::default_ftabChars; } end -= GFM::default_ftabChars; } 
if(beg <= 10 && end >= 10) { continue; } if(beg <= 22 && end >= 22) { continue; } if(beg <= 30 && end >= 30) { continue; } cerr << "Test " << (++testnum) << endl; cerr << " Query matches once with a read gap of length 1" << endl; DescentMetrics mets; PerReadMetrics prm; DescentDriver dr; dr.initRead(Read("test", seq.toZBuf(), qual.toZBuf()), -50, 50); // Set up the DescentConfig DescentConfig conf; // Changed conf.cons.init(1, 0.5); conf.expol = DESC_EX_NONE; // Set up the search roots dr.addRoot( conf, // DescentConfig j, // 5' offset into read of root i == 0, // left-to-right? true, // forward? 0.0f); // root priority // Do the search Scoring sc = Scoring::base1(); dr.go(sc, *gfms.first, *gfms.second, mets, prm); // Confirm that an exact-matching alignment was found assert_eq(1, dr.sink().nrange()); assert_eq(sc.readGapOpen() + sc.refGapOpen() + sc.mm((int)'d' - 33), dr.sink()[0].pen); assert(last_topf == std::numeric_limits<size_t>::max() || last_topf == dr.sink()[0].topf); assert(last_botf == std::numeric_limits<size_t>::max() || last_botf == dr.sink()[0].botf); cerr << dr.sink()[0].topf << ", " << dr.sink()[0].botf << endl; assert_eq(1, dr.sink().nelt()); last_topf = dr.sink()[0].topf; last_botf = dr.sink()[0].botf; } } } delete gfms.first; delete gfms.second; // Ref CATGTCAGCT-ATATAGCGCGCTCGCATCATTTTGTGTGTAAAC // |||||||||| |||||||||||| |||||| ||||||||||||| // Rd CATGTCAGCTGATATAGCGCGCT-GCATCAATTTGTGTGTAAAC strs.clear(); strs.push_back(string("CATGTCAGCTATATAGCGCGCTCGCATCATTTTGTGTGTAAAC" "NNNNNNNNNN" "CATGTCAGCTGATATAGCGCGCTCGCATCATTTTGTGTGTAAAC" // same but without first ref gap "N" "CATGTCAGCTATATAGCGCGCTGCATCATTTTGTGTGTAAAC" // same but without first read gap "N" "CATGTCAGCTATATAGCGCGCTCGCATCAATTTGTGTGTAAAC" // same but without first mismatch "N" "CATGTCAGCTGATATAGCGCGCTGCATCAATTTGTGTGTAAAC" // Exact match for read )); gfms = GFM::fromStrings<SString<char> >( strs, packed, REF_READ_REVERSE, GFM::default_bigEndian, GFM::default_lineRate, 
GFM::default_offRate, GFM::default_ftabChars, ".aligner_seed2.cpp.tmp", GFM::default_useBlockwise, GFM::default_bmax, GFM::default_bmaxMultSqrt, GFM::default_bmaxDivN, GFM::default_dcv, GFM::default_seed, false, // verbose false, // autoMem false); // sanity gfms.first->loadIntoMemory (color, -1, true, true, true, true, false); gfms.second->loadIntoMemory(color, 1, true, true, true, true, false); // Query is longer than ftab and matches exactly once with one read gap, // one ref gap, and one mismatch { size_t last_topf = std::numeric_limits<size_t>::max(); size_t last_botf = std::numeric_limits<size_t>::max(); for(int i = 0; i < 2; i++) { // Set up the read // Ref: CATGTCAGCT ATATAGCGCGCT CGCATCATTTTGTGTGTAAACCA // |||||||||| |||||||||||| |||||| ||||||||||||| BTDnaString seq ("CATGTCAGCT""GATATAGCGCGCT" "GCATCAATTTGTGTGTAAAC", true); // 0123456789 0123456789012 34567890123456789012 BTString qual("ABCDEFGHIa""bcdefghiACDEF" "GHIabcdefghijkABCDEF"); for(size_t j = 0; j < seq.length(); j++) { // Assume left-to-right size_t beg = j; size_t end = j + GFM::default_ftabChars; // Mismatch penalty is 3, so we have to skip starting // points that are within 2 from the mismatch if((i > 0 && j > 0) || j == seq.length()-1) { // Right-to-left if(beg < GFM::default_ftabChars) { beg = 0; } else { beg -= GFM::default_ftabChars; } end -= GFM::default_ftabChars; } if(beg <= 10 && end >= 10) { continue; } if(beg <= 22 && end >= 22) { continue; } if(beg <= 30 && end >= 30) { continue; } cerr << "Test " << (++testnum) << endl; cerr << " Query matches once with a read gap of length 1" << endl; DescentMetrics mets; PerReadMetrics prm; DescentDriver dr; dr.initRead(Read("test", seq.toZBuf(), qual.toZBuf()), -50, 50); // Set up the DescentConfig DescentConfig conf; // Changed conf.cons.init(1, 0.5); conf.expol = DESC_EX_NONE; // Set up the search roots dr.addRoot( conf, // DescentConfig j, // 5' offset into read of root i == 0, // left-to-right? true, // forward? 
0.0f); // root priority // Do the search Scoring sc = Scoring::base1(); dr.go(sc, *gfms.first, *gfms.second, mets, prm); // Confirm that an exact-matching alignment was found assert_eq(5, dr.sink().nrange()); assert_eq(0, dr.sink()[0].pen); assert_eq(min(sc.readGapOpen(), sc.refGapOpen()) + sc.mm((int)'d' - 33), dr.sink()[1].pen); assert_eq(max(sc.readGapOpen(), sc.refGapOpen()) + sc.mm((int)'d' - 33), dr.sink()[2].pen); assert_eq(sc.readGapOpen() + sc.refGapOpen(), dr.sink()[3].pen); assert_eq(sc.readGapOpen() + sc.refGapOpen() + sc.mm((int)'d' - 33), dr.sink()[4].pen); assert(last_topf == std::numeric_limits<size_t>::max() || last_topf == dr.sink()[0].topf); assert(last_botf == std::numeric_limits<size_t>::max() || last_botf == dr.sink()[0].botf); cerr << dr.sink()[0].topf << ", " << dr.sink()[0].botf << endl; assert_eq(5, dr.sink().nelt()); last_topf = dr.sink()[0].topf; last_botf = dr.sink()[0].botf; } } } // Query is longer than ftab and matches exactly once with one read gap, // one ref gap, one mismatch, and one N { size_t last_topf = std::numeric_limits<size_t>::max(); size_t last_botf = std::numeric_limits<size_t>::max(); for(int i = 0; i < 2; i++) { // Set up the read // Ref: CATGTCAGCT ATATAGCGCGCT CGCATCATTTTGTGTGTAAACCA // |||||||||| |||||||||||| |||||| |||||| |||||| BTDnaString seq ("CATGTCAGCT""GATATAGCGCGCT" "GCATCAATTTGTGNGTAAAC", true); // 0123456789 0123456789012 34567890123456789012 BTString qual("ABCDEFGHIa""bcdefghiACDEF" "GHIabcdefghijkABCDEF"); for(size_t j = 0; j < seq.length(); j++) { // Assume left-to-right size_t beg = j; size_t end = j + GFM::default_ftabChars; // Mismatch penalty is 3, so we have to skip starting // points that are within 2 from the mismatch if((i > 0 && j > 0) || j == seq.length()-1) { // Right-to-left if(beg < GFM::default_ftabChars) { beg = 0; } else { beg -= GFM::default_ftabChars; } end -= GFM::default_ftabChars; } if(beg <= 10 && end >= 10) { continue; } if(beg <= 22 && end >= 22) { continue; } if(beg <= 30 && 
end >= 30) { continue; } if(beg <= 36 && end >= 36) { continue; } cerr << "Test " << (++testnum) << endl; cerr << " Query matches with various patterns of gaps, mismatches and Ns" << endl; DescentMetrics mets; PerReadMetrics prm; DescentDriver dr; dr.initRead(Read("test", seq.toZBuf(), qual.toZBuf()), -50, 50); // Set up the DescentConfig DescentConfig conf; // Changed conf.cons.init(1, 0.5); conf.expol = DESC_EX_NONE; // Set up the search roots dr.addRoot( conf, // DescentConfig j, // 5' offset into read of root i == 0, // left-to-right? true, // forward? 0.0f); // root priority // Do the search Scoring sc = Scoring::base1(); sc.setNPen(COST_MODEL_CONSTANT, 1); dr.go(sc, *gfms.first, *gfms.second, mets, prm); // Confirm that an exact-matching alignment was found assert_eq(5, dr.sink().nrange()); assert_eq(sc.n(40), dr.sink()[0].pen); assert_eq(sc.n(40) + min(sc.readGapOpen(), sc.refGapOpen()) + sc.mm((int)'d' - 33), dr.sink()[1].pen); assert_eq(sc.n(40) + max(sc.readGapOpen(), sc.refGapOpen()) + sc.mm((int)'d' - 33), dr.sink()[2].pen); assert_eq(sc.n(40) + sc.readGapOpen() + sc.refGapOpen(), dr.sink()[3].pen); assert_eq(sc.n(40) + sc.readGapOpen() + sc.refGapOpen() + sc.mm((int)'d' - 33), dr.sink()[4].pen); assert(last_topf == std::numeric_limits<size_t>::max() || last_topf == dr.sink()[0].topf); assert(last_botf == std::numeric_limits<size_t>::max() || last_botf == dr.sink()[0].botf); cerr << dr.sink()[0].topf << ", " << dr.sink()[0].botf << endl; assert_eq(5, dr.sink().nelt()); last_topf = dr.sink()[0].topf; last_botf = dr.sink()[0].botf; } } } delete gfms.first; delete gfms.second; cerr << "DONE" << endl; }
/** * Merman main driver function. Does the following: * * 1. Parses command-line options */ int merman(int argc, char **argv) { reset(); try { parseCommandLine(argc, argv); Timer tov(cerr, "Overall time: ", timing); EList<string> refstrs; ReferenceSet refs; EList<string> refnames; EList<size_t> reflens; string refstr = argv[optind++]; tokenize(refstr, ",", refstrs); auto_ptr<MerIndex> ind( new MerIndex(ap, rp, readLen, seedWidth, nk.first, nk.second, specificity, begin, naiveCheck, nthreads)); { Timer t(cerr, "... ", timing); if(timing) cerr << "Reading reference sequences..." << endl; for(size_t i = 0; i < refstrs.size(); i++) { if(timing) { cerr << " Sequence " << (i+1) << " of " << refstrs.size() << endl; } if(refIsStr) { refs.addOrigReferenceString(refstrs[i].c_str(), rp); } else { refs.addOrigReferenceFasta(refstrs[i].c_str(), rp); } } for(size_t i = 0; i < refs.numRefs(); i++) { refnames.push_back(string(refs[i].name.toZBuf())); reflens.push_back(refs[i].seq.length(color)); } if(refs.numRefs() == 0) { cerr << "Warning: No references were found" << endl; } if(rp.genCrick) { if(timing) { cerr << " Crickizing" << endl; } // Add the crick strand. If there were bisulfite // transformations to the Watson strand, they are // removed from the Watson strand before the Crick copy // is made. Transformations are then applied to the // new Crick strand. This has the effect of correctly // producing either Watson / Crick in the non-bisulfite // case, or BS Watson / BS Crick in the bisulfite case. refs.addReferenceRevComps(rp, false, 1, 0); } if(rp.genRevcomps) { if(timing) { cerr << " Adding reverse comps" << endl; } // Add reverse complements of all existing references // (after the transformations have already been // applied). refs.addReferenceRevComps(rp, true, -1, 1); } assert(refs.repOk()); } pair<size_t, size_t> mers = make_pair(0, 0); EList<MerIndexThread> threads; { Timer t(cerr, "... ", timing); if(timing) cerr << "Preparing to extract sub-sequences..." 
<< endl; // Instantiate and run index threads assert_gt(nthreads, 0); threads.resize(nthreads); for(int i = 0; i < nthreads; i++) { threads[i].runCount(&refs, ind.get(), i, nthreads, color); } for(int i = 0; i < nthreads; i++) { pair<size_t, size_t> mrs = threads[i].join(); mers.first += mrs.first; mers.second += mrs.second; } ind->allocateMers(); } if(timing || verbose || justBlowup) { cerr << "Expecting index footprint of "; printBytes(mers.first * sizeof(mer_ent), cerr); cerr << endl; if(mers.first > mers.second) { cerr.setf(ios::fixed); cerr << " base footprint is "; printBytes(mers.second * sizeof(mer_ent), cerr); cerr << endl << " blowup factor: " << setprecision(2) << ((double)mers.first / (double)mers.second) << endl; } if(justBlowup) throw 0; } { Timer t(cerr, "... ", timing); if(timing) cerr << "Extracting index sub-sequences..." << endl; // Instantiate and run index threads for(int i = 0; i < nthreads; i++) { threads[i].runIndex(&refs, ind.get(), i, nthreads, color); } for(int i = 0; i < nthreads; i++) threads[i].join(); } assert_eq(mers.first, ind->size()); if(verbose) { cout << " read " << refs.numRefs() << " reference strings" << endl; } if(refs.empty() && iformat != INPUT_CHAININ) { cerr << "Index is empty; not enough reference sequence supplied" << endl; throw 1; } if(refs.numRefs() == 0 && iformat != INPUT_CHAININ) { cerr << "No reference strings provided; aborting..." << endl; throw 1; } { Timer t(cerr, "Sorting reference mers: ", timing); ind->sort(nthreads); // sort mers } { Timer t(cerr, "... ", timing); if(timing) cerr << "Aligning reads..." << endl; string rstr = argv[optind++]; // Instantiate reference map, which translates to new reference // coordinate system prior to alignment output auto_ptr<ReferenceMap> rmap( refmapFile == NULL ? NULL : new ReferenceMap(refmapFile, !refidx)); // Instantiate annotation map, which encodes SNP locations & alleles auto_ptr<AnnotationMap> amap( annotFile == NULL ? 
NULL : new AnnotationMap(annotFile)); // Instantiate the read-input object auto_ptr<Reads> rs( (iformat == INPUT_CMDLINE) ? (Reads*)new StringReads(rstr, begin) : ((iformat == INPUT_FASTA) ? (Reads*)new FastaReads(rstr, begin, bufsz) : ((iformat == INPUT_FASTA_CONT) ? (Reads*)new FastaContinuousReads( rstr, begin, fastaContLen, fastaContFreq, fcontBis, fcontRc, color) : ((iformat == INPUT_FASTQ) ? (Reads*)new FastqReads(rstr, solexaScale, sixty4off, begin, bufsz) : ((iformat == INPUT_CHAININ) ? (Reads*)new ChainReads(rstr, begin, bufsz) : ((iformat == INPUT_CSFASTA) ? (Reads*)new CSFastaReads(rstr, begin, bufsz) : ((iformat == INPUT_CSFASTA_AND_QV) ? (Reads*)new CSFastaAndQVReads(rstr, qualFile, begin, bufsz) : (Reads*)new CSFastqReads(rstr, solexaScale, sixty4off, begin, bufsz)))))))); // Set output stream string of = "-"; if(optind < argc) of = argv[optind++]; // Instantiate the alignment-output object auto_ptr<AlignOutput> outs( (oformat == OUTPUT_SAM) ? (AlignOutput*)new SamOutput(of, fullref, refidx, rp.bisulfiteC || rp.bisulfiteCpG, !samNoCsCq) : (AlignOutput*)new BowtieOutput(of, fullref, printCost, refidx, rp.bisulfiteC || rp.bisulfiteCpG)); outs->printHeader(refnames, reflens); // Run the progress thread, if requested ProgressThread proThread; if(progress) proThread.run(); // Instantiate and run search threads EList<SearchThread> sthreads; sthreads.resize(nthreads); for(int i = 0; i < (int)sthreads.size(); i++) { sthreads[i].init( i, (int)sthreads.size(), ind.get(), rs.get(), &refs, outs.get(), rmap.get(), amap.get()); sthreads[i].run(); } // Wait until search sthreads are finished for(size_t i = 0; i < sthreads.size(); i++) { sthreads[i].join(); } if(progress) { proThread.kill(); proThread.join(); } outs->flush(); } if(!quiet) ProgressThread::reportStats(); } catch(exception& e) { cerr << "Command: "; for(int i = 0; i < argc; i++) cerr << argv[i] << " "; cerr << endl; return 1; } catch(int e) { if(e != 0) { cerr << "Command: "; for(int i = 0; i < argc; 
i++) cerr << argv[i] << " "; cerr << endl; } return e; } return 0; }
static void driver( const char * type, const string& bt2indexBase, const string& cf_out) { if(gVerbose || startVerbose) { cerr << "Entered driver(): "; logTime(cerr, true); } //initializeCntLut(); // FB: test commenting // Vector of the reference sequences; used for sanity-checking EList<SString<char> > names, os; EList<size_t> nameLens, seqLens; // Initialize Ebwt object and read in header if(gVerbose || startVerbose) { cerr << "About to initialize fw Ebwt: "; logTime(cerr, true); } adjIdxBase = adjustEbwtBase(argv0, bt2indexBase, gVerbose); Ebwt<index_t> ebwt( adjIdxBase, 0, // index is colorspace -1, // fw index true, // index is for the forward direction /* overriding: */ offRate, 0, // amount to add to index offrate or <= 0 to do nothing useMm, // whether to use memory-mapped files useShmem, // whether to use shared memory mmSweep, // sweep memory-mapped files !noRefNames, // load names? true, // load SA sample? true, // load ftab? true, // load rstarts? gVerbose, // whether to be talkative startVerbose, // talkative during initialization false /*passMemExc*/, sanityCheck); //Ebwt<index_t>* ebwtBw = NULL; EList<size_t> reflens; EList<string> refnames; readEbwtRefnames<index_t>(adjIdxBase, refnames); map<uint32_t,pair<string,uint64_t> > speciesID_to_name_len; for(size_t i = 0; i < ebwt.nPat(); i++) { // cerr << "Push back to reflens: "<< refnames[i] << " is so long: " << ebwt.plen()[i] << endl; reflens.push_back(ebwt.plen()[i]); // extract numeric id from refName const string& refName = refnames[i]; uint64_t id = extractIDFromRefName(refName); uint32_t speciesID = (uint32_t)(id >> 32); // extract name from refName const string& name_part = refName.substr(refName.find_first_of(' ')); //uint32_t genusID = (uint32_t)(id & 0xffffffff); speciesID_to_name_len[speciesID] = pair<string,uint64_t>(name_part,ebwt.plen()[i]); } // EList<string> refnames; // readEbwtRefnames<index_t>(adjIdxBase, refnames); // Read Centrifuge output file ifstream infile(cf_out.c_str()); 
string line; map<uint32_t,uint32_t> species_to_score; while (getline(infile,line)) { string rd_name; uint32_t genusID; uint32_t speciesID; uint32_t score; uint32_t secbest_score; istringstream iss(line); iss >> rd_name >> genusID >> speciesID >> score >> secbest_score; // cerr << rd_name << " -> " << genusID << " -> " << speciesID << " -> " << score << " -> " << secbest_score << "\n"; species_to_score[speciesID] += score; } // Sort the species by their score vector<pair<uint32_t,uint32_t> > species_to_score_v(species_to_score.begin(), species_to_score.end()); sort(species_to_score_v.begin(),species_to_score_v.end(),Pair2ndComparator<uint32_t>()); cout << "Name\tTaxonID\tLength\tSummed Score\tNormalized Score\n"; // Output the summed species scores for (vector<pair<uint32_t,uint32_t> >::iterator species_score = species_to_score_v.begin(); species_score != species_to_score_v.end(); ++species_score) { uint32_t speciesID = species_score->first; pair<string,uint64_t> name_len = speciesID_to_name_len[speciesID]; uint64_t slength = name_len.second; uint64_t sumscore = species_score->second; cout << name_len.first << "\t" << speciesID << "\t" << slength << "\t" << sumscore << "\t" << (float)sumscore/slength << "\n"; } }
/**
 * Build the paired pattern source that dispenses reads for a run, from
 * the lists of read inputs given on the command line.
 *
 * For each input list, when p.fileParallel is set one PatternSource is
 * made per entry; otherwise a single PatternSource is made from the
 * whole list (and none if the list is empty).  Unpaired inputs are
 * added to the mate-1 list with a NULL placeholder in the mate-2 list.
 *
 * If any interleaved (--12) input was given, the result is a
 * PairedSoloPatternSource over those sources and every mate-1/mate-2
 * source built here is deleted.  Otherwise the result is a
 * PairedDualPatternSource over the mate-1/mate-2 lists.
 *
 * The quality lists (q, q1, q2) and 'verbose' are not used here.
 *
 * @return a newly allocated PairedPatternSource
 */
PairedPatternSource* PairedPatternSource::setupPatternSources(
	const EList<string>& si,   // singles, from argv
	const EList<string>& m1,   // mate1's, from -1 arg
	const EList<string>& m2,   // mate2's, from -2 arg
	const EList<string>& m12,  // both mates on each line, from --12 arg
	const EList<string>& q,    // qualities associated with singles
	const EList<string>& q1,   // qualities associated with m1
	const EList<string>& q2,   // qualities associated with m2
	const PatternParams& p,    // read-in parameters
	bool verbose)              // be talkative?
{
	EList<PatternSource*>* mate1Srcs = new EList<PatternSource*>();
	EList<PatternSource*>* mate2Srcs = new EList<PatternSource*>();
	EList<PatternSource*>* bothSrcs  = new EList<PatternSource*>();

	// Paired reads appearing interleaved in a single file
	if(p.fileParallel) {
		// Feed query files one to each PatternSource
		for(size_t idx = 0; idx < m12.size(); idx++) {
			EList<string> oneFile;
			oneFile.push_back(m12[idx]);
			assert_eq(1, oneFile.size());
			bothSrcs->push_back(PatternSource::patsrcFromStrings(p, oneFile));
		}
	} else if(m12.size() > 0) {
		bothSrcs->push_back(PatternSource::patsrcFromStrings(p, m12));
	}

	// Mate 1 of paired reads
	if(p.fileParallel) {
		for(size_t idx = 0; idx < m1.size(); idx++) {
			EList<string> oneFile;
			oneFile.push_back(m1[idx]);
			assert_eq(1, oneFile.size());
			mate1Srcs->push_back(PatternSource::patsrcFromStrings(p, oneFile));
		}
	} else if(m1.size() > 0) {
		mate1Srcs->push_back(PatternSource::patsrcFromStrings(p, m1));
	}

	// Mate 2 of paired reads
	if(p.fileParallel) {
		for(size_t idx = 0; idx < m2.size(); idx++) {
			EList<string> oneFile;
			oneFile.push_back(m2[idx]);
			assert_eq(1, oneFile.size());
			mate2Srcs->push_back(PatternSource::patsrcFromStrings(p, oneFile));
		}
	} else if(m2.size() > 0) {
		mate2Srcs->push_back(PatternSource::patsrcFromStrings(p, m2));
	}

	// All mates/mate files must be paired
	assert_eq(mate1Srcs->size(), mate2Srcs->size());

	// Unpaired reads: stored as mate 1 with no mate 2
	if(p.fileParallel) {
		for(size_t idx = 0; idx < si.size(); idx++) {
			EList<string> oneFile;
			oneFile.push_back(si[idx]);
			assert_eq(1, oneFile.size());
			PatternSource* single = PatternSource::patsrcFromStrings(p, oneFile);
			assert(single != NULL);
			mate1Srcs->push_back(single);
			mate2Srcs->push_back(NULL);
		}
	} else if(si.size() > 0) {
		PatternSource* single = PatternSource::patsrcFromStrings(p, si);
		assert(single != NULL);
		mate1Srcs->push_back(single);
		mate2Srcs->push_back(NULL);
	}

	PairedPatternSource *result = NULL;
	if(m12.size() == 0) {
		// No interleaved input: use the mate lists, drop the unused list
		result = new PairedDualPatternSource(mate1Srcs, mate2Srcs, p);
		for(size_t idx = 0; idx < bothSrcs->size(); idx++) {
			delete (*bothSrcs)[idx];
		}
		delete bothSrcs;
	} else {
		// Interleaved input takes over; the mate lists are discarded
		result = new PairedSoloPatternSource(bothSrcs, p);
		for(size_t idx = 0; idx < mate1Srcs->size(); idx++) {
			delete (*mate1Srcs)[idx];
		}
		for(size_t idx = 0; idx < mate2Srcs->size(); idx++) {
			delete (*mate2Srcs)[idx];
		}
		delete mate1Srcs;
		delete mate2Srcs;
	}
	return result;
}
/** * Reverse the 'src' list of RefRecords into the 'dst' list. Don't * modify 'src'. */ void reverseRefRecords( const EList<RefRecord>& src, EList<RefRecord>& dst, bool recursive, bool verbose) { dst.clear(); { EList<RefRecord> cur; for(int i = (int)src.size()-1; i >= 0; i--) { bool first = (i == (int)src.size()-1 || src[i+1].first); // Clause after the || on next line is to deal with empty FASTA // records at the end of the 'src' list, which would be wrongly // omitted otherwise. if(src[i].len || (first && src[i].off == 0)) { cur.push_back(RefRecord(0, src[i].len, first)); first = false; } if(src[i].off) cur.push_back(RefRecord(src[i].off, 0, first)); } bool mergedLast; for(int i = 0; i < (int)cur.size(); i++) { mergedLast = false; assert(cur[i].off == 0 || cur[i].len == 0); if(i < (int)cur.size()-1 && cur[i].off != 0 && !cur[i+1].first) { dst.push_back(RefRecord(cur[i].off, cur[i+1].len, cur[i].first)); i++; mergedLast = true; } else { dst.push_back(cur[i]); } } } //if(verbose) { // cout << "Source: " << endl; // printRecords(cout, src); // cout << "Dest: " << endl; // printRecords(cout, dst); //} #ifndef NDEBUG size_t srcnfirst = 0, dstnfirst = 0; for(size_t i = 0; i < src.size(); i++) { if(src[i].first) { srcnfirst++; } } for(size_t i = 0; i < dst.size(); i++) { if(dst[i].first) { dstnfirst++; } } assert_eq(srcnfirst, dstnfirst); if(!recursive) { EList<RefRecord> tmp; reverseRefRecords(dst, tmp, true); assert_eq(tmp.size(), src.size()); for(size_t i = 0; i < src.size(); i++) { assert_eq(src[i].len, tmp[i].len); assert_eq(src[i].off, tmp[i].off); assert_eq(src[i].first, tmp[i].first); } } #endif }
int main(void) { cerr << "Test inter-class comparison operators..."; { SString<int> s(2); s.set('a', 0); s.set('b', 1); assert(sstr_eq(s, (const char *)"ab")); assert(!sstr_neq(s, (const char *)"ab")); assert(!sstr_lt(s, (const char *)"ab")); assert(!sstr_gt(s, (const char *)"ab")); assert(sstr_leq(s, (const char *)"ab")); assert(sstr_geq(s, (const char *)"ab")); SStringExpandable<int> s2; s2.append('a'); s2.append('b'); assert(sstr_eq(s, s2)); assert(sstr_eq(s2, (const char *)"ab")); assert(!sstr_neq(s, s2)); assert(!sstr_neq(s2, (const char *)"ab")); assert(!sstr_lt(s, s2)); assert(!sstr_lt(s2, (const char *)"ab")); assert(!sstr_gt(s, s2)); assert(!sstr_gt(s2, (const char *)"ab")); assert(sstr_leq(s, s2)); assert(sstr_leq(s2, (const char *)"ab")); assert(sstr_geq(s, s2)); assert(sstr_geq(s2, (const char *)"ab")); SStringFixed<int, 12> s3; s3.append('a'); s3.append('b'); assert(sstr_eq(s, s3)); assert(sstr_eq(s2, s3)); assert(sstr_eq(s3, (const char *)"ab")); assert(!sstr_neq(s, s3)); assert(!sstr_neq(s2, s3)); assert(!sstr_neq(s3, (const char *)"ab")); assert(!sstr_lt(s, s3)); assert(!sstr_lt(s2, s3)); assert(!sstr_lt(s3, (const char *)"ab")); assert(!sstr_gt(s, s3)); assert(!sstr_gt(s2, s3)); assert(!sstr_gt(s3, (const char *)"ab")); assert(sstr_geq(s, s3)); assert(sstr_geq(s2, s3)); assert(sstr_geq(s3, (const char *)"ab")); assert(sstr_leq(s, s3)); assert(sstr_leq(s2, s3)); assert(sstr_leq(s3, (const char *)"ab")); } cerr << "PASSED" << endl; cerr << "Test flag for whether to consider end-of-word < other chars ..."; { SString<char> ss("String"); SString<char> sl("String1"); assert(sstr_lt(ss, sl)); assert(sstr_gt(ss, sl, false)); assert(sstr_leq(ss, sl)); assert(sstr_geq(ss, sl, false)); } cerr << "PASSED" << endl; cerr << "Test toZBuf and toZBufXForm ..."; { SString<uint32_t> s(10); for(int i = 0; i < 10; i++) { s[i] = (uint32_t)i; } assert(strcmp(s.toZBufXForm("0123456789"), "0123456789") == 0); } cerr << "PASSED" << endl; cerr << "Test S2bDnaString ..."; { 
const char *str = "ACGTACGTAC" "ACGTACGTAC" "ACGTACGTAC" "ACGTACGTAC" "ACGTACGTAC" "ACGTACGTAC"; const char *gs = "GGGGGGGGGG" "GGGGGGGGGG" "GGGGGGGGGG" "GGGGGGGGGG" "GGGGGGGGGG" "GGGGGGGGGG"; for(size_t i = 0; i < 60; i++) { S2bDnaString s(str, i, true); S2bDnaString sr; BTDnaString s2(str, i, true); assert(sstr_eq(s, s2)); if(i >= 10) { BTDnaString s3; s.windowGetDna(s3, true, 3, 4); assert(sstr_eq(s3.toZBuf(), (const char*)"TACG")); s.windowGetDna(s3, false, 3, 4); assert(sstr_eq(s3.toZBuf(), (const char*)"CGTA")); assert_eq('A', s.toChar(0)); assert_eq('G', s.toChar(2)); assert_eq('A', s.toChar(4)); assert_eq('G', s.toChar(6)); assert_eq('A', s.toChar(8)); s.reverseWindow(1, 8); s2.reverseWindow(1, 8); assert_eq('A', s.toChar(1)); assert_eq('T', s.toChar(2)); assert_eq('G', s.toChar(3)); assert_eq('C', s.toChar(4)); assert_eq('A', s.toChar(5)); assert_eq('T', s.toChar(6)); assert_eq('G', s.toChar(7)); assert_eq('C', s.toChar(8)); assert(sstr_eq(s, s2)); s.reverseWindow(1, 8); s2.reverseWindow(1, 8); assert(sstr_eq(s, s2)); } if(i > 1) { s.reverse(); sr.installReverseChars(str, i); s2.reverse(); assert(sstr_eq(s, s2)); assert(sstr_eq(sr, s2)); s.reverse(); sr.reverse(); assert(sstr_neq(s, s2)); assert(sstr_neq(sr, s2)); s.fill(2); s2.reverse(); assert(sstr_leq(s, gs)); assert(sstr_gt(s, s2)); assert(sstr_gt(s, sr)); s2.fill(2); sr.fill(2); assert(sstr_eq(s, s2)); assert(sstr_eq(s, sr)); } } S2bDnaString s(str, true); S2bDnaString sr; BTDnaString s2(str, true); assert(sstr_eq(s2.toZBuf(), str)); assert(sstr_eq(s, s2)); s.reverse(); sr.installReverseChars(str); s2.reverse(); assert(sstr_eq(s, s2)); assert(sstr_eq(sr, s2)); s.reverse(); sr.reverse(); assert(sstr_neq(s, s2)); assert(sstr_neq(sr, s2)); } cerr << "PASSED" << endl; cerr << "Test operator=() ..."; { S2bDnaString s; s.installChars(string("gtcagtca")); assert(sstr_eq(s.toZBuf(), (const char *)"GTCAGTCA")); } cerr << "PASSED" << endl; cerr << "Conversions from string ..."; { SStringExpandable<char> 
se(string("hello")); EList<SStringExpandable<char> > sel; sel.push_back(SStringExpandable<char>(string("hello"))); } cerr << "PASSED" << endl; cerr << "PASSED" << endl; }