Пример #1
0
/**
 * Bowtie main function.  It is placed in a separate source file to
 * make it slightly easier to compile Bowtie as a library.
 *
 * If the user specifies -A <file> as the first two arguments, main
 * will interpret that file as having one set of command-line arguments
 * per line, and will dispatch each batch of arguments one at a time to
 * bowtie.
 */
int main(int argc, const char **argv) {
	if(argc > 2 && strcmp(argv[1], "-A") == 0) {
		const char *file = argv[2];
		ifstream in;
		in.open(file);
		char buf[4096];
		int lastret = -1;
		while(in.getline(buf, 4095)) {
			EList<string> args;
			args.push_back(string(argv[0]));
			tokenize(buf, " \t", args);
			const char **myargs = (const char**)malloc(sizeof(char*)*args.size());
			for(size_t i = 0; i < args.size(); i++) {
				myargs[i] = args[i].c_str();
			}
			if(args.size() == 1) continue;
			lastret = hisat2((int)args.size(), myargs);
			free(myargs);
		}
		if(lastret == -1) {
			cerr << "Warning: No arg strings parsed from " << file << endl;
			return 0;
		}
		return lastret;
	} else {
		return hisat2(argc, argv);
	}
}
Пример #2
0
/**
 * Scan every input stream once with fastaRefReadSize() and collect the
 * resulting records, in order, into 'recs'.  Each stream has reset()
 * called on it before the function moves on.
 *
 * numSeqs is incremented for every record whose 'first' flag is set.
 * Records with len == 0, off == 0 and 'first' unset are counted in the
 * totals but not appended to 'recs'.
 *
 * Throws 1 (after printing an error) if adding a record's length would
 * wrap the 32-bit running total of unambiguous characters.
 *
 * Returns a pair: (sum of rec.len, sum of rec.len + rec.off) over all
 * records read.
 */
std::pair<size_t, size_t>
fastaRefReadSizes(
	EList<FileBuf*>& in,
	EList<RefRecord>& recs,
	const RefReadInParams& rparms,
	BitpairOutFileBuf* bpout,
	int& numSeqs)
{
	uint32_t lenSum = 0;    // running sum of rec.len
	uint32_t lenOffSum = 0; // running sum of rec.len + rec.off
	RefReadInParams rpcp = rparms; // local copy; not referenced below
	assert_gt(in.size(), 0);
	for(size_t fi = 0; fi < in.size(); fi++) {
		FileBuf* fb = in[fi];
		assert(!fb->eof());
		bool first = true;
		// Pull records out of this stream until it is exhausted
		while(!fb->eof()) {
			RefRecord rec = fastaRefReadSize(*fb, rparms, first, bpout);
			// Unsigned wrap-around means the total no longer fits
			if((lenSum + rec.len) < lenSum) {
				cerr << "Error: Reference sequence has more than 2^32-1 characters!  Please divide the" << endl
				     << "reference into batches or chunks of about 3.6 billion characters or less each" << endl
				     << "and index each independently." << endl;
				throw 1;
			}
			if(rec.first) {
				numSeqs++;
			}
			lenSum += rec.len;
			lenOffSum += rec.len;
			lenOffSum += rec.off;
			first = false;
			// Keep the record unless it is completely empty and does not
			// start a new sequence
			if(rec.len != 0 || rec.off != 0 || rec.first) {
				recs.push_back(rec);
			}
		}
		// Put the stream back so later passes can read it again
		fb->reset();
		assert(!fb->eof());
#ifndef NDEBUG
		// Debug builds only: confirm the reset by checking that the first
		// character is '>', then reset once more
		int c = fb->get();
		assert_eq('>', c);
		fb->reset();
		assert(!fb->eof());
#endif
	}
	assert_geq(lenOffSum, 0);
	assert_geq(lenSum, 0);
	return std::pair<size_t, size_t>(lenSum, lenOffSum);
}
Пример #3
0
/**
 * Given a list of edits and a DNA string representing the query
 * sequence, check that the edits are consistent with respect to the
 * query.
 *
 * Edits are examined in groups that share the same position.  Within a
 * group the assertions require at most one mismatch and at most one
 * reference gap, and no mismatch combined with a gap.  Across groups,
 * positions must be non-decreasing.
 *
 * If fw is false the edit positions are temporarily inverted with
 * invertPoss() (and trimBeg/trimEnd swapped) for the duration of the
 * check, then inverted back; this is why 'edits' is const_cast.
 *
 * All checks are assertions; the function returns true whenever it
 * returns.
 */
bool Edit::repOk(
	const EList<Edit>& edits,
	const BTDnaString& s,
	bool fw,
	size_t trimBeg,
	size_t trimEnd)
{
	if(!fw) {
		invertPoss(const_cast<EList<Edit>&>(edits), s.length()-trimBeg-trimEnd, false);
		swap(trimBeg, trimEnd);
	}
	// The inner loop already leaves i on the first edit of the next
	// group, so the outer loop must not advance i again; doing so would
	// skip that edit and leave it unchecked.
	size_t i = 0;
	while(i < edits.size()) {
		size_t pos = edits[i].pos;
		if(i > 0) {
			assert_geq(pos, edits[i-1].pos);
		}
		bool del = false, mm = false;
		// Walk all edits that share this position
		while(i < edits.size() && edits[i].pos == pos) {
			const Edit& ee = edits[i];
			assert_lt(ee.pos, s.length());
			if(ee.type != EDIT_TYPE_SPL) {
				if(ee.qchr != '-') {
					// An edit carrying a query character must agree with
					// the query at that (trim-adjusted) position
					assert(ee.isRefGap() || ee.isMismatch());
					assert_eq((int)ee.qchr, s.toChar(ee.pos+trimBeg));
				}
			}
			if(ee.isMismatch()) {
				assert(!mm);
				mm = true;
				assert(!del);
			} else if(ee.isReadGap()) {
				assert(!mm);
			} else if(ee.isRefGap()) {
				assert(!mm);
				assert(!del);
				del = true;
			} else if(ee.isSpliced()) {
				// No additional constraints are checked for spliced edits
			}
			i++;
		}
	}
	if(!fw) {
		// Undo the inversion performed at the top
		invertPoss(const_cast<EList<Edit>&>(edits), s.length()-trimBeg-trimEnd, false);
	}
	return true;
}
Пример #4
0
/**
 * Open the reference portion of the index at the given basename as a
 * BitPairReference, then hand each reference sequence in turn to
 * print_ref_sequence() together with its name, its index and its length
 * (plen[i], plus one when 'color' is set).
 */
static void print_ref_sequences(
    ostream& fout,
    bool color,
    const EList<string>& refnames,
    const uint32_t* plen,
    const string& adjustedEbwtFileBase)
{
    BitPairReference ref(
        adjustedEbwtFileBase, // input basename
        color,                // true -> expect colorspace reference
        false,                // sanity-check reference
        NULL,                 // infiles
        NULL,                 // originals
        false,                // infiles are sequences
        false,                // memory-map
        false,                // use shared memory
        false,                // sweep mm-mapped ref
        verbose,              // be talkative
        verbose);             // be talkative at startup
    assert_eq(ref.numRefs(), refnames.size());
    // Length adjustment applied to every sequence in colorspace mode
    const uint32_t colorExtra = color ? 1 : 0;
    size_t refIdx = 0;
    while(refIdx < ref.numRefs()) {
        print_ref_sequence(fout, ref, refnames[refIdx], refIdx, plen[refIdx] + colorExtra);
        refIdx++;
    }
}
Пример #5
0
/**
 * Restore the joined reference text from the index and print each
 * reference sequence to 'fout' as a FASTA record.
 *
 * Every offset of the joined text is mapped back to (reference index,
 * offset within reference, reference length) via joinedToTextOff().
 * Jumps in the within-reference offset are filled with 'N', and each
 * sequence is padded with trailing 'N's up to its reported length before
 * it is printed.
 */
static void print_index_sequences(ostream& fout, Ebwt& ebwt)
{
    const uint32_t UNSET = 0xffffffff;
    EList<string>& names = ebwt.refnames();

    TStr joined;
    ebwt.restore(joined);

    uint32_t activeRef = UNSET;    // reference currently being assembled
    string activeSeq;              // characters assembled so far
    uint32_t activeLen = UNSET;    // reported length of the active reference
    uint32_t prevOff = 0;          // within-reference offset of the last char
    bool atStart = true;           // no character emitted yet for this ref
    const size_t joinedLen = joined.length();
    for(size_t jpos = 0; jpos < joinedLen; jpos++) {
        uint32_t tidx = UNSET;
        uint32_t textoff = UNSET;
        uint32_t tlen = UNSET;
        bool straddled = false;
        ebwt.joinedToTextOff(1 /* qlen */, (uint32_t)jpos, tidx, textoff, tlen, true, straddled);

        // Skip joined positions that do not resolve to a reference offset
        if(tidx == UNSET || textoff >= tlen) {
            continue;
        }

        if(activeRef != tidx) {
            // Crossed into a different reference: flush the previous one
            if(activeRef != UNSET) {
                // Add trailing gaps, if any exist
                if(activeSeq.length() < activeLen) {
                    activeSeq.append(activeLen - activeSeq.length(), 'N');
                }
                print_fasta_record(fout, names[activeRef], activeSeq);
            }
            activeRef = tidx;
            activeSeq.clear();
            activeLen = tlen;
            prevOff = 0;
            atStart = true;
        }

        // For the first character of a reference a non-zero offset is
        // bumped by one so that the leading gap is counted in full
        uint32_t adjOff = textoff;
        if(atStart && textoff > 0) {
            adjOff++;
        }
        const uint32_t step = adjOff - prevOff;
        if(step > 1) {
            activeSeq.append(step - 1, 'N');
        }

        activeSeq.push_back(joined[jpos]);
        prevOff = textoff;
        atStart = false;
    }
    // Flush the final reference, if there was one
    if(activeRef < names.size()) {
        // Add trailing gaps, if any exist
        if(activeSeq.length() < activeLen) {
            activeSeq.append(activeLen - activeSeq.length(), 'N');
        }
        print_fasta_record(fout, names[activeRef], activeSeq);
    }
}
Пример #6
0
/**
 * Clip off some of the high-numbered positions.
 *
 * Scans the edit list from the back and drops trailing edits whose
 * position lies past len - amt, as well as edits exactly at len - amt
 * that are not read gaps.  Scanning stops at the first edit that is
 * kept.
 */
void Edit::clipHi(EList<Edit>& ed, size_t len, size_t amt) {
	assert_leq(amt, len);
	const size_t boundary = len - amt;
	size_t keep = ed.size(); // number of leading edits that survive
	while(keep > 0) {
		const Edit& tail = ed[keep - 1];
		assert_lt(tail.pos, len);
		const bool pastBoundary = tail.pos > boundary;
		const bool onBoundaryNonGap = (tail.pos == boundary) && !tail.isReadGap();
		if(!pastBoundary && !onBoundaryNonGap) {
			break;
		}
		keep--;
	}
	ed.resize(keep);
}
Пример #7
0
/**
 * Remove every file recorded in 'filesWritten', announcing each one on
 * stderr first.  Used when index building has to be aborted.
 *
 * The parameters are not consulted by this implementation.
 */
static void deleteIdxFiles(
	const string& outfile,
	bool doRef,
	bool justRef)
{
	size_t idx = 0;
	while(idx < filesWritten.size()) {
		const char *path = filesWritten[idx].c_str();
		cerr << "Deleting \"" << path
		     << "\" file written during aborted indexing attempt." << endl;
		remove(path);
		idx++;
	}
}
Пример #8
0
/**
 * Merge second argument into the first.  Assume both are sorted to
 * begin with.
 *
 * Edits from 'src' are inserted into 'dst' in position order.  When a
 * 'src' edit and a 'dst' edit share a position, exactly one of them is
 * expected to be a read gap (asserted), and the read gap is placed
 * first.  Any 'src' edits left over once 'dst' has been traversed are
 * appended.
 */
void Edit::merge(EList<Edit>& dst, const EList<Edit>& src) {
	size_t di = 0, si = 0;
	// Stop as soon as either list is exhausted so that src[si] is never
	// read out of range
	while(di < dst.size() && si < src.size()) {
		if(src[si].pos < dst[di].pos) {
			dst.insert(src[si], di);
			si++; di++;
		} else if(src[si].pos == dst[di].pos) {
			// There can be two inserts at a given position, but we
			// can't merge them because there's no way to know their
			// order
			assert(src[si].isReadGap() != dst[di].isReadGap());
			if(src[si].isReadGap()) {
				dst.insert(src[si], di);
				si++; di++;
			} else {
				// The dst edit goes first; always advancing here also
				// guarantees progress when assertions are disabled
				di++;
			}
		} else {
			// The src edit belongs after this dst edit; move on so the
			// loop cannot spin forever
			di++;
		}
	}
	while(si < src.size()) dst.push_back(src[si++]);
}
Пример #9
0
/**
 * Check that this Ebwt, when restored via restore(), matches up with
 * the given array of reference sequences.  For sanity checking.
 *
 * Each original sequence in 'os' is walked in order and every character
 * that is not skipped is compared (via assert_eq) against the next
 * character of the restored text.  Characters with value 4 are skipped.
 * In color mode the first character after the start of a sequence or
 * after a skipped run is only remembered; later characters are compared
 * through the dinuc2color table using the remembered previous character.
 *
 * When 'mirror' is true the function returns before checking anything.
 */
void Ebwt::checkOrigs(
	const EList<SString<char> >& os,
	bool color,
	bool mirror) const
{
	SString<char> rest;
	restore(rest);
	// Read cursor into the restored text
	uint32_t restOff = 0;
	// i indexes the original sequence, j the position within it
	size_t i = 0, j = 0;
	if(mirror) {
		// TODO: FIXME
		// Mirrored indexes are not checked; the mirror branches below are
		// therefore never reached
		return;
	}
	while(i < os.size()) {
		size_t olen = os[i].length();
		// Previous non-skipped character; -1 means "none yet"
		int lastorig = -1;
		for(; j < olen; j++) {
			size_t joff = j;
			if(mirror) joff = olen - j - 1;
			if((int)os[i][joff] == 4) {
				// Skip over Ns
				lastorig = -1;
				if(!mirror) {
					while(j < olen && (int)os[i][j] == 4) j++;
				} else {
					while(j < olen && (int)os[i][olen-j-1] == 4) j++;
				}
				// Step back one so the loop's own j++ lands on the first
				// character after the skipped run
				j--;
				continue;
			}
			if(lastorig == -1 && color) {
				// Color mode: the first character of a stretch only seeds
				// lastorig and is not compared against the restored text
				lastorig = os[i][joff];
				continue;
			}
			if(color) {
				assert_neq(-1, lastorig);
				assert_eq(dinuc2color[(int)os[i][joff]][lastorig], rest[restOff]);
			} else {
				assert_eq(os[i][joff], rest[restOff]);
			}
			lastorig = (int)os[i][joff];
			restOff++;
		}
		// NOTE(review): the loop above contains no break, so it appears to
		// always exit with j == olen and the else branch looks unreachable
		if(j == os[i].length()) {
			// Moved to next sequence
			i++;
			j = 0;
		} else {
			// Just jumped over a gap
		}
	}
}
Пример #10
0
/**
 * Clip off some of the low-numbered positions.
 *
 * Counts the edits whose position is below 'amt' and shifts the position
 * of every other edit down by 'amt'.  Afterwards that many edits are
 * erased from the front of the list.
 */
void Edit::clipLo(EList<Edit>& ed, size_t len, size_t amt) {
	size_t clipped = 0;
	const size_t total = ed.size();
	for(size_t k = 0; k != total; ++k) {
		Edit& cur = ed[k];
		assert_lt(cur.pos, len);
		if(cur.pos >= amt) {
			// Survives the clip: re-base its position
			cur.pos -= (uint32_t)amt;
			continue;
		}
		clipped++;
	}
	ed.erase(0, clipped);
}
Пример #11
0
/**
 * Given the values for all of the various arguments used to specify
 * the read and quality input, create a list of pattern sources to
 * dispense them.
 *
 * For each input list, one PatternSource is created per file when
 * p.fileParallel is set; otherwise a single PatternSource is created
 * for the whole list.  Interleaved inputs (m12, and SRA accessions when
 * USE_SRA is defined) go into list 'ab'; mate-1 files and unpaired files
 * go into list 'a'; mate-2 files go into list 'b', with a NULL entry
 * pushed to 'b' for every unpaired source.
 *
 * If any interleaved input was given, a PairedSoloPatternSource built
 * from 'ab' is returned and everything in 'a' and 'b' is deleted.
 * Otherwise a PairedDualPatternSource built from 'a' and 'b' is returned
 * and 'ab' is deleted.
 *
 * The quality-file lists (q, q1, q2) and 'verbose' are not consulted by
 * this implementation.
 */
PairedPatternSource* PairedPatternSource::setupPatternSources(
	const EList<string>& si,   // singles, from argv
	const EList<string>& m1,   // mate1's, from -1 arg
	const EList<string>& m2,   // mate2's, from -2 arg
	const EList<string>& m12,  // both mates on each line, from --12 arg
#ifdef USE_SRA
    const EList<string>& sra_accs,
#endif
	const EList<string>& q,    // qualities associated with singles
	const EList<string>& q1,   // qualities associated with m1
	const EList<string>& q2,   // qualities associated with m2
	const PatternParams& p,    // read-in parameters
    size_t nthreads,
	bool verbose)              // be talkative?
{
	EList<PatternSource*>* a  = new EList<PatternSource*>();
	EList<PatternSource*>* b  = new EList<PatternSource*>();
	EList<PatternSource*>* ab = new EList<PatternSource*>();
	// Create list of pattern sources for paired reads appearing
	// interleaved in a single file
	for(size_t i = 0; i < m12.size(); i++) {
		const EList<string>* qs = &m12;
		EList<string> tmp;
		if(p.fileParallel) {
			// Feed query files one to each PatternSource
			qs = &tmp;
			tmp.push_back(m12[i]);
			assert_eq(1, tmp.size());
		}
		ab->push_back(PatternSource::patsrcFromStrings(p, *qs, nthreads));
		if(!p.fileParallel) {
			// One source already covers the whole list
			break;
		}
	}

#ifdef USE_SRA
	// Same treatment for SRA accessions, which also land in 'ab'
	for(size_t i = 0; i < sra_accs.size(); i++) {
		const EList<string>* qs = &sra_accs;
		EList<string> tmp;
		if(p.fileParallel) {
			// Feed query files one to each PatternSource
			qs = &tmp;
			tmp.push_back(sra_accs[i]);
			assert_eq(1, tmp.size());
		}
		ab->push_back(PatternSource::patsrcFromStrings(p, *qs, nthreads));
		if(!p.fileParallel) {
			break;
		}
	}
#endif

	// Create list of pattern sources for paired reads (mate 1)
	for(size_t i = 0; i < m1.size(); i++) {
		const EList<string>* qs = &m1;
		EList<string> tmpSeq;
		if(p.fileParallel) {
			// Feed query files one to each PatternSource
			qs = &tmpSeq;
			tmpSeq.push_back(m1[i]);
			assert_eq(1, tmpSeq.size());
		}
		a->push_back(PatternSource::patsrcFromStrings(p, *qs, nthreads));
		if(!p.fileParallel) {
			break;
		}
	}

	// Create list of pattern sources for paired reads (mate 2)
	for(size_t i = 0; i < m2.size(); i++) {
		const EList<string>* qs = &m2;
		EList<string> tmpSeq;
		if(p.fileParallel) {
			// Feed query files one to each PatternSource
			qs = &tmpSeq;
			tmpSeq.push_back(m2[i]);
			assert_eq(1, tmpSeq.size());
		}
		b->push_back(PatternSource::patsrcFromStrings(p, *qs, nthreads));
		if(!p.fileParallel) {
			break;
		}
	}
	// All mates/mate files must be paired
	assert_eq(a->size(), b->size());

	// Create list of pattern sources for the unpaired reads
	for(size_t i = 0; i < si.size(); i++) {
		const EList<string>* qs = &si;
		PatternSource* patsrc = NULL;
		EList<string> tmpSeq;
		if(p.fileParallel) {
			// Feed query files one to each PatternSource
			qs = &tmpSeq;
			tmpSeq.push_back(si[i]);
			assert_eq(1, tmpSeq.size());
		}
		patsrc = PatternSource::patsrcFromStrings(p, *qs, nthreads);
		assert(patsrc != NULL);
		a->push_back(patsrc);
		// Unpaired source: no mate-2 counterpart
		b->push_back(NULL);
		if(!p.fileParallel) {
			break;
		}
	}

	PairedPatternSource *patsrc = NULL;
#ifdef USE_SRA
	if(m12.size() > 0 || sra_accs.size() > 0) {
#else
    if(m12.size() > 0) {
#endif
		// Interleaved input wins; the separate-file sources are discarded
		patsrc = new PairedSoloPatternSource(ab, p);
		for(size_t i = 0; i < a->size(); i++) delete (*a)[i];
		for(size_t i = 0; i < b->size(); i++) delete (*b)[i];
		delete a; delete b;
	} else {
		patsrc = new PairedDualPatternSource(a, b, p);
		for(size_t i = 0; i < ab->size(); i++) delete (*ab)[i];
		delete ab;
	}
	return patsrc;
}

/**
 * Build a pattern source from a list of strings given directly (not in
 * files).  Each element of 'v' is split on ':' into at most two tokens:
 * the read sequence and, optionally, its quality string.
 *
 * For every read the constructor:
 *  - in color mode (gColor), adds 2 to the 5' trim when the read starts
 *    with a DNA character followed by a color character, and rewrites
 *    '0'-'4' as "ACGTN" and '.' as 'N';
 *  - applies the 5' and 3' trims (gTrim3 and the per-read 5' trim) to the
 *    sequence, clearing it if the trims cover the whole read, and applies
 *    the same trims to the qualities when they are longer than the trims;
 *  - pads the qualities with 'I' or truncates them so that they match the
 *    sequence length;
 *  - stores the sequence, qualities, trim amounts and a numeric name.
 */
VectorPatternSource::VectorPatternSource(
	const EList<string>& v,
	const PatternParams& p) :
	PatternSource(p),
	cur_(p.skip),
	skip_(p.skip),
	paired_(false),
	v_(),
	quals_()
{
	for(size_t i = 0; i < v.size(); i++) {
		EList<string> ss;
		tokenize(v[i], ":", ss, 2);
		assert_gt(ss.size(), 0);
		assert_leq(ss.size(), 2);
		// Initialize s
		string s = ss[0];
		int mytrim5 = gTrim5;
		if(gColor && s.length() > 1) {
			// This may be a primer character.  If so, keep it in the
			// 'primer' field of the read buf and parse the rest of the
			// read without it.
			// Go through unsigned char so that bytes >= 0x80 never reach
			// toupper() or the lookup tables as negative values
			int c = toupper((unsigned char)s[0]);
			if(asc2dnacat[c] > 0) {
				// First char is a DNA char
				int c2 = toupper((unsigned char)s[1]);
				// Second char is a color char
				if(asc2colcat[c2] > 0) {
					mytrim5 += 2; // trim primer and first color
				}
			}
		}
		if(gColor) {
			// Convert '0'-'4' to 'A','C','G','T','N' and '.' to 'N'
			for(size_t j = 0; j < s.length(); j++) {
				if(s[j] >= '0' && s[j] <= '4') {
					s[j] = "ACGTN"[(int)s[j] - '0'];
				}
				if(s[j] == '.') s[j] = 'N';
			}
		}
		if(s.length() <= (size_t)(gTrim3 + mytrim5)) {
			// Entire read is trimmed away
			s.clear();
		} else {
			// Trim on 5' (high-quality) end
			if(mytrim5 > 0) {
				s.erase(0, mytrim5);
			}
			// Trim on 3' (low-quality) end
			if(gTrim3 > 0) {
				s.erase(s.length()-gTrim3);
			}
		}
		//  Initialize vq
		string vq;
		if(ss.size() == 2) {
			vq = ss[1];
		}
		// Trim qualities
		if(vq.length() > (size_t)(gTrim3 + mytrim5)) {
			// Trim on 5' (high-quality) end
			if(mytrim5 > 0) {
				vq.erase(0, mytrim5);
			}
			// Trim on 3' (low-quality) end
			if(gTrim3 > 0) {
				vq.erase(vq.length()-gTrim3);
			}
		}
		// Pad quals with Is if necessary; this shouldn't happen
		while(vq.length() < s.length()) {
			vq.push_back('I');
		}
		// Truncate quals to match length of read if necessary;
		// this shouldn't happen
		if(vq.length() > s.length()) {
			vq.erase(s.length());
		}
		assert_eq(vq.length(), s.length());
		v_.expand();
		v_.back().installChars(s);
		quals_.push_back(BTString(vq));
		trimmed3_.push_back(gTrim3);
		trimmed5_.push_back(mytrim5);
		// The read's name is its 0-based index in the list
		ostringstream os;
		os << (names_.size());
		names_.push_back(BTString(os.str()));
	}
	assert_eq(v_.size(), quals_.size());
}
	
/**
 * Hand out the next stored read.
 *
 * On success the read's sequence, qualities, trim amounts and a name
 * (the decimal value of the read's index) are copied into 'r', rdid and
 * endid are both set to the running read counter, 'success' is set to
 * true and 'done' tells whether this was the last stored read.  When no
 * reads remain, 'r' is left reset, 'success' is false and 'done' is
 * true.
 *
 * The cursor and counter are accessed between lock() and unlock().
 *
 * Returns true iff a read was produced.
 */
bool VectorPatternSource::nextReadImpl(
	Read& r,
	TReadId& rdid,
	TReadId& endid,
	bool& success,
	bool& done)
{
	// Let Strings begin at the beginning of the respective bufs
	r.reset();
	lock();
	if(cur_ < v_.size()) {
		// Copy the stored sequence, qualities and trims into the read
		r.color = gColor;
		r.patFw  = v_[cur_];
		r.qual = quals_[cur_];
		r.trimmed3 = trimmed3_[cur_];
		r.trimmed5 = trimmed5_[cur_];
		ostringstream nameBuf;
		nameBuf << cur_;
		r.name = nameBuf.str();
		cur_++;
		done = (cur_ == v_.size());
		endid = readCnt_;
		rdid = endid;
		readCnt_++;
		unlock();
		success = true;
		return true;
	}
	unlock();
	// An empty read signals to the caller that nothing is left
	r.reset();
	success = false;
	done = true;
	assert(r.empty());
	return false;
}
Пример #12
0
/**
 * TODO: Argument parsing is very, very flawed.  The biggest problem is that
 * there are two separate worlds of arguments, the ones set via polstr, and
 * the ones set directly in variables.  This makes for nasty interactions,
 * e.g., with the -M option being resolved at an awkward time relative to
 * the -k and -a options.
 */
static void parseOption(int next_option, const char *arg) {
	switch (next_option) {
		case 's':
			skipReads = (uint32_t)parseInt(0, "-s arg must be positive", arg);
			break;
		case ARG_GAP_BAR:
			gGapBarrier = parseInt(1, "--gbar must be no less than 1", arg);
			break;
		case 'u':
			qUpto = (uint32_t)parseInt(1, "-u/--qupto arg must be at least 1", arg);
			break;
		case 'p':
			nthreads = parseInt(1, "-p/--threads arg must be at least 1", arg);
			break;
		case 'h': printUsage(cout); throw 0; break;
		case ARG_USAGE: printUsage(cout); throw 0; break;
		case ARG_VERBOSE: gVerbose = 1; break;
		case ARG_QUIET: gQuiet = true; break;
		case ARG_SANITY: sanityCheck = true; break;
		case ARG_CP_MIN:
			cminlen = parse<size_t>(arg);
			break;
		case ARG_CP_IVAL:
			cpow2 = parse<size_t>(arg);
			break;
		case ARG_TRI:
			doTri = true;
			break;
		case ARG_LOCAL: localAlign = true; break;
		case ARG_END_TO_END: localAlign = false; break;
		case ARG_SSE8: enable8 = true; break;
		case ARG_SSE8_NO: enable8 = false; break;
		case ARG_IGNORE_QUALS: ignoreQuals = true; break;
		case ARG_N_CEIL: {
			// Split argument by comma
			EList<string> args;
			tokenize(arg, ",", args);
			if(args.size() > 3) {
				cerr << "Error: expected 3 or fewer comma-separated "
					 << "arguments to --n-ceil option, got "
					 << args.size() << endl;
				throw 1;
			}
			if(args.size() == 0) {
				cerr << "Error: expected at least one argument to --n-ceil option" << endl;
				throw 1;
			}
			PARSE_FUNC(nCeil);
			break;
		}
		case ARG_SCORE_MA: {
			// Split argument by comma
			EList<string> args;
			tokenize(arg, ",", args);
			if(args.size() != 1) {
				cerr << "Error parsing --ma; RHS must have 1 token" << endl;
				assert(false); throw 1;
			}
			string tmp = args[0];
			istringstream tmpss(tmp);
			tmpss >> bonusMatch;
			break;
		}
		case ARG_SCORE_MMP: {
			// Split argument by comma
			EList<string> args;
			tokenize(arg, ",", args);
			if(args.size() > 3) {
				cerr << "Error parsing --mmp "
				     << "; RHS must have at most 3 tokens" << endl;
				assert(false); throw 1;
			}
			if(args[0][0] == 'C') {
				string tmp = args[0].substr(1);
				// Parse constant penalty
				istringstream tmpss(tmp);
				tmpss >> penMmcMax;
				penMmcMin = penMmcMax;
				// Parse constant penalty
				penMmcType = COST_MODEL_CONSTANT;
			} else if(args[0][0] == 'Q') {
				if(args.size() >= 2) {
					string tmp = args[1];
					istringstream tmpss(tmp);
					tmpss >> penMmcMax;
				} else {
Пример #13
0
/**
 * Restore the joined reference text from the index and print each
 * reference sequence to 'fout' as a FASTA record.
 *
 * Every offset of the joined text is mapped back to (reference index,
 * offset within reference, reference length) via joinedToTextOff().
 * Each restored character is used as an index into "ACGT" to obtain the
 * printed base.  Jumps in the within-reference offset are filled with
 * 'N', and each sequence is padded with trailing 'N's up to its reported
 * length before it is printed.
 */
static void print_index_sequences(ostream& fout, Ebwt<index_t>& ebwt)
{
	EList<string>* refnames = &(ebwt.refnames());

	TStr cat_ref;
	ebwt.restore(cat_ref);

	TIndexOffU curr_ref = OFF_MASK;      // reference currently being assembled
	string curr_ref_seq = "";
	TIndexOffU curr_ref_len = OFF_MASK;  // reported length of that reference
	TIndexOffU last_text_off = 0;        // within-reference offset of last char
	size_t orig_len = cat_ref.length();
	TIndexOffU tlen = OFF_MASK;
	bool first = true;                   // no character emitted yet for this ref
	for(size_t i = 0; i < orig_len; i++) {
		TIndexOffU tidx = OFF_MASK;
		TIndexOffU textoff = OFF_MASK;
		tlen = OFF_MASK;
		bool straddled = false;
		ebwt.joinedToTextOff(1 /* qlen */, (TIndexOffU)i, tidx, textoff, tlen, true, straddled);

		// Only positions that resolve to a valid reference offset count
		if (tidx != OFF_MASK && textoff < tlen)
		{
			if (curr_ref != tidx)
			{
				// Crossed into a different reference: flush the previous one
				if (curr_ref != OFF_MASK)
				{
					// Add trailing gaps, if any exist
					if(curr_ref_seq.length() < curr_ref_len) {
						curr_ref_seq += string(curr_ref_len - curr_ref_seq.length(), 'N');
					}
					print_fasta_record(fout, (*refnames)[curr_ref], curr_ref_seq);
				}
				curr_ref = tidx;
				curr_ref_seq = "";
				curr_ref_len = tlen;
				last_text_off = 0;
				first = true;
			}

			// For the first character of a reference a non-zero offset is
			// bumped by one so that the leading gap is counted in full
			TIndexOffU textoff_adj = textoff;
			if(first && textoff > 0) textoff_adj++;
			if (textoff_adj - last_text_off > 1)
				curr_ref_seq += string(textoff_adj - last_text_off - 1, 'N');

			curr_ref_seq.push_back("ACGT"[int(cat_ref[i])]);
			last_text_off = textoff;
			first = false;
		}
	}
	// Flush the final reference, if there was one
	if (curr_ref < refnames->size())
	{
		// Add trailing gaps, if any exist
		if(curr_ref_seq.length() < curr_ref_len) {
			curr_ref_seq += string(curr_ref_len - curr_ref_seq.length(), 'N');
		}
		print_fasta_record(fout, (*refnames)[curr_ref], curr_ref_seq);
	}

}
Пример #14
0
static void driver(
                   const string& infile,
                   EList<string>& infiles,
                   const string& snpfile,
                   const string& htfile,
                   const string& ssfile,
                   const string& exonfile,
                   const string& svfile,
                   const string& outfile,
                   bool packed,
                   int reverse)
{
    initializeCntLut();
    initializeCntBit();
	EList<FileBuf*> is(MISC_CAT);
	bool bisulfite = false;
	RefReadInParams refparams(false, reverse, nsToAs, bisulfite);
	assert_gt(infiles.size(), 0);
	if(format == CMDLINE) {
		// Adapt sequence strings to stringstreams open for input
		stringstream *ss = new stringstream();
		for(size_t i = 0; i < infiles.size(); i++) {
			(*ss) << ">" << i << endl << infiles[i].c_str() << endl;
		}
		FileBuf *fb = new FileBuf(ss);
		assert(fb != NULL);
		assert(!fb->eof());
		assert(fb->get() == '>');
		ASSERT_ONLY(fb->reset());
		assert(!fb->eof());
		is.push_back(fb);
	} else {
		// Adapt sequence files to ifstreams
		for(size_t i = 0; i < infiles.size(); i++) {
			FILE *f = fopen(infiles[i].c_str(), "r");
			if (f == NULL) {
				cerr << "Error: could not open "<< infiles[i].c_str() << endl;
				throw 1;
			}
			FileBuf *fb = new FileBuf(f);
			assert(fb != NULL);
			if(fb->peek() == -1 || fb->eof()) {
				cerr << "Warning: Empty fasta file: '" << infile.c_str() << "'" << endl;
				continue;
			}
			assert(!fb->eof());
			assert(fb->get() == '>');
			ASSERT_ONLY(fb->reset());
			assert(!fb->eof());
			is.push_back(fb);
		}
	}
	if(is.empty()) {
		cerr << "Warning: All fasta inputs were empty" << endl;
		throw 1;
	}
    filesWritten.push_back(outfile + ".1." + gfm_ext);
    filesWritten.push_back(outfile + ".2." + gfm_ext);
	// Vector for the ordered list of "records" comprising the input
	// sequences.  A record represents a stretch of unambiguous
	// characters in one of the input sequences.
	EList<RefRecord> szs(MISC_CAT);
	std::pair<size_t, size_t> sztot;
	{
		if(verbose) cerr << "Reading reference sizes" << endl;
		Timer _t(cerr, "  Time reading reference sizes: ", verbose);
		if(!reverse && (writeRef || justRef)) {
			filesWritten.push_back(outfile + ".3." + gfm_ext);
			filesWritten.push_back(outfile + ".4." + gfm_ext);
			sztot = BitPairReference::szsFromFasta(is, outfile, bigEndian, refparams, szs, sanityCheck);
		} else {
			sztot = BitPairReference::szsFromFasta(is, string(), bigEndian, refparams, szs, sanityCheck);
		}
	}
	if(justRef) return;
	assert_gt(sztot.first, 0);
	assert_gt(sztot.second, 0);
	assert_gt(szs.size(), 0);
    
	// Construct index from input strings and parameters	
    filesWritten.push_back(outfile + ".5." + gfm_ext);
    filesWritten.push_back(outfile + ".6." + gfm_ext);
    filesWritten.push_back(outfile + ".7." + gfm_ext);
    filesWritten.push_back(outfile + ".8." + gfm_ext);
	TStr s;
	HGFM<TIndexOffU> hGFM(
                          s,
                          packed,
                          1,  // TODO: maybe not?
                          lineRate,
                          offRate,      // suffix-array sampling rate
                          ftabChars,    // number of chars in initial arrow-pair calc
                          localOffRate,
                          localFtabChars,
                          nthreads,
                          snpfile,
                          htfile,
                          ssfile,
                          exonfile,
                          svfile,
                          outfile,      // basename for .?.ht2 files
                          reverse == 0, // fw
                          !entireSA,    // useBlockwise
                          bmax,         // block size for blockwise SA builder
                          bmaxMultSqrt, // block size as multiplier of sqrt(len)
                          bmaxDivN,     // block size as divisor of len
                          noDc? 0 : dcv,// difference-cover period
                          is,           // list of input streams
                          szs,          // list of reference sizes
                          (TIndexOffU)sztot.first,  // total size of all unambiguous ref chars
                          refparams,    // reference read-in parameters
                          seed,         // pseudo-random number generator seed
                          -1,           // override offRate
                          verbose,      // be talkative
                          autoMem,      // pass exceptions up to the toplevel so that we can adjust memory settings automatically
                          sanityCheck); // verify results and internal consistency
    // Note that the Ebwt is *not* resident in memory at this time.  To
    // load it into memory, call ebwt.loadIntoMemory()
	if(verbose) {
		// Print Ebwt's vital stats
		hGFM.gh().print(cerr);
	}
	if(sanityCheck) {
		// Try restoring the original string (if there were
		// multiple texts, what we'll get back is the joined,
		// padded string, not a list)
		hGFM.loadIntoMemory(
                            reverse ? (refparams.reverse == REF_READ_REVERSE) : 0,
                            true,  // load SA sample?
                            true,  // load ftab?
                            true,  // load rstarts?
                            false,
                            false);
		SString<char> s2;
		hGFM.restore(s2);
		hGFM.evictFromMemory();
		{
			SString<char> joinedss = GFM<>::join<SString<char> >(
				is,          // list of input streams
				szs,         // list of reference sizes
				(TIndexOffU)sztot.first, // total size of all unambiguous ref chars
				refparams,   // reference read-in parameters
				seed);       // pseudo-random number generator seed
			if(refparams.reverse == REF_READ_REVERSE) {
				joinedss.reverse();
			}
			assert_eq(joinedss.length(), s2.length());
			assert(sstr_eq(joinedss, s2));
		}
		if(verbose) {
			if(s2.length() < 1000) {
				cout << "Passed restore check: " << s2.toZBuf() << endl;
			} else {
				cout << "Passed restore check: (" << s2.length() << " chars)" << endl;
			}
		}
	}
}
Пример #15
0
/**
 * Merman main driver function.  Does the following:
 *
 * 1. Calls reset() and parses command-line options
 * 2. Reads the reference sequences named by the first positional
 *    argument (a comma-separated list), optionally adding Crick
 *    strands and reverse complements
 * 3. Counts, extracts and sorts the reference sub-sequences into a
 *    MerIndex using 'nthreads' index threads
 * 4. Aligns the reads named by the next positional argument using
 *    'nthreads' search threads; alignments go to the output named by
 *    the following positional argument, or "-" if there is none
 *
 * Returns 0 on success, 1 if a std::exception is caught, and otherwise
 * the integer that was thrown (0 is thrown on purpose after the
 * footprint report when 'justBlowup' is set).
 */
int merman(int argc, char **argv) {
	reset();
	try {
		parseCommandLine(argc, argv);
		Timer tov(cerr, "Overall time: ", timing);
		EList<string> refstrs;
		ReferenceSet refs;
		EList<string> refnames;
		EList<size_t> reflens;
		// NOTE(review): argv[optind] is read without checking
		// optind < argc here; presumably parseCommandLine guarantees
		// the positional arguments exist - confirm.
		string refstr = argv[optind++];
		tokenize(refstr, ",", refstrs);
		auto_ptr<MerIndex> ind(
			new MerIndex(ap, rp, readLen, seedWidth, nk.first, nk.second,
			             specificity, begin, naiveCheck, nthreads));
		{
			Timer t(cerr, "... ", timing);
			if(timing) cerr << "Reading reference sequences..." << endl;
			for(size_t i = 0; i < refstrs.size(); i++) {
				if(timing) {
					cerr << "  Sequence " << (i+1) << " of " << refstrs.size() << endl;
				}
				// Each token is either a literal sequence or a FASTA
				// file name, depending on 'refIsStr'
				if(refIsStr) {
					refs.addOrigReferenceString(refstrs[i].c_str(), rp);
				} else {
					refs.addOrigReferenceFasta(refstrs[i].c_str(), rp);
				}
			}
			// Record names and lengths now, before any strands are
			// added below; they are used later to print the header
			for(size_t i = 0; i < refs.numRefs(); i++) {
				refnames.push_back(string(refs[i].name.toZBuf()));
				reflens.push_back(refs[i].seq.length(color));
			}
			if(refs.numRefs() == 0) {
				cerr << "Warning: No references were found" << endl;
			}
			if(rp.genCrick) {
				if(timing) {
					cerr << "  Crickizing" << endl;
				}
				// Add the crick strand.  If there were bisulfite
				// transformations to the Watson strand, they are
				// removed from the Watson strand before the Crick copy
				// is made.  Transformations are then applied to the
				// new Crick strand.  This has the effect of correctly
				// producing either Watson / Crick in the non-bisulfite
				// case, or BS Watson / BS Crick in the bisulfite case.
				refs.addReferenceRevComps(rp, false, 1, 0);
			}
			if(rp.genRevcomps) {
				if(timing) {
					cerr << "  Adding reverse comps" << endl;
				}
				// Add reverse complements of all existing references
				// (after the transformations have already been
				// applied).
				refs.addReferenceRevComps(rp, true, -1, 1);
			}
			assert(refs.repOk());
		}

		// Totals accumulated over all index threads' runCount results
		pair<size_t, size_t> mers = make_pair(0, 0);
		EList<MerIndexThread> threads;
		{
			Timer t(cerr, "... ", timing);
			if(timing) cerr << "Preparing to extract sub-sequences..." << endl;
			// Instantiate and run index threads
			assert_gt(nthreads, 0);
			threads.resize(nthreads);
			for(int i = 0; i < nthreads; i++) {
				threads[i].runCount(&refs, ind.get(), i, nthreads, color);
			}
			for(int i = 0; i < nthreads; i++) {
				pair<size_t, size_t> mrs = threads[i].join();
				mers.first += mrs.first;
				mers.second += mrs.second;
			}
			ind->allocateMers();
		}
		if(timing || verbose || justBlowup) {
			cerr << "Expecting index footprint of ";
			printBytes(mers.first * sizeof(mer_ent), cerr);
			cerr << endl;
			if(mers.first > mers.second) {
				cerr.setf(ios::fixed);
				cerr << "  base footprint is ";
				printBytes(mers.second * sizeof(mer_ent), cerr);
				cerr << endl
				     << "  blowup factor: " << setprecision(2) << ((double)mers.first / (double)mers.second) << endl;
			}
			// Only the footprint report was requested; the int
			// handler below turns this into a silent 'return 0'
			if(justBlowup) throw 0;
		}
		{
			Timer t(cerr, "... ", timing);
			if(timing) cerr << "Extracting index sub-sequences..." << endl;
			// Instantiate and run index threads
			for(int i = 0; i < nthreads; i++) {
				threads[i].runIndex(&refs, ind.get(), i, nthreads, color);
			}
			for(int i = 0; i < nthreads; i++) threads[i].join();
		}
		assert_eq(mers.first, ind->size());
		if(verbose) {
			cout << "  read " << refs.numRefs() << " reference strings" << endl;
		}
		// NOTE(review): the next two checks look redundant; confirm
		// whether refs.empty() and refs.numRefs() == 0 can differ.
		if(refs.empty() && iformat != INPUT_CHAININ) {
			cerr << "Index is empty; not enough reference sequence supplied" << endl;
			throw 1;
		}
		if(refs.numRefs() == 0 && iformat != INPUT_CHAININ) {
			cerr << "No reference strings provided; aborting..." << endl;
			throw 1;
		}
		{
			Timer t(cerr, "Sorting reference mers: ", timing);
			ind->sort(nthreads); // sort mers
		}
		{
			Timer t(cerr, "... ", timing);
			if(timing) cerr << "Aligning reads..." << endl;
			// NOTE(review): same unchecked argv[optind] access as for
			// the reference argument above.
			string rstr = argv[optind++];
			// Instantiate reference map, which translates to new reference
			// coordinate system prior to alignment output
			auto_ptr<ReferenceMap> rmap(
				refmapFile == NULL ? NULL : new ReferenceMap(refmapFile, !refidx));
			// Instantiate annotation map, which encodes SNP locations & alleles
			auto_ptr<AnnotationMap> amap(
				annotFile == NULL ? NULL : new AnnotationMap(annotFile));
			// Instantiate the read-input object; the concrete reader is
			// chosen by 'iformat', with CSFastqReads as the fallback
			auto_ptr<Reads> rs(
				(iformat == INPUT_CMDLINE) ?
					(Reads*)new StringReads(rstr, begin) :
					((iformat == INPUT_FASTA) ?
						(Reads*)new FastaReads(rstr, begin, bufsz) :
						((iformat == INPUT_FASTA_CONT) ?
							(Reads*)new FastaContinuousReads(
								rstr, begin, fastaContLen,
								fastaContFreq, fcontBis, fcontRc,
								color) :
							((iformat == INPUT_FASTQ) ?
								(Reads*)new FastqReads(rstr, solexaScale, sixty4off, begin, bufsz) :
									((iformat == INPUT_CHAININ) ?
										(Reads*)new ChainReads(rstr, begin, bufsz) :
											((iformat == INPUT_CSFASTA) ?
												(Reads*)new CSFastaReads(rstr, begin, bufsz) :
													((iformat == INPUT_CSFASTA_AND_QV) ?
														(Reads*)new CSFastaAndQVReads(rstr, qualFile, begin, bufsz) :
														(Reads*)new CSFastqReads(rstr, solexaScale, sixty4off, begin, bufsz))))))));
			// Set output stream
			string of = "-";
			if(optind < argc) of = argv[optind++];
			// Instantiate the alignment-output object
			auto_ptr<AlignOutput> outs(
				(oformat == OUTPUT_SAM) ?
					(AlignOutput*)new SamOutput(of, fullref, refidx, rp.bisulfiteC || rp.bisulfiteCpG, !samNoCsCq) :
					(AlignOutput*)new BowtieOutput(of, fullref, printCost, refidx, rp.bisulfiteC || rp.bisulfiteCpG));
			outs->printHeader(refnames, reflens);
			// Run the progress thread, if requested
			ProgressThread proThread;
			if(progress) proThread.run();
			// Instantiate and run search threads
			EList<SearchThread> sthreads;
			sthreads.resize(nthreads);
			for(int i = 0; i < (int)sthreads.size(); i++) {
				sthreads[i].init(
					i, (int)sthreads.size(), ind.get(), rs.get(), &refs,
					outs.get(), rmap.get(), amap.get());
				sthreads[i].run();
			}
			// Wait until search sthreads are finished
			for(size_t i = 0; i < sthreads.size(); i++) {
				sthreads[i].join();
			}
			if(progress) {
				proThread.kill();
				proThread.join();
			}
			outs->flush();
		}
		if(!quiet) ProgressThread::reportStats();
	} catch(const exception& e) {
		// Report the exception's message together with the command
		// line that triggered it, then fail with status 1
		cerr << "Error: " << e.what() << endl;
		cerr << "Command: ";
		for(int i = 0; i < argc; i++) cerr << argv[i] << " ";
		cerr << endl;
		return 1;
	} catch(int e) {
		// Integer exceptions carry the exit status; 0 means a
		// deliberate early exit, so the command line is only echoed
		// for non-zero codes
		if(e != 0) {
			cerr << "Command: ";
			for(int i = 0; i < argc; i++) cerr << argv[i] << " ";
			cerr << endl;
		}
		return e;
	}
	return 0;
}
Пример #16
0
/**
 * Build a pattern source from a list of strings.  Each element of 'v'
 * is tokenized on ':'; the first token is the read sequence and the
 * second token, if present, is its quality string.  Sequences and
 * qualities are trimmed according to gTrim5 / gTrim3 (plus the primer
 * base and first color in colorspace mode), the qualities are padded
 * with 'I' or truncated to the sequence length, and each read is named
 * after its 0-based position in the list.
 */
VectorPatternSource::VectorPatternSource(
	const EList<string>& v,
	const PatternParams& p) :
	PatternSource(p),
	cur_(p.skip),
	skip_(p.skip),
	paired_(false),
	v_(),
	quals_()
{
	for(size_t i = 0; i < v.size(); i++) {
		EList<string> ss;
		tokenize(v[i], ":", ss, 2);
		assert_gt(ss.size(), 0);
		assert_leq(ss.size(), 2);
		// Initialize s
		string s = ss[0];
		int mytrim5 = gTrim5;
		if(gColor && s.length() > 1) {
			// This may be a primer character.  If so, keep it in the
			// 'primer' field of the read buf and parse the rest of the
			// read without it.
			// Go through unsigned char: passing a negative char to
			// toupper() is undefined, and the result indexes a table.
			int c = toupper((unsigned char)s[0]);
			if(asc2dnacat[c] > 0) {
				// First char is a DNA char
				int c2 = toupper((unsigned char)s[1]);
				// Second char is a color char
				if(asc2colcat[c2] > 0) {
					mytrim5 += 2; // trim primer and first color
				}
			}
		}
		if(gColor) {
			// Convert '0'-'4' to 'A','C','G','T','N' and '.' to 'N'
			for(size_t j = 0; j < s.length(); j++) {
				if(s[j] >= '0' && s[j] <= '4') {
					s[j] = "ACGTN"[(int)s[j] - '0'];
				}
				if(s[j] == '.') s[j] = 'N';
			}
		}
		if(s.length() <= (size_t)(gTrim3 + mytrim5)) {
			// Entire read is trimmed away
			s.clear();
		} else {
			// Trim on 5' (high-quality) end
			if(mytrim5 > 0) {
				s.erase(0, mytrim5);
			}
			// Trim on 3' (low-quality) end
			if(gTrim3 > 0) {
				s.erase(s.length()-gTrim3);
			}
		}
		//  Initialize vq
		string vq;
		if(ss.size() == 2) {
			vq = ss[1];
		}
		// Trim qualities; a quality string that is too short to trim
		// is left alone here and fixed up by the pad/truncate below
		if(vq.length() > (size_t)(gTrim3 + mytrim5)) {
			// Trim on 5' (high-quality) end
			if(mytrim5 > 0) {
				vq.erase(0, mytrim5);
			}
			// Trim on 3' (low-quality) end
			if(gTrim3 > 0) {
				vq.erase(vq.length()-gTrim3);
			}
		}
		// Pad quals with Is if necessary; this shouldn't happen
		while(vq.length() < s.length()) {
			vq.push_back('I');
		}
		// Truncate quals to match length of read if necessary;
		// this shouldn't happen
		if(vq.length() > s.length()) {
			vq.erase(s.length());
		}
		assert_eq(vq.length(), s.length());
		v_.expand();
		v_.back().installChars(s);
		quals_.push_back(BTString(vq));
		trimmed3_.push_back(gTrim3);
		trimmed5_.push_back(mytrim5);
		// Name the read after its index in the list
		ostringstream os;
		os << (names_.size());
		names_.push_back(BTString(os.str()));
	}
	assert_eq(v_.size(), quals_.size());
}
Пример #17
0
/**
 * Reverse the 'src' list of RefRecords into the 'dst' list.  Don't
 * modify 'src'.
 *
 * 'src' is walked back to front and every record is split into a
 * length-only piece followed by an offset-only piece; adjacent pieces
 * are then re-merged into 'dst'.  In debug builds the result is
 * checked by reversing it again and comparing against 'src'; the
 * 'recursive' flag stops that check from recursing further.  'verbose'
 * is currently unused (the dump that used it is commented out).
 */
void reverseRefRecords(
	const EList<RefRecord>& src,
	EList<RefRecord>& dst,
	bool recursive,
	bool verbose)
{
	dst.clear();
	{
		EList<RefRecord> cur;
		// Iterate from the last record down to the first using an
		// unsigned index so that very long lists are not truncated
		for(size_t i = src.size(); i-- > 0; ) {
			// The reversed piece starts a sequence if it was the last
			// record overall or the record after it started one
			bool first = (i + 1 == src.size() || src[i+1].first);
			// Clause after the || on next line is to deal with empty FASTA
			// records at the end of the 'src' list, which would be wrongly
			// omitted otherwise.
			if(src[i].len || (first && src[i].off == 0)) {
				cur.push_back(RefRecord(0, src[i].len, first));
				first = false;
			}
			if(src[i].off) cur.push_back(RefRecord(src[i].off, 0, first));
		}
		for(size_t i = 0; i < cur.size(); i++) {
			assert(cur[i].off == 0 || cur[i].len == 0);
			// An offset-only piece followed by a piece that does not
			// start a new sequence is folded into a single record
			if(i + 1 < cur.size() && cur[i].off != 0 && !cur[i+1].first) {
				dst.push_back(RefRecord(cur[i].off, cur[i+1].len, cur[i].first));
				i++; // the next piece has been consumed
			} else {
				dst.push_back(cur[i]);
			}
		}
	}
	//if(verbose) {
	//	cout << "Source: " << endl;
	//	printRecords(cout, src);
	//	cout << "Dest: " << endl;
	//	printRecords(cout, dst);
	//}
#ifndef NDEBUG
	// The number of sequence-starting records must be preserved
	size_t srcnfirst = 0, dstnfirst = 0;
	for(size_t i = 0; i < src.size(); i++) {
		if(src[i].first) {
			srcnfirst++;
		}
	}
	for(size_t i = 0; i < dst.size(); i++) {
		if(dst[i].first) {
			dstnfirst++;
		}
	}
	assert_eq(srcnfirst, dstnfirst);
	if(!recursive) {
		// Reversing twice must give back the original list
		EList<RefRecord> tmp;
		reverseRefRecords(dst, tmp, true);
		assert_eq(tmp.size(), src.size());
		for(size_t i = 0; i < src.size(); i++) {
			assert_eq(src[i].len, tmp[i].len);
			assert_eq(src[i].off, tmp[i].off);
			assert_eq(src[i].first, tmp[i].first);
		}
	}
#endif
}
Пример #18
0
/**
 * For now, we pretend that the alignment is in the forward orientation
 * and that the Edits are listed from left- to right-hand side.
 */
void Edit::printQAlignNoCheck(
	std::ostream& os,
	const char *prefix,
	const BTDnaString& read,
	const EList<Edit>& edits)
{
	size_t eidx = 0;
	os << prefix;
	// Print read
	for(size_t i = 0; i < read.length(); i++) {
		bool del = false, mm = false;
		while(eidx < edits.size() && edits[eidx].pos == i) {
			if(edits[eidx].isReadGap()) {
				os << '-';
			} else if(edits[eidx].isRefGap()) {
				del = true;
				os << read.toChar(i);
			} else {
				mm = true;
				os << (char)edits[eidx].qchr;
			}
			eidx++;
		}
		if(!del && !mm) os << read.toChar(i);
	}
	os << endl;
	os << prefix;
	eidx = 0;
	// Print match bars
	for(size_t i = 0; i < read.length(); i++) {
		bool del = false, mm = false;
		while(eidx < edits.size() && edits[eidx].pos == i) {
			if(edits[eidx].isReadGap()) {
				os << ' ';
			} else if(edits[eidx].isRefGap()) {
				del = true;
				os << ' ';
			} else {
				mm = true;
				os << ' ';
			}
			eidx++;
		}
		if(!del && !mm) os << '|';
	}
	os << endl;
	os << prefix;
	eidx = 0;
	// Print reference
	for(size_t i = 0; i < read.length(); i++) {
		bool del = false, mm = false;
		while(eidx < edits.size() && edits[eidx].pos == i) {
			if(edits[eidx].isReadGap()) {
				os << (char)edits[eidx].chr;
			} else if(edits[eidx].isRefGap()) {
				del = true;
				os << '-';
			} else {
				mm = true;
				os << (char)edits[eidx].chr;
			}
			eidx++;
		}
		if(!del && !mm) os << read.toChar(i);
	}
	os << endl;
}
Пример #19
0
/**
 * Write every edit in 'edits' to 'os' in list order, putting 'delim'
 * between consecutive edits (nothing before the first or after the
 * last).
 */
void Edit::print(ostream& os, const EList<Edit>& edits, char delim) {
	const size_t n = edits.size();
	for(size_t k = 0; k < n; k++) {
		if(k > 0) os << delim;
		os << edits[k];
	}
}
Пример #20
0
/**
 * Given a read string and some edits, generate and append the corresponding
 * reference string to 'ref'.  If read aligned to the Watson strand, the caller
 * should pass the original read sequence and original edits.  If a read
 * aligned to the Crick strand, the caller should pass the reverse complement
 * of the read and a version of the edits list that has had Edit:invertPoss
 * called on it to cause edits to be listed in 3'-to-5' order.
 */
void Edit::toRef(
	const BTDnaString& read,
	const EList<Edit>& edits,
	BTDnaString& ref,
	bool fw,
	size_t trim5,
	size_t trim3)
{
	// edits should be sorted
	size_t eidx = 0;
	// Print reference
	const size_t rdlen = read.length();
	size_t trimBeg = fw ? trim5 : trim3;
	size_t trimEnd = fw ? trim3 : trim5;
	assert(Edit::repOk(edits, read, fw, trim5, trim3));
	if(!fw) {
		invertPoss(const_cast<EList<Edit>&>(edits), read.length()-trimBeg-trimEnd, false);
	}
	for(size_t i = 0; i < rdlen; i++) {
		ASSERT_ONLY(int c = read[i]);
		assert_range(0, 4, c);
		bool del = false, mm = false;
		bool append = i >= trimBeg && rdlen - i - 1 >= trimEnd;
		bool appendIns = i >= trimBeg && rdlen - i >= trimEnd;
		while(eidx < edits.size() && edits[eidx].pos+trimBeg == i) {
			if(edits[eidx].isReadGap()) {
				// Inserted characters come before the position's
				// character
				if(appendIns) {
					ref.appendChar((char)edits[eidx].chr);
				}
			} else if(edits[eidx].isRefGap()) {
				assert_eq("ACGTN"[c], edits[eidx].qchr);
				del = true;
			} else if(edits[eidx].isMismatch()){
				mm = true;
				assert(edits[eidx].qchr != edits[eidx].chr || edits[eidx].qchr == 'N');
				assert_eq("ACGTN"[c], edits[eidx].qchr);
				if(append) {
					ref.appendChar((char)edits[eidx].chr);
				}
			}
			eidx++;
		}
		if(!del && !mm) {
			if(append) {
				ref.append(read[i]);
			}
		}
	}
	if(trimEnd == 0) {
		while(eidx < edits.size()) {
			assert_gt(rdlen, edits[eidx].pos);
			if(edits[eidx].isReadGap()) {
				ref.appendChar((char)edits[eidx].chr);
			}
			eidx++;
		}
	}
	if(!fw) {
		invertPoss(const_cast<EList<Edit>&>(edits), read.length()-trimBeg-trimEnd, false);
	}
}
Пример #21
0
/**
 * Given the values for all of the various arguments used to specify
 * the read and quality input, create a list of pattern sources to
 * dispense them.
 *
 * When p.fileParallel is set, every input file gets its own
 * PatternSource; otherwise each non-empty input list is handed to a
 * single PatternSource.  If any --12 inputs were given the result is a
 * PairedSoloPatternSource over them and all other sources built here
 * are deleted; otherwise the result is a PairedDualPatternSource over
 * the mate1/mate2 sources followed by the unpaired sources (whose
 * mate2 slot is NULL).  The 'q', 'q1', 'q2' and 'verbose' arguments
 * are not used by this function.
 *
 * Returns a heap-allocated PairedPatternSource.
 */
PairedPatternSource* PairedPatternSource::setupPatternSources(
	const EList<string>& si,   // singles, from argv
	const EList<string>& m1,   // mate1's, from -1 arg
	const EList<string>& m2,   // mate2's, from -2 arg
	const EList<string>& m12,  // both mates on each line, from --12 arg
	const EList<string>& q,    // qualities associated with singles
	const EList<string>& q1,   // qualities associated with m1
	const EList<string>& q2,   // qualities associated with m2
	const PatternParams& p,    // read-in parameters
	bool verbose)              // be talkative?
{
	//std::cout << "setupPatternSources\n";
	EList<PatternSource*>* a  = new EList<PatternSource*>();
	EList<PatternSource*>* b  = new EList<PatternSource*>();
	EList<PatternSource*>* ab = new EList<PatternSource*>();

	// Create list of pattern sources for paired reads appearing
	// interleaved in a single file
	if(p.fileParallel) {
		// Feed query files one to each PatternSource
		for(size_t i = 0; i < m12.size(); i++) {
			EList<string> one;
			one.push_back(m12[i]);
			assert_eq(1, one.size());
			ab->push_back(PatternSource::patsrcFromStrings(p, one));
		}
	} else if(m12.size() > 0) {
		ab->push_back(PatternSource::patsrcFromStrings(p, m12));
	}

	// Create list of pattern sources for mate 1 of paired reads
	if(p.fileParallel) {
		for(size_t i = 0; i < m1.size(); i++) {
			EList<string> one;
			one.push_back(m1[i]);
			assert_eq(1, one.size());
			a->push_back(PatternSource::patsrcFromStrings(p, one));
		}
	} else if(m1.size() > 0) {
		a->push_back(PatternSource::patsrcFromStrings(p, m1));
	}

	// Create list of pattern sources for mate 2 of paired reads
	if(p.fileParallel) {
		for(size_t i = 0; i < m2.size(); i++) {
			EList<string> one;
			one.push_back(m2[i]);
			assert_eq(1, one.size());
			b->push_back(PatternSource::patsrcFromStrings(p, one));
		}
	} else if(m2.size() > 0) {
		b->push_back(PatternSource::patsrcFromStrings(p, m2));
	}
	// All mates/mate files must be paired
	assert_eq(a->size(), b->size());

	// Create list of pattern sources for the unpaired reads; each one
	// gets a NULL partner so that 'a' and 'b' stay the same length
	if(p.fileParallel) {
		for(size_t i = 0; i < si.size(); i++) {
			EList<string> one;
			one.push_back(si[i]);
			assert_eq(1, one.size());
			PatternSource* single = PatternSource::patsrcFromStrings(p, one);
			assert(single != NULL);
			a->push_back(single);
			b->push_back(NULL);
		}
	} else if(si.size() > 0) {
		PatternSource* single = PatternSource::patsrcFromStrings(p, si);
		assert(single != NULL);
		a->push_back(single);
		b->push_back(NULL);
	}

	PairedPatternSource *patsrc = NULL;
	if(m12.size() > 0) {
		// Interleaved input takes precedence; discard everything else
		patsrc = new PairedSoloPatternSource(ab, p);
		for(size_t i = 0; i < a->size(); i++) delete (*a)[i];
		for(size_t i = 0; i < b->size(); i++) delete (*b)[i];
		delete a; delete b;
	} else {
		patsrc = new PairedDualPatternSource(a, b, p);
		for(size_t i = 0; i < ab->size(); i++) delete (*ab)[i];
		delete ab;
	}
	return patsrc;
}
Пример #22
0
/**
 * Dump every record in 'l' to 'os', one per line, formatted as
 * "first, off, len".
 */
static void
printRecords(ostream& os, const EList<RefRecord>& l) {
	const size_t n = l.size();
	for(size_t k = 0; k < n; k++) {
		const RefRecord& rec = l[k];
		os << rec.first << ", " << rec.off << ", " << rec.len << endl;
	}
}